Diffstat (limited to 'llvm/lib/Transforms/InstCombine')
16 files changed, 35991 insertions, 0 deletions
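For orientation before the patch body: the file below is built around LLVM's PatternMatch DSL, matching an instruction tree and returning a cheaper replacement instruction that the InstCombine driver then inserts in place of the original. The following is a minimal, illustrative sketch only (not part of the patch); the function name foldAddOfXorAndAnd is hypothetical, and it mirrors the "(add (xor A, B), (and A, B)) --> (or A, B)" fold that appears later in visitAdd(), assuming the caller has already verified that I is an add.

#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace PatternMatch;

// Illustrative sketch: if the add's operands are (xor A, B) and (and A, B),
// in either order, the sum equals (or A, B), so return that replacement.
// InstCombine's driver inserts the returned instruction and erases the add.
static Instruction *foldAddOfXorAndAnd(BinaryOperator &I) {
  Value *A, *B;
  // m_c_BinOp and m_c_And try both operand orders; m_Deferred reuses the
  // values already bound by the earlier m_Value matchers.
  if (match(&I, m_c_BinOp(m_Xor(m_Value(A), m_Value(B)),
                          m_c_And(m_Deferred(A), m_Deferred(B)))))
    return BinaryOperator::CreateOr(A, B);
  return nullptr;
}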
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp new file mode 100644 index 000000000000..8bc34825f8a7 --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -0,0 +1,2204 @@ +//===- InstCombineAddSub.cpp ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the visit functions for add, fadd, sub, and fsub. +// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/AlignOf.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/KnownBits.h" +#include <cassert> +#include <utility> + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "instcombine" + +namespace { + + /// Class representing coefficient of floating-point addend. + /// This class needs to be highly efficient, which is especially true for + /// the constructor. As of I write this comment, the cost of the default + /// constructor is merely 4-byte-store-zero (Assuming compiler is able to + /// perform write-merging). + /// + class FAddendCoef { + public: + // The constructor has to initialize a APFloat, which is unnecessary for + // most addends which have coefficient either 1 or -1. So, the constructor + // is expensive. In order to avoid the cost of the constructor, we should + // reuse some instances whenever possible. The pre-created instances + // FAddCombine::Add[0-5] embodies this idea. + FAddendCoef() = default; + ~FAddendCoef(); + + // If possible, don't define operator+/operator- etc because these + // operators inevitably call FAddendCoef's constructor which is not cheap. + void operator=(const FAddendCoef &A); + void operator+=(const FAddendCoef &A); + void operator*=(const FAddendCoef &S); + + void set(short C) { + assert(!insaneIntVal(C) && "Insane coefficient"); + IsFp = false; IntVal = C; + } + + void set(const APFloat& C); + + void negate(); + + bool isZero() const { return isInt() ? 
!IntVal : getFpVal().isZero(); } + Value *getValue(Type *) const; + + bool isOne() const { return isInt() && IntVal == 1; } + bool isTwo() const { return isInt() && IntVal == 2; } + bool isMinusOne() const { return isInt() && IntVal == -1; } + bool isMinusTwo() const { return isInt() && IntVal == -2; } + + private: + bool insaneIntVal(int V) { return V > 4 || V < -4; } + + APFloat *getFpValPtr() + { return reinterpret_cast<APFloat *>(&FpValBuf.buffer[0]); } + + const APFloat *getFpValPtr() const + { return reinterpret_cast<const APFloat *>(&FpValBuf.buffer[0]); } + + const APFloat &getFpVal() const { + assert(IsFp && BufHasFpVal && "Incorret state"); + return *getFpValPtr(); + } + + APFloat &getFpVal() { + assert(IsFp && BufHasFpVal && "Incorret state"); + return *getFpValPtr(); + } + + bool isInt() const { return !IsFp; } + + // If the coefficient is represented by an integer, promote it to a + // floating point. + void convertToFpType(const fltSemantics &Sem); + + // Construct an APFloat from a signed integer. + // TODO: We should get rid of this function when APFloat can be constructed + // from an *SIGNED* integer. + APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val); + + bool IsFp = false; + + // True iff FpValBuf contains an instance of APFloat. + bool BufHasFpVal = false; + + // The integer coefficient of an individual addend is either 1 or -1, + // and we try to simplify at most 4 addends from neighboring at most + // two instructions. So the range of <IntVal> falls in [-4, 4]. APInt + // is overkill of this end. + short IntVal = 0; + + AlignedCharArrayUnion<APFloat> FpValBuf; + }; + + /// FAddend is used to represent floating-point addend. An addend is + /// represented as <C, V>, where the V is a symbolic value, and C is a + /// constant coefficient. A constant addend is represented as <C, 0>. + class FAddend { + public: + FAddend() = default; + + void operator+=(const FAddend &T) { + assert((Val == T.Val) && "Symbolic-values disagree"); + Coeff += T.Coeff; + } + + Value *getSymVal() const { return Val; } + const FAddendCoef &getCoef() const { return Coeff; } + + bool isConstant() const { return Val == nullptr; } + bool isZero() const { return Coeff.isZero(); } + + void set(short Coefficient, Value *V) { + Coeff.set(Coefficient); + Val = V; + } + void set(const APFloat &Coefficient, Value *V) { + Coeff.set(Coefficient); + Val = V; + } + void set(const ConstantFP *Coefficient, Value *V) { + Coeff.set(Coefficient->getValueAPF()); + Val = V; + } + + void negate() { Coeff.negate(); } + + /// Drill down the U-D chain one step to find the definition of V, and + /// try to break the definition into one or two addends. + static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1); + + /// Similar to FAddend::drillDownOneStep() except that the value being + /// splitted is the addend itself. + unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const; + + private: + void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; } + + // This addend has the value of "Coeff * Val". + Value *Val = nullptr; + FAddendCoef Coeff; + }; + + /// FAddCombine is the class for optimizing an unsafe fadd/fsub along + /// with its neighboring at most two instructions. 
+ /// + class FAddCombine { + public: + FAddCombine(InstCombiner::BuilderTy &B) : Builder(B) {} + + Value *simplify(Instruction *FAdd); + + private: + using AddendVect = SmallVector<const FAddend *, 4>; + + Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota); + + /// Convert given addend to a Value + Value *createAddendVal(const FAddend &A, bool& NeedNeg); + + /// Return the number of instructions needed to emit the N-ary addition. + unsigned calcInstrNumber(const AddendVect& Vect); + + Value *createFSub(Value *Opnd0, Value *Opnd1); + Value *createFAdd(Value *Opnd0, Value *Opnd1); + Value *createFMul(Value *Opnd0, Value *Opnd1); + Value *createFNeg(Value *V); + Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota); + void createInstPostProc(Instruction *NewInst, bool NoNumber = false); + + // Debugging stuff are clustered here. + #ifndef NDEBUG + unsigned CreateInstrNum; + void initCreateInstNum() { CreateInstrNum = 0; } + void incCreateInstNum() { CreateInstrNum++; } + #else + void initCreateInstNum() {} + void incCreateInstNum() {} + #endif + + InstCombiner::BuilderTy &Builder; + Instruction *Instr = nullptr; + }; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// +// Implementation of +// {FAddendCoef, FAddend, FAddition, FAddCombine}. +// +//===----------------------------------------------------------------------===// +FAddendCoef::~FAddendCoef() { + if (BufHasFpVal) + getFpValPtr()->~APFloat(); +} + +void FAddendCoef::set(const APFloat& C) { + APFloat *P = getFpValPtr(); + + if (isInt()) { + // As the buffer is meanless byte stream, we cannot call + // APFloat::operator=(). + new(P) APFloat(C); + } else + *P = C; + + IsFp = BufHasFpVal = true; +} + +void FAddendCoef::convertToFpType(const fltSemantics &Sem) { + if (!isInt()) + return; + + APFloat *P = getFpValPtr(); + if (IntVal > 0) + new(P) APFloat(Sem, IntVal); + else { + new(P) APFloat(Sem, 0 - IntVal); + P->changeSign(); + } + IsFp = BufHasFpVal = true; +} + +APFloat FAddendCoef::createAPFloatFromInt(const fltSemantics &Sem, int Val) { + if (Val >= 0) + return APFloat(Sem, Val); + + APFloat T(Sem, 0 - Val); + T.changeSign(); + + return T; +} + +void FAddendCoef::operator=(const FAddendCoef &That) { + if (That.isInt()) + set(That.IntVal); + else + set(That.getFpVal()); +} + +void FAddendCoef::operator+=(const FAddendCoef &That) { + enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven; + if (isInt() == That.isInt()) { + if (isInt()) + IntVal += That.IntVal; + else + getFpVal().add(That.getFpVal(), RndMode); + return; + } + + if (isInt()) { + const APFloat &T = That.getFpVal(); + convertToFpType(T.getSemantics()); + getFpVal().add(T, RndMode); + return; + } + + APFloat &T = getFpVal(); + T.add(createAPFloatFromInt(T.getSemantics(), That.IntVal), RndMode); +} + +void FAddendCoef::operator*=(const FAddendCoef &That) { + if (That.isOne()) + return; + + if (That.isMinusOne()) { + negate(); + return; + } + + if (isInt() && That.isInt()) { + int Res = IntVal * (int)That.IntVal; + assert(!insaneIntVal(Res) && "Insane int value"); + IntVal = Res; + return; + } + + const fltSemantics &Semantic = + isInt() ? 
That.getFpVal().getSemantics() : getFpVal().getSemantics(); + + if (isInt()) + convertToFpType(Semantic); + APFloat &F0 = getFpVal(); + + if (That.isInt()) + F0.multiply(createAPFloatFromInt(Semantic, That.IntVal), + APFloat::rmNearestTiesToEven); + else + F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven); +} + +void FAddendCoef::negate() { + if (isInt()) + IntVal = 0 - IntVal; + else + getFpVal().changeSign(); +} + +Value *FAddendCoef::getValue(Type *Ty) const { + return isInt() ? + ConstantFP::get(Ty, float(IntVal)) : + ConstantFP::get(Ty->getContext(), getFpVal()); +} + +// The definition of <Val> Addends +// ========================================= +// A + B <1, A>, <1,B> +// A - B <1, A>, <1,B> +// 0 - B <-1, B> +// C * A, <C, A> +// A + C <1, A> <C, NULL> +// 0 +/- 0 <0, NULL> (corner case) +// +// Legend: A and B are not constant, C is constant +unsigned FAddend::drillValueDownOneStep + (Value *Val, FAddend &Addend0, FAddend &Addend1) { + Instruction *I = nullptr; + if (!Val || !(I = dyn_cast<Instruction>(Val))) + return 0; + + unsigned Opcode = I->getOpcode(); + + if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) { + ConstantFP *C0, *C1; + Value *Opnd0 = I->getOperand(0); + Value *Opnd1 = I->getOperand(1); + if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero()) + Opnd0 = nullptr; + + if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero()) + Opnd1 = nullptr; + + if (Opnd0) { + if (!C0) + Addend0.set(1, Opnd0); + else + Addend0.set(C0, nullptr); + } + + if (Opnd1) { + FAddend &Addend = Opnd0 ? Addend1 : Addend0; + if (!C1) + Addend.set(1, Opnd1); + else + Addend.set(C1, nullptr); + if (Opcode == Instruction::FSub) + Addend.negate(); + } + + if (Opnd0 || Opnd1) + return Opnd0 && Opnd1 ? 2 : 1; + + // Both operands are zero. Weird! + Addend0.set(APFloat(C0->getValueAPF().getSemantics()), nullptr); + return 1; + } + + if (I->getOpcode() == Instruction::FMul) { + Value *V0 = I->getOperand(0); + Value *V1 = I->getOperand(1); + if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) { + Addend0.set(C, V1); + return 1; + } + + if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) { + Addend0.set(C, V0); + return 1; + } + } + + return 0; +} + +// Try to break *this* addend into two addends. e.g. Suppose this addend is +// <2.3, V>, and V = X + Y, by calling this function, we obtain two addends, +// i.e. <2.3, X> and <2.3, Y>. +unsigned FAddend::drillAddendDownOneStep + (FAddend &Addend0, FAddend &Addend1) const { + if (isConstant()) + return 0; + + unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1); + if (!BreakNum || Coeff.isOne()) + return BreakNum; + + Addend0.Scale(Coeff); + + if (BreakNum == 2) + Addend1.Scale(Coeff); + + return BreakNum; +} + +Value *FAddCombine::simplify(Instruction *I) { + assert(I->hasAllowReassoc() && I->hasNoSignedZeros() && + "Expected 'reassoc'+'nsz' instruction"); + + // Currently we are not able to handle vector type. + if (I->getType()->isVectorTy()) + return nullptr; + + assert((I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) && "Expect add/sub"); + + // Save the instruction before calling other member-functions. + Instr = I; + + FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1; + + unsigned OpndNum = FAddend::drillValueDownOneStep(I, Opnd0, Opnd1); + + // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1. 
+ unsigned Opnd0_ExpNum = 0; + unsigned Opnd1_ExpNum = 0; + + if (!Opnd0.isConstant()) + Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1); + + // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1. + if (OpndNum == 2 && !Opnd1.isConstant()) + Opnd1_ExpNum = Opnd1.drillAddendDownOneStep(Opnd1_0, Opnd1_1); + + // Step 3: Try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1 + if (Opnd0_ExpNum && Opnd1_ExpNum) { + AddendVect AllOpnds; + AllOpnds.push_back(&Opnd0_0); + AllOpnds.push_back(&Opnd1_0); + if (Opnd0_ExpNum == 2) + AllOpnds.push_back(&Opnd0_1); + if (Opnd1_ExpNum == 2) + AllOpnds.push_back(&Opnd1_1); + + // Compute instruction quota. We should save at least one instruction. + unsigned InstQuota = 0; + + Value *V0 = I->getOperand(0); + Value *V1 = I->getOperand(1); + InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) && + (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1; + + if (Value *R = simplifyFAdd(AllOpnds, InstQuota)) + return R; + } + + if (OpndNum != 2) { + // The input instruction is : "I=0.0 +/- V". If the "V" were able to be + // splitted into two addends, say "V = X - Y", the instruction would have + // been optimized into "I = Y - X" in the previous steps. + // + const FAddendCoef &CE = Opnd0.getCoef(); + return CE.isOne() ? Opnd0.getSymVal() : nullptr; + } + + // step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1] + if (Opnd1_ExpNum) { + AddendVect AllOpnds; + AllOpnds.push_back(&Opnd0); + AllOpnds.push_back(&Opnd1_0); + if (Opnd1_ExpNum == 2) + AllOpnds.push_back(&Opnd1_1); + + if (Value *R = simplifyFAdd(AllOpnds, 1)) + return R; + } + + // step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1] + if (Opnd0_ExpNum) { + AddendVect AllOpnds; + AllOpnds.push_back(&Opnd1); + AllOpnds.push_back(&Opnd0_0); + if (Opnd0_ExpNum == 2) + AllOpnds.push_back(&Opnd0_1); + + if (Value *R = simplifyFAdd(AllOpnds, 1)) + return R; + } + + return nullptr; +} + +Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { + unsigned AddendNum = Addends.size(); + assert(AddendNum <= 4 && "Too many addends"); + + // For saving intermediate results; + unsigned NextTmpIdx = 0; + FAddend TmpResult[3]; + + // Points to the constant addend of the resulting simplified expression. + // If the resulting expr has constant-addend, this constant-addend is + // desirable to reside at the top of the resulting expression tree. Placing + // constant close to supper-expr(s) will potentially reveal some optimization + // opportunities in super-expr(s). + const FAddend *ConstAdd = nullptr; + + // Simplified addends are placed <SimpVect>. + AddendVect SimpVect; + + // The outer loop works on one symbolic-value at a time. Suppose the input + // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ... + // The symbolic-values will be processed in this order: x, y, z. + for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) { + + const FAddend *ThisAddend = Addends[SymIdx]; + if (!ThisAddend) { + // This addend was processed before. + continue; + } + + Value *Val = ThisAddend->getSymVal(); + unsigned StartIdx = SimpVect.size(); + SimpVect.push_back(ThisAddend); + + // The inner loop collects addends sharing same symbolic-value, and these + // addends will be later on folded into a single addend. Following above + // example, if the symbolic value "y" is being processed, the inner loop + // will collect two addends "<b1,y>" and "<b2,Y>". These two addends will + // be later on folded into "<b1+b2, y>". 
+ for (unsigned SameSymIdx = SymIdx + 1; + SameSymIdx < AddendNum; SameSymIdx++) { + const FAddend *T = Addends[SameSymIdx]; + if (T && T->getSymVal() == Val) { + // Set null such that next iteration of the outer loop will not process + // this addend again. + Addends[SameSymIdx] = nullptr; + SimpVect.push_back(T); + } + } + + // If multiple addends share same symbolic value, fold them together. + if (StartIdx + 1 != SimpVect.size()) { + FAddend &R = TmpResult[NextTmpIdx ++]; + R = *SimpVect[StartIdx]; + for (unsigned Idx = StartIdx + 1; Idx < SimpVect.size(); Idx++) + R += *SimpVect[Idx]; + + // Pop all addends being folded and push the resulting folded addend. + SimpVect.resize(StartIdx); + if (Val) { + if (!R.isZero()) { + SimpVect.push_back(&R); + } + } else { + // Don't push constant addend at this time. It will be the last element + // of <SimpVect>. + ConstAdd = &R; + } + } + } + + assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) && + "out-of-bound access"); + + if (ConstAdd) + SimpVect.push_back(ConstAdd); + + Value *Result; + if (!SimpVect.empty()) + Result = createNaryFAdd(SimpVect, InstrQuota); + else { + // The addition is folded to 0.0. + Result = ConstantFP::get(Instr->getType(), 0.0); + } + + return Result; +} + +Value *FAddCombine::createNaryFAdd + (const AddendVect &Opnds, unsigned InstrQuota) { + assert(!Opnds.empty() && "Expect at least one addend"); + + // Step 1: Check if the # of instructions needed exceeds the quota. + + unsigned InstrNeeded = calcInstrNumber(Opnds); + if (InstrNeeded > InstrQuota) + return nullptr; + + initCreateInstNum(); + + // step 2: Emit the N-ary addition. + // Note that at most three instructions are involved in Fadd-InstCombine: the + // addition in question, and at most two neighboring instructions. + // The resulting optimized addition should have at least one less instruction + // than the original addition expression tree. This implies that the resulting + // N-ary addition has at most two instructions, and we don't need to worry + // about tree-height when constructing the N-ary addition. + + Value *LastVal = nullptr; + bool LastValNeedNeg = false; + + // Iterate the addends, creating fadd/fsub using adjacent two addends. + for (const FAddend *Opnd : Opnds) { + bool NeedNeg; + Value *V = createAddendVal(*Opnd, NeedNeg); + if (!LastVal) { + LastVal = V; + LastValNeedNeg = NeedNeg; + continue; + } + + if (LastValNeedNeg == NeedNeg) { + LastVal = createFAdd(LastVal, V); + continue; + } + + if (LastValNeedNeg) + LastVal = createFSub(V, LastVal); + else + LastVal = createFSub(LastVal, V); + + LastValNeedNeg = false; + } + + if (LastValNeedNeg) { + LastVal = createFNeg(LastVal); + } + +#ifndef NDEBUG + assert(CreateInstrNum == InstrNeeded && + "Inconsistent in instruction numbers"); +#endif + + return LastVal; +} + +Value *FAddCombine::createFSub(Value *Opnd0, Value *Opnd1) { + Value *V = Builder.CreateFSub(Opnd0, Opnd1); + if (Instruction *I = dyn_cast<Instruction>(V)) + createInstPostProc(I); + return V; +} + +Value *FAddCombine::createFNeg(Value *V) { + Value *Zero = cast<Value>(ConstantFP::getZeroValueForNegation(V->getType())); + Value *NewV = createFSub(Zero, V); + if (Instruction *I = dyn_cast<Instruction>(NewV)) + createInstPostProc(I, true); // fneg's don't receive instruction numbers. 
+ return NewV; +} + +Value *FAddCombine::createFAdd(Value *Opnd0, Value *Opnd1) { + Value *V = Builder.CreateFAdd(Opnd0, Opnd1); + if (Instruction *I = dyn_cast<Instruction>(V)) + createInstPostProc(I); + return V; +} + +Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) { + Value *V = Builder.CreateFMul(Opnd0, Opnd1); + if (Instruction *I = dyn_cast<Instruction>(V)) + createInstPostProc(I); + return V; +} + +void FAddCombine::createInstPostProc(Instruction *NewInstr, bool NoNumber) { + NewInstr->setDebugLoc(Instr->getDebugLoc()); + + // Keep track of the number of instruction created. + if (!NoNumber) + incCreateInstNum(); + + // Propagate fast-math flags + NewInstr->setFastMathFlags(Instr->getFastMathFlags()); +} + +// Return the number of instruction needed to emit the N-ary addition. +// NOTE: Keep this function in sync with createAddendVal(). +unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) { + unsigned OpndNum = Opnds.size(); + unsigned InstrNeeded = OpndNum - 1; + + // The number of addends in the form of "(-1)*x". + unsigned NegOpndNum = 0; + + // Adjust the number of instructions needed to emit the N-ary add. + for (const FAddend *Opnd : Opnds) { + if (Opnd->isConstant()) + continue; + + // The constant check above is really for a few special constant + // coefficients. + if (isa<UndefValue>(Opnd->getSymVal())) + continue; + + const FAddendCoef &CE = Opnd->getCoef(); + if (CE.isMinusOne() || CE.isMinusTwo()) + NegOpndNum++; + + // Let the addend be "c * x". If "c == +/-1", the value of the addend + // is immediately available; otherwise, it needs exactly one instruction + // to evaluate the value. + if (!CE.isMinusOne() && !CE.isOne()) + InstrNeeded++; + } + if (NegOpndNum == OpndNum) + InstrNeeded++; + return InstrNeeded; +} + +// Input Addend Value NeedNeg(output) +// ================================================================ +// Constant C C false +// <+/-1, V> V coefficient is -1 +// <2/-2, V> "fadd V, V" coefficient is -2 +// <C, V> "fmul V, C" false +// +// NOTE: Keep this function in sync with FAddCombine::calcInstrNumber. +Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) { + const FAddendCoef &Coeff = Opnd.getCoef(); + + if (Opnd.isConstant()) { + NeedNeg = false; + return Coeff.getValue(Instr->getType()); + } + + Value *OpndVal = Opnd.getSymVal(); + + if (Coeff.isMinusOne() || Coeff.isOne()) { + NeedNeg = Coeff.isMinusOne(); + return OpndVal; + } + + if (Coeff.isTwo() || Coeff.isMinusTwo()) { + NeedNeg = Coeff.isMinusTwo(); + return createFAdd(OpndVal, OpndVal); + } + + NeedNeg = false; + return createFMul(OpndVal, Coeff.getValue(Instr->getType())); +} + +// Checks if any operand is negative and we can convert add to sub. +// This function checks for following negative patterns +// ADD(XOR(OR(Z, NOT(C)), C)), 1) == NEG(AND(Z, C)) +// ADD(XOR(AND(Z, C), C), 1) == NEG(OR(Z, ~C)) +// XOR(AND(Z, C), (C + 1)) == NEG(OR(Z, ~C)) if C is even +static Value *checkForNegativeOperand(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + + // This function creates 2 instructions to replace ADD, we need at least one + // of LHS or RHS to have one use to ensure benefit in transform. 
+ if (!LHS->hasOneUse() && !RHS->hasOneUse()) + return nullptr; + + Value *X = nullptr, *Y = nullptr, *Z = nullptr; + const APInt *C1 = nullptr, *C2 = nullptr; + + // if ONE is on other side, swap + if (match(RHS, m_Add(m_Value(X), m_One()))) + std::swap(LHS, RHS); + + if (match(LHS, m_Add(m_Value(X), m_One()))) { + // if XOR on other side, swap + if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1)))) + std::swap(X, RHS); + + if (match(X, m_Xor(m_Value(Y), m_APInt(C1)))) { + // X = XOR(Y, C1), Y = OR(Z, C2), C2 = NOT(C1) ==> X == NOT(AND(Z, C1)) + // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, AND(Z, C1)) + if (match(Y, m_Or(m_Value(Z), m_APInt(C2))) && (*C2 == ~(*C1))) { + Value *NewAnd = Builder.CreateAnd(Z, *C1); + return Builder.CreateSub(RHS, NewAnd, "sub"); + } else if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && (*C1 == *C2)) { + // X = XOR(Y, C1), Y = AND(Z, C2), C2 == C1 ==> X == NOT(OR(Z, ~C1)) + // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, OR(Z, ~C1)) + Value *NewOr = Builder.CreateOr(Z, ~(*C1)); + return Builder.CreateSub(RHS, NewOr, "sub"); + } + } + } + + // Restore LHS and RHS + LHS = I.getOperand(0); + RHS = I.getOperand(1); + + // if XOR is on other side, swap + if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1)))) + std::swap(LHS, RHS); + + // C2 is ODD + // LHS = XOR(Y, C1), Y = AND(Z, C2), C1 == (C2 + 1) => LHS == NEG(OR(Z, ~C2)) + // ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2)) + if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1)))) + if (C1->countTrailingZeros() == 0) + if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) { + Value *NewOr = Builder.CreateOr(Z, ~(*C2)); + return Builder.CreateSub(RHS, NewOr, "sub"); + } + return nullptr; +} + +/// Wrapping flags may allow combining constants separated by an extend. +static Instruction *foldNoWrapAdd(BinaryOperator &Add, + InstCombiner::BuilderTy &Builder) { + Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1); + Type *Ty = Add.getType(); + Constant *Op1C; + if (!match(Op1, m_Constant(Op1C))) + return nullptr; + + // Try this match first because it results in an add in the narrow type. + // (zext (X +nuw C2)) + C1 --> zext (X + (C2 + trunc(C1))) + Value *X; + const APInt *C1, *C2; + if (match(Op1, m_APInt(C1)) && + match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2))))) && + C1->isNegative() && C1->sge(-C2->sext(C1->getBitWidth()))) { + Constant *NewC = + ConstantInt::get(X->getType(), *C2 + C1->trunc(C2->getBitWidth())); + return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty); + } + + // More general combining of constants in the wide type. 
+ // (sext (X +nsw NarrowC)) + C --> (sext X) + (sext(NarrowC) + C) + Constant *NarrowC; + if (match(Op0, m_OneUse(m_SExt(m_NSWAdd(m_Value(X), m_Constant(NarrowC)))))) { + Constant *WideC = ConstantExpr::getSExt(NarrowC, Ty); + Constant *NewC = ConstantExpr::getAdd(WideC, Op1C); + Value *WideX = Builder.CreateSExt(X, Ty); + return BinaryOperator::CreateAdd(WideX, NewC); + } + // (zext (X +nuw NarrowC)) + C --> (zext X) + (zext(NarrowC) + C) + if (match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_Constant(NarrowC)))))) { + Constant *WideC = ConstantExpr::getZExt(NarrowC, Ty); + Constant *NewC = ConstantExpr::getAdd(WideC, Op1C); + Value *WideX = Builder.CreateZExt(X, Ty); + return BinaryOperator::CreateAdd(WideX, NewC); + } + + return nullptr; +} + +Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) { + Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1); + Constant *Op1C; + if (!match(Op1, m_Constant(Op1C))) + return nullptr; + + if (Instruction *NV = foldBinOpIntoSelectOrPhi(Add)) + return NV; + + Value *X; + Constant *Op00C; + + // add (sub C1, X), C2 --> sub (add C1, C2), X + if (match(Op0, m_Sub(m_Constant(Op00C), m_Value(X)))) + return BinaryOperator::CreateSub(ConstantExpr::getAdd(Op00C, Op1C), X); + + Value *Y; + + // add (sub X, Y), -1 --> add (not Y), X + if (match(Op0, m_OneUse(m_Sub(m_Value(X), m_Value(Y)))) && + match(Op1, m_AllOnes())) + return BinaryOperator::CreateAdd(Builder.CreateNot(Y), X); + + // zext(bool) + C -> bool ? C + 1 : C + if (match(Op0, m_ZExt(m_Value(X))) && + X->getType()->getScalarSizeInBits() == 1) + return SelectInst::Create(X, AddOne(Op1C), Op1); + + // ~X + C --> (C-1) - X + if (match(Op0, m_Not(m_Value(X)))) + return BinaryOperator::CreateSub(SubOne(Op1C), X); + + const APInt *C; + if (!match(Op1, m_APInt(C))) + return nullptr; + + // (X | C2) + C --> (X | C2) ^ C2 iff (C2 == -C) + const APInt *C2; + if (match(Op0, m_Or(m_Value(), m_APInt(C2))) && *C2 == -*C) + return BinaryOperator::CreateXor(Op0, ConstantInt::get(Add.getType(), *C2)); + + if (C->isSignMask()) { + // If wrapping is not allowed, then the addition must set the sign bit: + // X + (signmask) --> X | signmask + if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap()) + return BinaryOperator::CreateOr(Op0, Op1); + + // If wrapping is allowed, then the addition flips the sign bit of LHS: + // X + (signmask) --> X ^ signmask + return BinaryOperator::CreateXor(Op0, Op1); + } + + // Is this add the last step in a convoluted sext? + // add(zext(xor i16 X, -32768), -32768) --> sext X + Type *Ty = Add.getType(); + if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) && + C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C) + return CastInst::Create(Instruction::SExt, X, Ty); + + if (C->isOneValue() && Op0->hasOneUse()) { + // add (sext i1 X), 1 --> zext (not X) + // TODO: The smallest IR representation is (select X, 0, 1), and that would + // not require the one-use check. But we need to remove a transform in + // visitSelect and make sure that IR value tracking for select is equal or + // better than for these ops. 
+ if (match(Op0, m_SExt(m_Value(X))) && + X->getType()->getScalarSizeInBits() == 1) + return new ZExtInst(Builder.CreateNot(X), Ty); + + // Shifts and add used to flip and mask off the low bit: + // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1 + const APInt *C3; + if (match(Op0, m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3))) && + C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) { + Value *NotX = Builder.CreateNot(X); + return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1)); + } + } + + return nullptr; +} + +// Matches multiplication expression Op * C where C is a constant. Returns the +// constant value in C and the other operand in Op. Returns true if such a +// match is found. +static bool MatchMul(Value *E, Value *&Op, APInt &C) { + const APInt *AI; + if (match(E, m_Mul(m_Value(Op), m_APInt(AI)))) { + C = *AI; + return true; + } + if (match(E, m_Shl(m_Value(Op), m_APInt(AI)))) { + C = APInt(AI->getBitWidth(), 1); + C <<= *AI; + return true; + } + return false; +} + +// Matches remainder expression Op % C where C is a constant. Returns the +// constant value in C and the other operand in Op. Returns the signedness of +// the remainder operation in IsSigned. Returns true if such a match is +// found. +static bool MatchRem(Value *E, Value *&Op, APInt &C, bool &IsSigned) { + const APInt *AI; + IsSigned = false; + if (match(E, m_SRem(m_Value(Op), m_APInt(AI)))) { + IsSigned = true; + C = *AI; + return true; + } + if (match(E, m_URem(m_Value(Op), m_APInt(AI)))) { + C = *AI; + return true; + } + if (match(E, m_And(m_Value(Op), m_APInt(AI))) && (*AI + 1).isPowerOf2()) { + C = *AI + 1; + return true; + } + return false; +} + +// Matches division expression Op / C with the given signedness as indicated +// by IsSigned, where C is a constant. Returns the constant value in C and the +// other operand in Op. Returns true if such a match is found. +static bool MatchDiv(Value *E, Value *&Op, APInt &C, bool IsSigned) { + const APInt *AI; + if (IsSigned && match(E, m_SDiv(m_Value(Op), m_APInt(AI)))) { + C = *AI; + return true; + } + if (!IsSigned) { + if (match(E, m_UDiv(m_Value(Op), m_APInt(AI)))) { + C = *AI; + return true; + } + if (match(E, m_LShr(m_Value(Op), m_APInt(AI)))) { + C = APInt(AI->getBitWidth(), 1); + C <<= *AI; + return true; + } + } + return false; +} + +// Returns whether C0 * C1 with the given signedness overflows. +static bool MulWillOverflow(APInt &C0, APInt &C1, bool IsSigned) { + bool overflow; + if (IsSigned) + (void)C0.smul_ov(C1, overflow); + else + (void)C0.umul_ov(C1, overflow); + return overflow; +} + +// Simplifies X % C0 + (( X / C0 ) % C1) * C0 to X % (C0 * C1), where (C0 * C1) +// does not overflow. +Value *InstCombiner::SimplifyAddWithRemainder(BinaryOperator &I) { + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + Value *X, *MulOpV; + APInt C0, MulOpC; + bool IsSigned; + // Match I = X % C0 + MulOpV * C0 + if (((MatchRem(LHS, X, C0, IsSigned) && MatchMul(RHS, MulOpV, MulOpC)) || + (MatchRem(RHS, X, C0, IsSigned) && MatchMul(LHS, MulOpV, MulOpC))) && + C0 == MulOpC) { + Value *RemOpV; + APInt C1; + bool Rem2IsSigned; + // Match MulOpC = RemOpV % C1 + if (MatchRem(MulOpV, RemOpV, C1, Rem2IsSigned) && + IsSigned == Rem2IsSigned) { + Value *DivOpV; + APInt DivOpC; + // Match RemOpV = X / C0 + if (MatchDiv(RemOpV, DivOpV, DivOpC, IsSigned) && X == DivOpV && + C0 == DivOpC && !MulWillOverflow(C0, C1, IsSigned)) { + Value *NewDivisor = + ConstantInt::get(X->getType()->getContext(), C0 * C1); + return IsSigned ? 
Builder.CreateSRem(X, NewDivisor, "srem") + : Builder.CreateURem(X, NewDivisor, "urem"); + } + } + } + + return nullptr; +} + +/// Fold +/// (1 << NBits) - 1 +/// Into: +/// ~(-(1 << NBits)) +/// Because a 'not' is better for bit-tracking analysis and other transforms +/// than an 'add'. The new shl is always nsw, and is nuw if old `and` was. +static Instruction *canonicalizeLowbitMask(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *NBits; + if (!match(&I, m_Add(m_OneUse(m_Shl(m_One(), m_Value(NBits))), m_AllOnes()))) + return nullptr; + + Constant *MinusOne = Constant::getAllOnesValue(NBits->getType()); + Value *NotMask = Builder.CreateShl(MinusOne, NBits, "notmask"); + // Be wary of constant folding. + if (auto *BOp = dyn_cast<BinaryOperator>(NotMask)) { + // Always NSW. But NUW propagates from `add`. + BOp->setHasNoSignedWrap(); + BOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); + } + + return BinaryOperator::CreateNot(NotMask, I.getName()); +} + +static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) { + assert(I.getOpcode() == Instruction::Add && "Expecting add instruction"); + Type *Ty = I.getType(); + auto getUAddSat = [&]() { + return Intrinsic::getDeclaration(I.getModule(), Intrinsic::uadd_sat, Ty); + }; + + // add (umin X, ~Y), Y --> uaddsat X, Y + Value *X, *Y; + if (match(&I, m_c_Add(m_c_UMin(m_Value(X), m_Not(m_Value(Y))), + m_Deferred(Y)))) + return CallInst::Create(getUAddSat(), { X, Y }); + + // add (umin X, ~C), C --> uaddsat X, C + const APInt *C, *NotC; + if (match(&I, m_Add(m_UMin(m_Value(X), m_APInt(NotC)), m_APInt(C))) && + *C == ~*NotC) + return CallInst::Create(getUAddSat(), { X, ConstantInt::get(Ty, *C) }); + + return nullptr; +} + +Instruction * +InstCombiner::canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract( + BinaryOperator &I) { + assert((I.getOpcode() == Instruction::Add || + I.getOpcode() == Instruction::Or || + I.getOpcode() == Instruction::Sub) && + "Expecting add/or/sub instruction"); + + // We have a subtraction/addition between a (potentially truncated) *logical* + // right-shift of X and a "select". + Value *X, *Select; + Instruction *LowBitsToSkip, *Extract; + if (!match(&I, m_c_BinOp(m_TruncOrSelf(m_CombineAnd( + m_LShr(m_Value(X), m_Instruction(LowBitsToSkip)), + m_Instruction(Extract))), + m_Value(Select)))) + return nullptr; + + // `add`/`or` is commutative; but for `sub`, "select" *must* be on RHS. + if (I.getOpcode() == Instruction::Sub && I.getOperand(1) != Select) + return nullptr; + + Type *XTy = X->getType(); + bool HadTrunc = I.getType() != XTy; + + // If there was a truncation of extracted value, then we'll need to produce + // one extra instruction, so we need to ensure one instruction will go away. + if (HadTrunc && !match(&I, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; + + // Extraction should extract high NBits bits, with shift amount calculated as: + // low bits to skip = shift bitwidth - high bits to extract + // The shift amount itself may be extended, and we need to look past zero-ext + // when matching NBits, that will matter for matching later. + Constant *C; + Value *NBits; + if (!match( + LowBitsToSkip, + m_ZExtOrSelf(m_Sub(m_Constant(C), m_ZExtOrSelf(m_Value(NBits))))) || + !match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(C->getType()->getScalarSizeInBits(), + X->getType()->getScalarSizeInBits())))) + return nullptr; + + // Sign-extending value can be zero-extended if we `sub`tract it, + // or sign-extended otherwise. 
+ auto SkipExtInMagic = [&I](Value *&V) { + if (I.getOpcode() == Instruction::Sub) + match(V, m_ZExtOrSelf(m_Value(V))); + else + match(V, m_SExtOrSelf(m_Value(V))); + }; + + // Now, finally validate the sign-extending magic. + // `select` itself may be appropriately extended, look past that. + SkipExtInMagic(Select); + + ICmpInst::Predicate Pred; + const APInt *Thr; + Value *SignExtendingValue, *Zero; + bool ShouldSignext; + // It must be a select between two values we will later establish to be a + // sign-extending value and a zero constant. The condition guarding the + // sign-extension must be based on a sign bit of the same X we had in `lshr`. + if (!match(Select, m_Select(m_ICmp(Pred, m_Specific(X), m_APInt(Thr)), + m_Value(SignExtendingValue), m_Value(Zero))) || + !isSignBitCheck(Pred, *Thr, ShouldSignext)) + return nullptr; + + // icmp-select pair is commutative. + if (!ShouldSignext) + std::swap(SignExtendingValue, Zero); + + // If we should not perform sign-extension then we must add/or/subtract zero. + if (!match(Zero, m_Zero())) + return nullptr; + // Otherwise, it should be some constant, left-shifted by the same NBits we + // had in `lshr`. Said left-shift can also be appropriately extended. + // Again, we must look past zero-ext when looking for NBits. + SkipExtInMagic(SignExtendingValue); + Constant *SignExtendingValueBaseConstant; + if (!match(SignExtendingValue, + m_Shl(m_Constant(SignExtendingValueBaseConstant), + m_ZExtOrSelf(m_Specific(NBits))))) + return nullptr; + // If we `sub`, then the constant should be one, else it should be all-ones. + if (I.getOpcode() == Instruction::Sub + ? !match(SignExtendingValueBaseConstant, m_One()) + : !match(SignExtendingValueBaseConstant, m_AllOnes())) + return nullptr; + + auto *NewAShr = BinaryOperator::CreateAShr(X, LowBitsToSkip, + Extract->getName() + ".sext"); + NewAShr->copyIRFlags(Extract); // Preserve `exact`-ness. + if (!HadTrunc) + return NewAShr; + + Builder.Insert(NewAShr); + return TruncInst::CreateTruncOrBitCast(NewAShr, I.getType()); +} + +Instruction *InstCombiner::visitAdd(BinaryOperator &I) { + if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1), + I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (SimplifyAssociativeOrCommutative(I)) + return &I; + + if (Instruction *X = foldVectorBinop(I)) + return X; + + // (A*B)+(A*C) -> A*(B+C) etc + if (Value *V = SimplifyUsingDistributiveLaws(I)) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldAddWithConstant(I)) + return X; + + if (Instruction *X = foldNoWrapAdd(I, Builder)) + return X; + + // FIXME: This should be moved into the above helper function to allow these + // transforms for general constant or constant splat vectors. + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + Type *Ty = I.getType(); + if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) { + Value *XorLHS = nullptr; ConstantInt *XorRHS = nullptr; + if (match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) { + unsigned TySizeBits = Ty->getScalarSizeInBits(); + const APInt &RHSVal = CI->getValue(); + unsigned ExtendAmt = 0; + // If we have ADD(XOR(AND(X, 0xFF), 0x80), 0xF..F80), it's a sext. + // If we have ADD(XOR(AND(X, 0xFF), 0xF..F80), 0x80), it's a sext. 
+ if (XorRHS->getValue() == -RHSVal) { + if (RHSVal.isPowerOf2()) + ExtendAmt = TySizeBits - RHSVal.logBase2() - 1; + else if (XorRHS->getValue().isPowerOf2()) + ExtendAmt = TySizeBits - XorRHS->getValue().logBase2() - 1; + } + + if (ExtendAmt) { + APInt Mask = APInt::getHighBitsSet(TySizeBits, ExtendAmt); + if (!MaskedValueIsZero(XorLHS, Mask, 0, &I)) + ExtendAmt = 0; + } + + if (ExtendAmt) { + Constant *ShAmt = ConstantInt::get(Ty, ExtendAmt); + Value *NewShl = Builder.CreateShl(XorLHS, ShAmt, "sext"); + return BinaryOperator::CreateAShr(NewShl, ShAmt); + } + + // If this is a xor that was canonicalized from a sub, turn it back into + // a sub and fuse this add with it. + if (LHS->hasOneUse() && (XorRHS->getValue()+1).isPowerOf2()) { + KnownBits LHSKnown = computeKnownBits(XorLHS, 0, &I); + if ((XorRHS->getValue() | LHSKnown.Zero).isAllOnesValue()) + return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI), + XorLHS); + } + // (X + signmask) + C could have gotten canonicalized to (X^signmask) + C, + // transform them into (X + (signmask ^ C)) + if (XorRHS->getValue().isSignMask()) + return BinaryOperator::CreateAdd(XorLHS, + ConstantExpr::getXor(XorRHS, CI)); + } + } + + if (Ty->isIntOrIntVectorTy(1)) + return BinaryOperator::CreateXor(LHS, RHS); + + // X + X --> X << 1 + if (LHS == RHS) { + auto *Shl = BinaryOperator::CreateShl(LHS, ConstantInt::get(Ty, 1)); + Shl->setHasNoSignedWrap(I.hasNoSignedWrap()); + Shl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); + return Shl; + } + + Value *A, *B; + if (match(LHS, m_Neg(m_Value(A)))) { + // -A + -B --> -(A + B) + if (match(RHS, m_Neg(m_Value(B)))) + return BinaryOperator::CreateNeg(Builder.CreateAdd(A, B)); + + // -A + B --> B - A + return BinaryOperator::CreateSub(RHS, A); + } + + // Canonicalize sext to zext for better value tracking potential. + // add A, sext(B) --> sub A, zext(B) + if (match(&I, m_c_Add(m_Value(A), m_OneUse(m_SExt(m_Value(B))))) && + B->getType()->isIntOrIntVectorTy(1)) + return BinaryOperator::CreateSub(A, Builder.CreateZExt(B, Ty)); + + // A + -B --> A - B + if (match(RHS, m_Neg(m_Value(B)))) + return BinaryOperator::CreateSub(LHS, B); + + if (Value *V = checkForNegativeOperand(I, Builder)) + return replaceInstUsesWith(I, V); + + // (A + 1) + ~B --> A - B + // ~B + (A + 1) --> A - B + // (~B + A) + 1 --> A - B + // (A + ~B) + 1 --> A - B + if (match(&I, m_c_BinOp(m_Add(m_Value(A), m_One()), m_Not(m_Value(B)))) || + match(&I, m_BinOp(m_c_Add(m_Not(m_Value(B)), m_Value(A)), m_One()))) + return BinaryOperator::CreateSub(A, B); + + // X % C0 + (( X / C0 ) % C1) * C0 => X % (C0 * C1) + if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V); + + // A+B --> A|B iff A and B have no bits set in common. + if (haveNoCommonBitsSet(LHS, RHS, DL, &AC, &I, &DT)) + return BinaryOperator::CreateOr(LHS, RHS); + + // FIXME: We already did a check for ConstantInt RHS above this. + // FIXME: Is this pattern covered by another fold? No regression tests fail on + // removal. + if (ConstantInt *CRHS = dyn_cast<ConstantInt>(RHS)) { + // (X & FF00) + xx00 -> (X+xx00) & FF00 + Value *X; + ConstantInt *C2; + if (LHS->hasOneUse() && + match(LHS, m_And(m_Value(X), m_ConstantInt(C2))) && + CRHS->getValue() == (CRHS->getValue() & C2->getValue())) { + // See if all bits from the first bit set in the Add RHS up are included + // in the mask. First, get the rightmost bit. + const APInt &AddRHSV = CRHS->getValue(); + + // Form a mask of all bits from the lowest bit added through the top. 
+ APInt AddRHSHighBits(~((AddRHSV & -AddRHSV)-1)); + + // See if the and mask includes all of these bits. + APInt AddRHSHighBitsAnd(AddRHSHighBits & C2->getValue()); + + if (AddRHSHighBits == AddRHSHighBitsAnd) { + // Okay, the xform is safe. Insert the new add pronto. + Value *NewAdd = Builder.CreateAdd(X, CRHS, LHS->getName()); + return BinaryOperator::CreateAnd(NewAdd, C2); + } + } + } + + // add (select X 0 (sub n A)) A --> select X A n + { + SelectInst *SI = dyn_cast<SelectInst>(LHS); + Value *A = RHS; + if (!SI) { + SI = dyn_cast<SelectInst>(RHS); + A = LHS; + } + if (SI && SI->hasOneUse()) { + Value *TV = SI->getTrueValue(); + Value *FV = SI->getFalseValue(); + Value *N; + + // Can we fold the add into the argument of the select? + // We check both true and false select arguments for a matching subtract. + if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A)))) + // Fold the add into the true select value. + return SelectInst::Create(SI->getCondition(), N, A); + + if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A)))) + // Fold the add into the false select value. + return SelectInst::Create(SI->getCondition(), A, N); + } + } + + if (Instruction *Ext = narrowMathIfNoOverflow(I)) + return Ext; + + // (add (xor A, B) (and A, B)) --> (or A, B) + // (add (and A, B) (xor A, B)) --> (or A, B) + if (match(&I, m_c_BinOp(m_Xor(m_Value(A), m_Value(B)), + m_c_And(m_Deferred(A), m_Deferred(B))))) + return BinaryOperator::CreateOr(A, B); + + // (add (or A, B) (and A, B)) --> (add A, B) + // (add (and A, B) (or A, B)) --> (add A, B) + if (match(&I, m_c_BinOp(m_Or(m_Value(A), m_Value(B)), + m_c_And(m_Deferred(A), m_Deferred(B))))) { + I.setOperand(0, A); + I.setOperand(1, B); + return &I; + } + + // TODO(jingyue): Consider willNotOverflowSignedAdd and + // willNotOverflowUnsignedAdd to reduce the number of invocations of + // computeKnownBits. + bool Changed = false; + if (!I.hasNoSignedWrap() && willNotOverflowSignedAdd(LHS, RHS, I)) { + Changed = true; + I.setHasNoSignedWrap(true); + } + if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) { + Changed = true; + I.setHasNoUnsignedWrap(true); + } + + if (Instruction *V = canonicalizeLowbitMask(I, Builder)) + return V; + + if (Instruction *V = + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) + return V; + + if (Instruction *SatAdd = foldToUnsignedSaturatedAdd(I)) + return SatAdd; + + return Changed ? &I : nullptr; +} + +/// Eliminate an op from a linear interpolation (lerp) pattern. +static Instruction *factorizeLerp(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *X, *Y, *Z; + if (!match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_Value(Y), + m_OneUse(m_FSub(m_FPOne(), + m_Value(Z))))), + m_OneUse(m_c_FMul(m_Value(X), m_Deferred(Z)))))) + return nullptr; + + // (Y * (1.0 - Z)) + (X * Z) --> Y + Z * (X - Y) [8 commuted variants] + Value *XY = Builder.CreateFSubFMF(X, Y, &I); + Value *MulZ = Builder.CreateFMulFMF(Z, XY, &I); + return BinaryOperator::CreateFAddFMF(Y, MulZ, &I); +} + +/// Factor a common operand out of fadd/fsub of fmul/fdiv. 
+static Instruction *factorizeFAddFSub(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + assert((I.getOpcode() == Instruction::FAdd || + I.getOpcode() == Instruction::FSub) && "Expecting fadd/fsub"); + assert(I.hasAllowReassoc() && I.hasNoSignedZeros() && + "FP factorization requires FMF"); + + if (Instruction *Lerp = factorizeLerp(I, Builder)) + return Lerp; + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + Value *X, *Y, *Z; + bool IsFMul; + if ((match(Op0, m_OneUse(m_FMul(m_Value(X), m_Value(Z)))) && + match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))) || + (match(Op0, m_OneUse(m_FMul(m_Value(Z), m_Value(X)))) && + match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z)))))) + IsFMul = true; + else if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Z)))) && + match(Op1, m_OneUse(m_FDiv(m_Value(Y), m_Specific(Z))))) + IsFMul = false; + else + return nullptr; + + // (X * Z) + (Y * Z) --> (X + Y) * Z + // (X * Z) - (Y * Z) --> (X - Y) * Z + // (X / Z) + (Y / Z) --> (X + Y) / Z + // (X / Z) - (Y / Z) --> (X - Y) / Z + bool IsFAdd = I.getOpcode() == Instruction::FAdd; + Value *XY = IsFAdd ? Builder.CreateFAddFMF(X, Y, &I) + : Builder.CreateFSubFMF(X, Y, &I); + + // Bail out if we just created a denormal constant. + // TODO: This is copied from a previous implementation. Is it necessary? + const APFloat *C; + if (match(XY, m_APFloat(C)) && !C->isNormal()) + return nullptr; + + return IsFMul ? BinaryOperator::CreateFMulFMF(XY, Z, &I) + : BinaryOperator::CreateFDivFMF(XY, Z, &I); +} + +Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { + if (Value *V = SimplifyFAddInst(I.getOperand(0), I.getOperand(1), + I.getFastMathFlags(), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (SimplifyAssociativeOrCommutative(I)) + return &I; + + if (Instruction *X = foldVectorBinop(I)) + return X; + + if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I)) + return FoldedFAdd; + + // (-X) + Y --> Y - X + Value *X, *Y; + if (match(&I, m_c_FAdd(m_FNeg(m_Value(X)), m_Value(Y)))) + return BinaryOperator::CreateFSubFMF(Y, X, &I); + + // Similar to above, but look through fmul/fdiv for the negated term. + // (-X * Y) + Z --> Z - (X * Y) [4 commuted variants] + Value *Z; + if (match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))), + m_Value(Z)))) { + Value *XY = Builder.CreateFMulFMF(X, Y, &I); + return BinaryOperator::CreateFSubFMF(Z, XY, &I); + } + // (-X / Y) + Z --> Z - (X / Y) [2 commuted variants] + // (X / -Y) + Z --> Z - (X / Y) [2 commuted variants] + if (match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y))), + m_Value(Z))) || + match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))), + m_Value(Z)))) { + Value *XY = Builder.CreateFDivFMF(X, Y, &I); + return BinaryOperator::CreateFSubFMF(Z, XY, &I); + } + + // Check for (fadd double (sitofp x), y), see if we can merge this into an + // integer add followed by a promotion. + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) { + Value *LHSIntVal = LHSConv->getOperand(0); + Type *FPType = LHSConv->getType(); + + // TODO: This check is overly conservative. In many cases known bits + // analysis can tell us that the result of the addition has less significant + // bits than the integer type can hold. 
+ auto IsValidPromotion = [](Type *FTy, Type *ITy) { + Type *FScalarTy = FTy->getScalarType(); + Type *IScalarTy = ITy->getScalarType(); + + // Do we have enough bits in the significand to represent the result of + // the integer addition? + unsigned MaxRepresentableBits = + APFloat::semanticsPrecision(FScalarTy->getFltSemantics()); + return IScalarTy->getIntegerBitWidth() <= MaxRepresentableBits; + }; + + // (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst)) + // ... if the constant fits in the integer value. This is useful for things + // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer + // requires a constant pool load, and generally allows the add to be better + // instcombined. + if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) + if (IsValidPromotion(FPType, LHSIntVal->getType())) { + Constant *CI = + ConstantExpr::getFPToSI(CFP, LHSIntVal->getType()); + if (LHSConv->hasOneUse() && + ConstantExpr::getSIToFP(CI, I.getType()) == CFP && + willNotOverflowSignedAdd(LHSIntVal, CI, I)) { + // Insert the new integer add. + Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, CI, "addconv"); + return new SIToFPInst(NewAdd, I.getType()); + } + } + + // (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y)) + if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) { + Value *RHSIntVal = RHSConv->getOperand(0); + // It's enough to check LHS types only because we require int types to + // be the same for this transform. + if (IsValidPromotion(FPType, LHSIntVal->getType())) { + // Only do this if x/y have the same type, if at least one of them has a + // single use (so we don't increase the number of int->fp conversions), + // and if the integer add will not overflow. + if (LHSIntVal->getType() == RHSIntVal->getType() && + (LHSConv->hasOneUse() || RHSConv->hasOneUse()) && + willNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) { + // Insert the new integer add. + Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, RHSIntVal, "addconv"); + return new SIToFPInst(NewAdd, I.getType()); + } + } + } + } + + // Handle specials cases for FAdd with selects feeding the operation + if (Value *V = SimplifySelectsFeedingBinaryOp(I, LHS, RHS)) + return replaceInstUsesWith(I, V); + + if (I.hasAllowReassoc() && I.hasNoSignedZeros()) { + if (Instruction *F = factorizeFAddFSub(I, Builder)) + return F; + if (Value *V = FAddCombine(Builder).simplify(&I)) + return replaceInstUsesWith(I, V); + } + + return nullptr; +} + +/// Optimize pointer differences into the same array into a size. Consider: +/// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer +/// operands to the ptrtoint instructions for the LHS/RHS of the subtract. +Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, + Type *Ty) { + // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize + // this. + bool Swapped = false; + GEPOperator *GEP1 = nullptr, *GEP2 = nullptr; + + // For now we require one side to be the base pointer "A" or a constant + // GEP derived from it. + if (GEPOperator *LHSGEP = dyn_cast<GEPOperator>(LHS)) { + // (gep X, ...) - X + if (LHSGEP->getOperand(0) == RHS) { + GEP1 = LHSGEP; + Swapped = false; + } else if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) { + // (gep X, ...) - (gep X, ...) + if (LHSGEP->getOperand(0)->stripPointerCasts() == + RHSGEP->getOperand(0)->stripPointerCasts()) { + GEP2 = RHSGEP; + GEP1 = LHSGEP; + Swapped = false; + } + } + } + + if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) { + // X - (gep X, ...) 
+ if (RHSGEP->getOperand(0) == LHS) { + GEP1 = RHSGEP; + Swapped = true; + } else if (GEPOperator *LHSGEP = dyn_cast<GEPOperator>(LHS)) { + // (gep X, ...) - (gep X, ...) + if (RHSGEP->getOperand(0)->stripPointerCasts() == + LHSGEP->getOperand(0)->stripPointerCasts()) { + GEP2 = LHSGEP; + GEP1 = RHSGEP; + Swapped = true; + } + } + } + + if (!GEP1) + // No GEP found. + return nullptr; + + if (GEP2) { + // (gep X, ...) - (gep X, ...) + // + // Avoid duplicating the arithmetic if there are more than one non-constant + // indices between the two GEPs and either GEP has a non-constant index and + // multiple users. If zero non-constant index, the result is a constant and + // there is no duplication. If one non-constant index, the result is an add + // or sub with a constant, which is no larger than the original code, and + // there's no duplicated arithmetic, even if either GEP has multiple + // users. If more than one non-constant indices combined, as long as the GEP + // with at least one non-constant index doesn't have multiple users, there + // is no duplication. + unsigned NumNonConstantIndices1 = GEP1->countNonConstantIndices(); + unsigned NumNonConstantIndices2 = GEP2->countNonConstantIndices(); + if (NumNonConstantIndices1 + NumNonConstantIndices2 > 1 && + ((NumNonConstantIndices1 > 0 && !GEP1->hasOneUse()) || + (NumNonConstantIndices2 > 0 && !GEP2->hasOneUse()))) { + return nullptr; + } + } + + // Emit the offset of the GEP and an intptr_t. + Value *Result = EmitGEPOffset(GEP1); + + // If we had a constant expression GEP on the other side offsetting the + // pointer, subtract it from the offset we have. + if (GEP2) { + Value *Offset = EmitGEPOffset(GEP2); + Result = Builder.CreateSub(Result, Offset); + } + + // If we have p - gep(p, ...) then we have to negate the result. + if (Swapped) + Result = Builder.CreateNeg(Result, "diff.neg"); + + return Builder.CreateIntCast(Result, Ty, true); +} + +Instruction *InstCombiner::visitSub(BinaryOperator &I) { + if (Value *V = SimplifySubInst(I.getOperand(0), I.getOperand(1), + I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldVectorBinop(I)) + return X; + + // (A*B)-(A*C) -> A*(B-C) etc + if (Value *V = SimplifyUsingDistributiveLaws(I)) + return replaceInstUsesWith(I, V); + + // If this is a 'B = x-(-A)', change to B = x+A. + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = dyn_castNegVal(Op1)) { + BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V); + + if (const auto *BO = dyn_cast<BinaryOperator>(Op1)) { + assert(BO->getOpcode() == Instruction::Sub && + "Expected a subtraction operator!"); + if (BO->hasNoSignedWrap() && I.hasNoSignedWrap()) + Res->setHasNoSignedWrap(true); + } else { + if (cast<Constant>(Op1)->isNotMinSignedValue() && I.hasNoSignedWrap()) + Res->setHasNoSignedWrap(true); + } + + return Res; + } + + if (I.getType()->isIntOrIntVectorTy(1)) + return BinaryOperator::CreateXor(Op0, Op1); + + // Replace (-1 - A) with (~A). 
+ if (match(Op0, m_AllOnes())) + return BinaryOperator::CreateNot(Op1); + + // (~X) - (~Y) --> Y - X + Value *X, *Y; + if (match(Op0, m_Not(m_Value(X))) && match(Op1, m_Not(m_Value(Y)))) + return BinaryOperator::CreateSub(Y, X); + + // (X + -1) - Y --> ~Y + X + if (match(Op0, m_OneUse(m_Add(m_Value(X), m_AllOnes())))) + return BinaryOperator::CreateAdd(Builder.CreateNot(Op1), X); + + // Y - (X + 1) --> ~X + Y + if (match(Op1, m_OneUse(m_Add(m_Value(X), m_One())))) + return BinaryOperator::CreateAdd(Builder.CreateNot(X), Op0); + + // Y - ~X --> (X + 1) + Y + if (match(Op1, m_OneUse(m_Not(m_Value(X))))) { + return BinaryOperator::CreateAdd( + Builder.CreateAdd(Op0, ConstantInt::get(I.getType(), 1)), X); + } + + if (Constant *C = dyn_cast<Constant>(Op0)) { + bool IsNegate = match(C, m_ZeroInt()); + Value *X; + if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) { + // 0 - (zext bool) --> sext bool + // C - (zext bool) --> bool ? C - 1 : C + if (IsNegate) + return CastInst::CreateSExtOrBitCast(X, I.getType()); + return SelectInst::Create(X, SubOne(C), C); + } + if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) { + // 0 - (sext bool) --> zext bool + // C - (sext bool) --> bool ? C + 1 : C + if (IsNegate) + return CastInst::CreateZExtOrBitCast(X, I.getType()); + return SelectInst::Create(X, AddOne(C), C); + } + + // C - ~X == X + (1+C) + if (match(Op1, m_Not(m_Value(X)))) + return BinaryOperator::CreateAdd(X, AddOne(C)); + + // Try to fold constant sub into select arguments. + if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) + if (Instruction *R = FoldOpIntoSelect(I, SI)) + return R; + + // Try to fold constant sub into PHI values. + if (PHINode *PN = dyn_cast<PHINode>(Op1)) + if (Instruction *R = foldOpIntoPhi(I, PN)) + return R; + + Constant *C2; + + // C-(C2-X) --> X+(C-C2) + if (match(Op1, m_Sub(m_Constant(C2), m_Value(X)))) + return BinaryOperator::CreateAdd(X, ConstantExpr::getSub(C, C2)); + + // C-(X+C2) --> (C-C2)-X + if (match(Op1, m_Add(m_Value(X), m_Constant(C2)))) + return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X); + } + + const APInt *Op0C; + if (match(Op0, m_APInt(Op0C))) { + + if (Op0C->isNullValue()) { + Value *Op1Wide; + match(Op1, m_TruncOrSelf(m_Value(Op1Wide))); + bool HadTrunc = Op1Wide != Op1; + bool NoTruncOrTruncIsOneUse = !HadTrunc || Op1->hasOneUse(); + unsigned BitWidth = Op1Wide->getType()->getScalarSizeInBits(); + + Value *X; + const APInt *ShAmt; + // -(X >>u 31) -> (X >>s 31) + if (NoTruncOrTruncIsOneUse && + match(Op1Wide, m_LShr(m_Value(X), m_APInt(ShAmt))) && + *ShAmt == BitWidth - 1) { + Value *ShAmtOp = cast<Instruction>(Op1Wide)->getOperand(1); + Instruction *NewShift = BinaryOperator::CreateAShr(X, ShAmtOp); + NewShift->copyIRFlags(Op1Wide); + if (!HadTrunc) + return NewShift; + Builder.Insert(NewShift); + return TruncInst::CreateTruncOrBitCast(NewShift, Op1->getType()); + } + // -(X >>s 31) -> (X >>u 31) + if (NoTruncOrTruncIsOneUse && + match(Op1Wide, m_AShr(m_Value(X), m_APInt(ShAmt))) && + *ShAmt == BitWidth - 1) { + Value *ShAmtOp = cast<Instruction>(Op1Wide)->getOperand(1); + Instruction *NewShift = BinaryOperator::CreateLShr(X, ShAmtOp); + NewShift->copyIRFlags(Op1Wide); + if (!HadTrunc) + return NewShift; + Builder.Insert(NewShift); + return TruncInst::CreateTruncOrBitCast(NewShift, Op1->getType()); + } + + if (!HadTrunc && Op1->hasOneUse()) { + Value *LHS, *RHS; + SelectPatternFlavor SPF = matchSelectPattern(Op1, LHS, RHS).Flavor; + if (SPF == SPF_ABS || SPF == SPF_NABS) { + // This is a 
negate of an ABS/NABS pattern. Just swap the operands + // of the select. + cast<SelectInst>(Op1)->swapValues(); + // Don't swap prof metadata, we didn't change the branch behavior. + return replaceInstUsesWith(I, Op1); + } + } + } + + // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known + // zero. + if (Op0C->isMask()) { + KnownBits RHSKnown = computeKnownBits(Op1, 0, &I); + if ((*Op0C | RHSKnown.Zero).isAllOnesValue()) + return BinaryOperator::CreateXor(Op1, Op0); + } + } + + { + Value *Y; + // X-(X+Y) == -Y X-(Y+X) == -Y + if (match(Op1, m_c_Add(m_Specific(Op0), m_Value(Y)))) + return BinaryOperator::CreateNeg(Y); + + // (X-Y)-X == -Y + if (match(Op0, m_Sub(m_Specific(Op1), m_Value(Y)))) + return BinaryOperator::CreateNeg(Y); + } + + // (sub (or A, B) (and A, B)) --> (xor A, B) + { + Value *A, *B; + if (match(Op1, m_And(m_Value(A), m_Value(B))) && + match(Op0, m_c_Or(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateXor(A, B); + } + + // (sub (and A, B) (or A, B)) --> neg (xor A, B) + { + Value *A, *B; + if (match(Op0, m_And(m_Value(A), m_Value(B))) && + match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) && + (Op0->hasOneUse() || Op1->hasOneUse())) + return BinaryOperator::CreateNeg(Builder.CreateXor(A, B)); + } + + // (sub (or A, B), (xor A, B)) --> (and A, B) + { + Value *A, *B; + if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && + match(Op0, m_c_Or(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateAnd(A, B); + } + + // (sub (xor A, B) (or A, B)) --> neg (and A, B) + { + Value *A, *B; + if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && + match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) && + (Op0->hasOneUse() || Op1->hasOneUse())) + return BinaryOperator::CreateNeg(Builder.CreateAnd(A, B)); + } + + { + Value *Y; + // ((X | Y) - X) --> (~X & Y) + if (match(Op0, m_OneUse(m_c_Or(m_Value(Y), m_Specific(Op1))))) + return BinaryOperator::CreateAnd( + Y, Builder.CreateNot(Op1, Op1->getName() + ".not")); + } + + if (Op1->hasOneUse()) { + Value *X = nullptr, *Y = nullptr, *Z = nullptr; + Constant *C = nullptr; + + // (X - (Y - Z)) --> (X + (Z - Y)). + if (match(Op1, m_Sub(m_Value(Y), m_Value(Z)))) + return BinaryOperator::CreateAdd(Op0, + Builder.CreateSub(Z, Y, Op1->getName())); + + // (X - (X & Y)) --> (X & ~Y) + if (match(Op1, m_c_And(m_Value(Y), m_Specific(Op0)))) + return BinaryOperator::CreateAnd(Op0, + Builder.CreateNot(Y, Y->getName() + ".not")); + + // 0 - (X sdiv C) -> (X sdiv -C) provided the negation doesn't overflow. + // TODO: This could be extended to match arbitrary vector constants. + const APInt *DivC; + if (match(Op0, m_Zero()) && match(Op1, m_SDiv(m_Value(X), m_APInt(DivC))) && + !DivC->isMinSignedValue() && *DivC != 1) { + Constant *NegDivC = ConstantInt::get(I.getType(), -(*DivC)); + Instruction *BO = BinaryOperator::CreateSDiv(X, NegDivC); + BO->setIsExact(cast<BinaryOperator>(Op1)->isExact()); + return BO; + } + + // 0 - (X << Y) -> (-X << Y) when X is freely negatable. + if (match(Op1, m_Shl(m_Value(X), m_Value(Y))) && match(Op0, m_Zero())) + if (Value *XNeg = dyn_castNegVal(X)) + return BinaryOperator::CreateShl(XNeg, Y); + + // Subtracting -1/0 is the same as adding 1/0: + // sub [nsw] Op0, sext(bool Y) -> add [nsw] Op0, zext(bool Y) + // 'nuw' is dropped in favor of the canonical form. 
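+    // An illustrative IR instance (a sketch; %op0 and %y are placeholder names):
+    //   %s = sext i1 %y to i32       ; %s is 0 or -1
+    //   %r = sub nsw i32 %op0, %s
+    // becomes
+    //   %z = zext i1 %y to i32       ; %z is 0 or 1
+    //   %r = add nsw i32 %op0, %z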
+ if (match(Op1, m_SExt(m_Value(Y))) && + Y->getType()->getScalarSizeInBits() == 1) { + Value *Zext = Builder.CreateZExt(Y, I.getType()); + BinaryOperator *Add = BinaryOperator::CreateAdd(Op0, Zext); + Add->setHasNoSignedWrap(I.hasNoSignedWrap()); + return Add; + } + + // X - A*-B -> X + A*B + // X - -A*B -> X + A*B + Value *A, *B; + if (match(Op1, m_c_Mul(m_Value(A), m_Neg(m_Value(B))))) + return BinaryOperator::CreateAdd(Op0, Builder.CreateMul(A, B)); + + // X - A*C -> X + A*-C + // No need to handle commuted multiply because multiply handling will + // ensure constant will be move to the right hand side. + if (match(Op1, m_Mul(m_Value(A), m_Constant(C))) && !isa<ConstantExpr>(C)) { + Value *NewMul = Builder.CreateMul(A, ConstantExpr::getNeg(C)); + return BinaryOperator::CreateAdd(Op0, NewMul); + } + } + + { + // ~A - Min/Max(~A, O) -> Max/Min(A, ~O) - A + // ~A - Min/Max(O, ~A) -> Max/Min(A, ~O) - A + // Min/Max(~A, O) - ~A -> A - Max/Min(A, ~O) + // Min/Max(O, ~A) - ~A -> A - Max/Min(A, ~O) + // So long as O here is freely invertible, this will be neutral or a win. + Value *LHS, *RHS, *A; + Value *NotA = Op0, *MinMax = Op1; + SelectPatternFlavor SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor; + if (!SelectPatternResult::isMinOrMax(SPF)) { + NotA = Op1; + MinMax = Op0; + SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor; + } + if (SelectPatternResult::isMinOrMax(SPF) && + match(NotA, m_Not(m_Value(A))) && (NotA == LHS || NotA == RHS)) { + if (NotA == LHS) + std::swap(LHS, RHS); + // LHS is now O above and expected to have at least 2 uses (the min/max) + // NotA is epected to have 2 uses from the min/max and 1 from the sub. + if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && + !NotA->hasNUsesOrMore(4)) { + // Note: We don't generate the inverse max/min, just create the not of + // it and let other folds do the rest. + Value *Not = Builder.CreateNot(MinMax); + if (NotA == Op0) + return BinaryOperator::CreateSub(Not, A); + else + return BinaryOperator::CreateSub(A, Not); + } + } + } + + // Optimize pointer differences into the same array into a size. Consider: + // &A[10] - &A[0]: we should compile this to "10". + Value *LHSOp, *RHSOp; + if (match(Op0, m_PtrToInt(m_Value(LHSOp))) && + match(Op1, m_PtrToInt(m_Value(RHSOp)))) + if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) + return replaceInstUsesWith(I, Res); + + // trunc(p)-trunc(q) -> trunc(p-q) + if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) && + match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp))))) + if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) + return replaceInstUsesWith(I, Res); + + // Canonicalize a shifty way to code absolute value to the common pattern. + // There are 2 potential commuted variants. + // We're relying on the fact that we only do this transform when the shift has + // exactly 2 uses and the xor has exactly 1 use (otherwise, we might increase + // instructions). + Value *A; + const APInt *ShAmt; + Type *Ty = I.getType(); + if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) && + Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 && + match(Op0, m_OneUse(m_c_Xor(m_Specific(A), m_Specific(Op1))))) { + // B = ashr i32 A, 31 ; smear the sign bit + // sub (xor A, B), B ; flip bits if negative and subtract -1 (add 1) + // --> (A < 0) ? -A : A + Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty)); + // Copy the nuw/nsw flags from the sub to the negate. 
+ Value *Neg = Builder.CreateNeg(A, "", I.hasNoUnsignedWrap(), + I.hasNoSignedWrap()); + return SelectInst::Create(Cmp, Neg, A); + } + + if (Instruction *V = + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) + return V; + + if (Instruction *Ext = narrowMathIfNoOverflow(I)) + return Ext; + + bool Changed = false; + if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) { + Changed = true; + I.setHasNoSignedWrap(true); + } + if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedSub(Op0, Op1, I)) { + Changed = true; + I.setHasNoUnsignedWrap(true); + } + + return Changed ? &I : nullptr; +} + +/// This eliminates floating-point negation in either 'fneg(X)' or +/// 'fsub(-0.0, X)' form by combining into a constant operand. +static Instruction *foldFNegIntoConstant(Instruction &I) { + Value *X; + Constant *C; + + // Fold negation into constant operand. This is limited with one-use because + // fneg is assumed better for analysis and cheaper in codegen than fmul/fdiv. + // -(X * C) --> X * (-C) + // FIXME: It's arguable whether these should be m_OneUse or not. The current + // belief is that the FNeg allows for better reassociation opportunities. + if (match(&I, m_FNeg(m_OneUse(m_FMul(m_Value(X), m_Constant(C)))))) + return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I); + // -(X / C) --> X / (-C) + if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Value(X), m_Constant(C)))))) + return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I); + // -(C / X) --> (-C) / X + if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Constant(C), m_Value(X)))))) + return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I); + + return nullptr; +} + +static Instruction *hoistFNegAboveFMulFDiv(Instruction &I, + InstCombiner::BuilderTy &Builder) { + Value *FNeg; + if (!match(&I, m_FNeg(m_Value(FNeg)))) + return nullptr; + + Value *X, *Y; + if (match(FNeg, m_OneUse(m_FMul(m_Value(X), m_Value(Y))))) + return BinaryOperator::CreateFMulFMF(Builder.CreateFNegFMF(X, &I), Y, &I); + + if (match(FNeg, m_OneUse(m_FDiv(m_Value(X), m_Value(Y))))) + return BinaryOperator::CreateFDivFMF(Builder.CreateFNegFMF(X, &I), Y, &I); + + return nullptr; +} + +Instruction *InstCombiner::visitFNeg(UnaryOperator &I) { + Value *Op = I.getOperand(0); + + if (Value *V = SimplifyFNegInst(Op, I.getFastMathFlags(), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldFNegIntoConstant(I)) + return X; + + Value *X, *Y; + + // If we can ignore the sign of zeros: -(X - Y) --> (Y - X) + if (I.hasNoSignedZeros() && + match(Op, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) + return BinaryOperator::CreateFSubFMF(Y, X, &I); + + if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder)) + return R; + + return nullptr; +} + +Instruction *InstCombiner::visitFSub(BinaryOperator &I) { + if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1), + I.getFastMathFlags(), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldVectorBinop(I)) + return X; + + // Subtraction from -0.0 is the canonical form of fneg. 
+ // fsub nsz 0, X ==> fsub nsz -0.0, X + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (I.hasNoSignedZeros() && match(Op0, m_PosZeroFP())) + return BinaryOperator::CreateFNegFMF(Op1, &I); + + if (Instruction *X = foldFNegIntoConstant(I)) + return X; + + if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder)) + return R; + + Value *X, *Y; + Constant *C; + + // If Op0 is not -0.0 or we can ignore -0.0: Z - (X - Y) --> Z + (Y - X) + // Canonicalize to fadd to make analysis easier. + // This can also help codegen because fadd is commutative. + // Note that if this fsub was really an fneg, the fadd with -0.0 will get + // killed later. We still limit that particular transform with 'hasOneUse' + // because an fneg is assumed better/cheaper than a generic fsub. + if (I.hasNoSignedZeros() || CannotBeNegativeZero(Op0, SQ.TLI)) { + if (match(Op1, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) { + Value *NewSub = Builder.CreateFSubFMF(Y, X, &I); + return BinaryOperator::CreateFAddFMF(Op0, NewSub, &I); + } + } + + if (isa<Constant>(Op0)) + if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) + if (Instruction *NV = FoldOpIntoSelect(I, SI)) + return NV; + + // X - C --> X + (-C) + // But don't transform constant expressions because there's an inverse fold + // for X + (-Y) --> X - Y. + if (match(Op1, m_Constant(C)) && !isa<ConstantExpr>(Op1)) + return BinaryOperator::CreateFAddFMF(Op0, ConstantExpr::getFNeg(C), &I); + + // X - (-Y) --> X + Y + if (match(Op1, m_FNeg(m_Value(Y)))) + return BinaryOperator::CreateFAddFMF(Op0, Y, &I); + + // Similar to above, but look through a cast of the negated value: + // X - (fptrunc(-Y)) --> X + fptrunc(Y) + Type *Ty = I.getType(); + if (match(Op1, m_OneUse(m_FPTrunc(m_FNeg(m_Value(Y)))))) + return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPTrunc(Y, Ty), &I); + + // X - (fpext(-Y)) --> X + fpext(Y) + if (match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y)))))) + return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPExt(Y, Ty), &I); + + // Similar to above, but look through fmul/fdiv of the negated value: + // Op0 - (-X * Y) --> Op0 + (X * Y) + // Op0 - (Y * -X) --> Op0 + (X * Y) + if (match(Op1, m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))))) { + Value *FMul = Builder.CreateFMulFMF(X, Y, &I); + return BinaryOperator::CreateFAddFMF(Op0, FMul, &I); + } + // Op0 - (-X / Y) --> Op0 + (X / Y) + // Op0 - (X / -Y) --> Op0 + (X / Y) + if (match(Op1, m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y)))) || + match(Op1, m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))))) { + Value *FDiv = Builder.CreateFDivFMF(X, Y, &I); + return BinaryOperator::CreateFAddFMF(Op0, FDiv, &I); + } + + // Handle special cases for FSub with selects feeding the operation + if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1)) + return replaceInstUsesWith(I, V); + + if (I.hasAllowReassoc() && I.hasNoSignedZeros()) { + // (Y - X) - Y --> -X + if (match(Op0, m_FSub(m_Specific(Op1), m_Value(X)))) + return BinaryOperator::CreateFNegFMF(X, &I); + + // Y - (X + Y) --> -X + // Y - (Y + X) --> -X + if (match(Op1, m_c_FAdd(m_Specific(Op0), m_Value(X)))) + return BinaryOperator::CreateFNegFMF(X, &I); + + // (X * C) - X --> X * (C - 1.0) + if (match(Op0, m_FMul(m_Specific(Op1), m_Constant(C)))) { + Constant *CSubOne = ConstantExpr::getFSub(C, ConstantFP::get(Ty, 1.0)); + return BinaryOperator::CreateFMulFMF(Op1, CSubOne, &I); + } + // X - (X * C) --> X * (1.0 - C) + if (match(Op1, m_FMul(m_Specific(Op0), m_Constant(C)))) { + Constant *OneSubC = ConstantExpr::getFSub(ConstantFP::get(Ty, 1.0), 
C); + return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I); + } + + if (Instruction *F = factorizeFAddFSub(I, Builder)) + return F; + + // TODO: This performs reassociative folds for FP ops. Some fraction of the + // functionality has been subsumed by simple pattern matching here and in + // InstSimplify. We should let a dedicated reassociation pass handle more + // complex pattern matching and remove this from InstCombine. + if (Value *V = FAddCombine(Builder).simplify(&I)) + return replaceInstUsesWith(I, V); + } + + return nullptr; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp new file mode 100644 index 000000000000..4a30b60ca931 --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -0,0 +1,3288 @@ +//===- InstCombineAndOrXor.cpp --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the visitAnd, visitOr, and visitXor functions. +// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm/Analysis/CmpInstAnalysis.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "instcombine" + +/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into +/// a four bit mask. +static unsigned getFCmpCode(FCmpInst::Predicate CC) { + assert(FCmpInst::FCMP_FALSE <= CC && CC <= FCmpInst::FCMP_TRUE && + "Unexpected FCmp predicate!"); + // Take advantage of the bit pattern of FCmpInst::Predicate here. + // U L G E + static_assert(FCmpInst::FCMP_FALSE == 0, ""); // 0 0 0 0 + static_assert(FCmpInst::FCMP_OEQ == 1, ""); // 0 0 0 1 + static_assert(FCmpInst::FCMP_OGT == 2, ""); // 0 0 1 0 + static_assert(FCmpInst::FCMP_OGE == 3, ""); // 0 0 1 1 + static_assert(FCmpInst::FCMP_OLT == 4, ""); // 0 1 0 0 + static_assert(FCmpInst::FCMP_OLE == 5, ""); // 0 1 0 1 + static_assert(FCmpInst::FCMP_ONE == 6, ""); // 0 1 1 0 + static_assert(FCmpInst::FCMP_ORD == 7, ""); // 0 1 1 1 + static_assert(FCmpInst::FCMP_UNO == 8, ""); // 1 0 0 0 + static_assert(FCmpInst::FCMP_UEQ == 9, ""); // 1 0 0 1 + static_assert(FCmpInst::FCMP_UGT == 10, ""); // 1 0 1 0 + static_assert(FCmpInst::FCMP_UGE == 11, ""); // 1 0 1 1 + static_assert(FCmpInst::FCMP_ULT == 12, ""); // 1 1 0 0 + static_assert(FCmpInst::FCMP_ULE == 13, ""); // 1 1 0 1 + static_assert(FCmpInst::FCMP_UNE == 14, ""); // 1 1 1 0 + static_assert(FCmpInst::FCMP_TRUE == 15, ""); // 1 1 1 1 + return CC; +} + +/// This is the complement of getICmpCode, which turns an opcode and two +/// operands into either a constant true or false, or a brand new ICmp +/// instruction. The sign is passed in to determine which kind of predicate to +/// use in the new icmp instruction. 
+static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS, + InstCombiner::BuilderTy &Builder) { + ICmpInst::Predicate NewPred; + if (Constant *TorF = getPredForICmpCode(Code, Sign, LHS->getType(), NewPred)) + return TorF; + return Builder.CreateICmp(NewPred, LHS, RHS); +} + +/// This is the complement of getFCmpCode, which turns an opcode and two +/// operands into either a FCmp instruction, or a true/false constant. +static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS, + InstCombiner::BuilderTy &Builder) { + const auto Pred = static_cast<FCmpInst::Predicate>(Code); + assert(FCmpInst::FCMP_FALSE <= Pred && Pred <= FCmpInst::FCMP_TRUE && + "Unexpected FCmp predicate!"); + if (Pred == FCmpInst::FCMP_FALSE) + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); + if (Pred == FCmpInst::FCMP_TRUE) + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1); + return Builder.CreateFCmp(Pred, LHS, RHS); +} + +/// Transform BITWISE_OP(BSWAP(A),BSWAP(B)) or +/// BITWISE_OP(BSWAP(A), Constant) to BSWAP(BITWISE_OP(A, B)) +/// \param I Binary operator to transform. +/// \return Pointer to node that must replace the original binary operator, or +/// null pointer if no transformation was made. +static Value *SimplifyBSwap(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + assert(I.isBitwiseLogicOp() && "Unexpected opcode for bswap simplifying"); + + Value *OldLHS = I.getOperand(0); + Value *OldRHS = I.getOperand(1); + + Value *NewLHS; + if (!match(OldLHS, m_BSwap(m_Value(NewLHS)))) + return nullptr; + + Value *NewRHS; + const APInt *C; + + if (match(OldRHS, m_BSwap(m_Value(NewRHS)))) { + // OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) ) + if (!OldLHS->hasOneUse() && !OldRHS->hasOneUse()) + return nullptr; + // NewRHS initialized by the matcher. + } else if (match(OldRHS, m_APInt(C))) { + // OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) ) + if (!OldLHS->hasOneUse()) + return nullptr; + NewRHS = ConstantInt::get(I.getType(), C->byteSwap()); + } else + return nullptr; + + Value *BinOp = Builder.CreateBinOp(I.getOpcode(), NewLHS, NewRHS); + Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap, + I.getType()); + return Builder.CreateCall(F, BinOp); +} + +/// This handles expressions of the form ((val OP C1) & C2). Where +/// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. +Instruction *InstCombiner::OptAndOp(BinaryOperator *Op, + ConstantInt *OpRHS, + ConstantInt *AndRHS, + BinaryOperator &TheAnd) { + Value *X = Op->getOperand(0); + + switch (Op->getOpcode()) { + default: break; + case Instruction::Add: + if (Op->hasOneUse()) { + // Adding a one to a single bit bit-field should be turned into an XOR + // of the bit. First thing to check is to see if this AND is with a + // single bit constant. + const APInt &AndRHSV = AndRHS->getValue(); + + // If there is only one bit set. + if (AndRHSV.isPowerOf2()) { + // Ok, at this point, we know that we are masking the result of the + // ADD down to exactly one bit. If the constant we are adding has + // no bits set below this bit, then we can eliminate the ADD. + const APInt& AddRHS = OpRHS->getValue(); + + // Check to see if any bits below the one bit set in AndRHSV are set. + if ((AddRHS & (AndRHSV - 1)).isNullValue()) { + // If not, the only thing that can effect the output of the AND is + // the bit specified by AndRHSV. If that bit is set, the effect of + // the XOR is to toggle the bit. If it is clear, then the ADD has + // no effect. 
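+          // For example (values are illustrative):
+          //   ((X + 8) & 4) --> (X & 4)          ; bit 2 of the add constant is clear
+          //   ((X + 4) & 4) --> ((X & 4) ^ 4)    ; bit 2 of the add constant is set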
+ if ((AddRHS & AndRHSV).isNullValue()) { // Bit is not set, noop + TheAnd.setOperand(0, X); + return &TheAnd; + } else { + // Pull the XOR out of the AND. + Value *NewAnd = Builder.CreateAnd(X, AndRHS); + NewAnd->takeName(Op); + return BinaryOperator::CreateXor(NewAnd, AndRHS); + } + } + } + } + break; + } + return nullptr; +} + +/// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise +/// (V < Lo || V >= Hi). This method expects that Lo < Hi. IsSigned indicates +/// whether to treat V, Lo, and Hi as signed or not. +Value *InstCombiner::insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi, + bool isSigned, bool Inside) { + assert((isSigned ? Lo.slt(Hi) : Lo.ult(Hi)) && + "Lo is not < Hi in range emission code!"); + + Type *Ty = V->getType(); + + // V >= Min && V < Hi --> V < Hi + // V < Min || V >= Hi --> V >= Hi + ICmpInst::Predicate Pred = Inside ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE; + if (isSigned ? Lo.isMinSignedValue() : Lo.isMinValue()) { + Pred = isSigned ? ICmpInst::getSignedPredicate(Pred) : Pred; + return Builder.CreateICmp(Pred, V, ConstantInt::get(Ty, Hi)); + } + + // V >= Lo && V < Hi --> V - Lo u< Hi - Lo + // V < Lo || V >= Hi --> V - Lo u>= Hi - Lo + Value *VMinusLo = + Builder.CreateSub(V, ConstantInt::get(Ty, Lo), V->getName() + ".off"); + Constant *HiMinusLo = ConstantInt::get(Ty, Hi - Lo); + return Builder.CreateICmp(Pred, VMinusLo, HiMinusLo); +} + +/// Classify (icmp eq (A & B), C) and (icmp ne (A & B), C) as matching patterns +/// that can be simplified. +/// One of A and B is considered the mask. The other is the value. This is +/// described as the "AMask" or "BMask" part of the enum. If the enum contains +/// only "Mask", then both A and B can be considered masks. If A is the mask, +/// then it was proven that (A & C) == C. This is trivial if C == A or C == 0. +/// If both A and C are constants, this proof is also easy. +/// For the following explanations, we assume that A is the mask. +/// +/// "AllOnes" declares that the comparison is true only if (A & B) == A or all +/// bits of A are set in B. +/// Example: (icmp eq (A & 3), 3) -> AMask_AllOnes +/// +/// "AllZeros" declares that the comparison is true only if (A & B) == 0 or all +/// bits of A are cleared in B. +/// Example: (icmp eq (A & 3), 0) -> Mask_AllZeroes +/// +/// "Mixed" declares that (A & B) == C and C might or might not contain any +/// number of one bits and zero bits. +/// Example: (icmp eq (A & 3), 1) -> AMask_Mixed +/// +/// "Not" means that in above descriptions "==" should be replaced by "!=". +/// Example: (icmp ne (A & 3), 3) -> AMask_NotAllOnes +/// +/// If the mask A contains a single bit, then the following is equivalent: +/// (icmp eq (A & B), A) equals (icmp ne (A & B), 0) +/// (icmp ne (A & B), A) equals (icmp eq (A & B), 0) +enum MaskedICmpType { + AMask_AllOnes = 1, + AMask_NotAllOnes = 2, + BMask_AllOnes = 4, + BMask_NotAllOnes = 8, + Mask_AllZeros = 16, + Mask_NotAllZeros = 32, + AMask_Mixed = 64, + AMask_NotMixed = 128, + BMask_Mixed = 256, + BMask_NotMixed = 512 +}; + +/// Return the set of patterns (from MaskedICmpType) that (icmp SCC (A & B), C) +/// satisfies. 
+static unsigned getMaskedICmpType(Value *A, Value *B, Value *C, + ICmpInst::Predicate Pred) { + ConstantInt *ACst = dyn_cast<ConstantInt>(A); + ConstantInt *BCst = dyn_cast<ConstantInt>(B); + ConstantInt *CCst = dyn_cast<ConstantInt>(C); + bool IsEq = (Pred == ICmpInst::ICMP_EQ); + bool IsAPow2 = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2()); + bool IsBPow2 = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2()); + unsigned MaskVal = 0; + if (CCst && CCst->isZero()) { + // if C is zero, then both A and B qualify as mask + MaskVal |= (IsEq ? (Mask_AllZeros | AMask_Mixed | BMask_Mixed) + : (Mask_NotAllZeros | AMask_NotMixed | BMask_NotMixed)); + if (IsAPow2) + MaskVal |= (IsEq ? (AMask_NotAllOnes | AMask_NotMixed) + : (AMask_AllOnes | AMask_Mixed)); + if (IsBPow2) + MaskVal |= (IsEq ? (BMask_NotAllOnes | BMask_NotMixed) + : (BMask_AllOnes | BMask_Mixed)); + return MaskVal; + } + + if (A == C) { + MaskVal |= (IsEq ? (AMask_AllOnes | AMask_Mixed) + : (AMask_NotAllOnes | AMask_NotMixed)); + if (IsAPow2) + MaskVal |= (IsEq ? (Mask_NotAllZeros | AMask_NotMixed) + : (Mask_AllZeros | AMask_Mixed)); + } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) { + MaskVal |= (IsEq ? AMask_Mixed : AMask_NotMixed); + } + + if (B == C) { + MaskVal |= (IsEq ? (BMask_AllOnes | BMask_Mixed) + : (BMask_NotAllOnes | BMask_NotMixed)); + if (IsBPow2) + MaskVal |= (IsEq ? (Mask_NotAllZeros | BMask_NotMixed) + : (Mask_AllZeros | BMask_Mixed)); + } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) { + MaskVal |= (IsEq ? BMask_Mixed : BMask_NotMixed); + } + + return MaskVal; +} + +/// Convert an analysis of a masked ICmp into its equivalent if all boolean +/// operations had the opposite sense. Since each "NotXXX" flag (recording !=) +/// is adjacent to the corresponding normal flag (recording ==), this just +/// involves swapping those bits over. +static unsigned conjugateICmpMask(unsigned Mask) { + unsigned NewMask; + NewMask = (Mask & (AMask_AllOnes | BMask_AllOnes | Mask_AllZeros | + AMask_Mixed | BMask_Mixed)) + << 1; + + NewMask |= (Mask & (AMask_NotAllOnes | BMask_NotAllOnes | Mask_NotAllZeros | + AMask_NotMixed | BMask_NotMixed)) + >> 1; + + return NewMask; +} + +// Adapts the external decomposeBitTestICmp for local use. +static bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, + Value *&X, Value *&Y, Value *&Z) { + APInt Mask; + if (!llvm::decomposeBitTestICmp(LHS, RHS, Pred, X, Mask)) + return false; + + Y = ConstantInt::get(X->getType(), Mask); + Z = ConstantInt::get(X->getType(), 0); + return true; +} + +/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E). +/// Return the pattern classes (from MaskedICmpType) for the left hand side and +/// the right hand side as a pair. +/// LHS and RHS are the left hand side and the right hand side ICmps and PredL +/// and PredR are their predicates, respectively. +static +Optional<std::pair<unsigned, unsigned>> +getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C, + Value *&D, Value *&E, ICmpInst *LHS, + ICmpInst *RHS, + ICmpInst::Predicate &PredL, + ICmpInst::Predicate &PredR) { + // vectors are not (yet?) supported. Don't support pointers either. + if (!LHS->getOperand(0)->getType()->isIntegerTy() || + !RHS->getOperand(0)->getType()->isIntegerTy()) + return None; + + // Here comes the tricky part: + // LHS might be of the form L11 & L12 == X, X == L21 & L22, + // and L11 & L12 == L21 & L22. The same goes for RHS. 
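+  // For example (a sketch), if LHS is (icmp eq (and %p, %q), %r) then
+  // L1 == (and %p, %q), L2 == %r, L11 == %p, and L12 == %q.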
+ // Now we must find those components L** and R**, that are equal, so + // that we can extract the parameters A, B, C, D, and E for the canonical + // above. + Value *L1 = LHS->getOperand(0); + Value *L2 = LHS->getOperand(1); + Value *L11, *L12, *L21, *L22; + // Check whether the icmp can be decomposed into a bit test. + if (decomposeBitTestICmp(L1, L2, PredL, L11, L12, L2)) { + L21 = L22 = L1 = nullptr; + } else { + // Look for ANDs in the LHS icmp. + if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) { + // Any icmp can be viewed as being trivially masked; if it allows us to + // remove one, it's worth it. + L11 = L1; + L12 = Constant::getAllOnesValue(L1->getType()); + } + + if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) { + L21 = L2; + L22 = Constant::getAllOnesValue(L2->getType()); + } + } + + // Bail if LHS was a icmp that can't be decomposed into an equality. + if (!ICmpInst::isEquality(PredL)) + return None; + + Value *R1 = RHS->getOperand(0); + Value *R2 = RHS->getOperand(1); + Value *R11, *R12; + bool Ok = false; + if (decomposeBitTestICmp(R1, R2, PredR, R11, R12, R2)) { + if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { + A = R11; + D = R12; + } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) { + A = R12; + D = R11; + } else { + return None; + } + E = R2; + R1 = nullptr; + Ok = true; + } else { + if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) { + // As before, model no mask as a trivial mask if it'll let us do an + // optimization. + R11 = R1; + R12 = Constant::getAllOnesValue(R1->getType()); + } + + if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { + A = R11; + D = R12; + E = R2; + Ok = true; + } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) { + A = R12; + D = R11; + E = R2; + Ok = true; + } + } + + // Bail if RHS was a icmp that can't be decomposed into an equality. + if (!ICmpInst::isEquality(PredR)) + return None; + + // Look for ANDs on the right side of the RHS icmp. + if (!Ok) { + if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) { + R11 = R2; + R12 = Constant::getAllOnesValue(R2->getType()); + } + + if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) { + A = R11; + D = R12; + E = R1; + Ok = true; + } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) { + A = R12; + D = R11; + E = R1; + Ok = true; + } else { + return None; + } + } + if (!Ok) + return None; + + if (L11 == A) { + B = L12; + C = L2; + } else if (L12 == A) { + B = L11; + C = L2; + } else if (L21 == A) { + B = L22; + C = L1; + } else if (L22 == A) { + B = L21; + C = L1; + } + + unsigned LeftType = getMaskedICmpType(A, B, C, PredL); + unsigned RightType = getMaskedICmpType(A, D, E, PredR); + return Optional<std::pair<unsigned, unsigned>>(std::make_pair(LeftType, RightType)); +} + +/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) into a single +/// (icmp(A & X) ==/!= Y), where the left-hand side is of type Mask_NotAllZeros +/// and the right hand side is of type BMask_Mixed. For example, +/// (icmp (A & 12) != 0) & (icmp (A & 15) == 8) -> (icmp (A & 15) == 8). +static Value * foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( + ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, + Value *A, Value *B, Value *C, Value *D, Value *E, + ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, + llvm::InstCombiner::BuilderTy &Builder) { + // We are given the canonical form: + // (icmp ne (A & B), 0) & (icmp eq (A & D), E). + // where D & E == E. 
+ // + // If IsAnd is false, we get it in negated form: + // (icmp eq (A & B), 0) | (icmp ne (A & D), E) -> + // !((icmp ne (A & B), 0) & (icmp eq (A & D), E)). + // + // We currently handle the case of B, C, D, E are constant. + // + ConstantInt *BCst = dyn_cast<ConstantInt>(B); + if (!BCst) + return nullptr; + ConstantInt *CCst = dyn_cast<ConstantInt>(C); + if (!CCst) + return nullptr; + ConstantInt *DCst = dyn_cast<ConstantInt>(D); + if (!DCst) + return nullptr; + ConstantInt *ECst = dyn_cast<ConstantInt>(E); + if (!ECst) + return nullptr; + + ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE; + + // Update E to the canonical form when D is a power of two and RHS is + // canonicalized as, + // (icmp ne (A & D), 0) -> (icmp eq (A & D), D) or + // (icmp ne (A & D), D) -> (icmp eq (A & D), 0). + if (PredR != NewCC) + ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst)); + + // If B or D is zero, skip because if LHS or RHS can be trivially folded by + // other folding rules and this pattern won't apply any more. + if (BCst->getValue() == 0 || DCst->getValue() == 0) + return nullptr; + + // If B and D don't intersect, ie. (B & D) == 0, no folding because we can't + // deduce anything from it. + // For example, + // (icmp ne (A & 12), 0) & (icmp eq (A & 3), 1) -> no folding. + if ((BCst->getValue() & DCst->getValue()) == 0) + return nullptr; + + // If the following two conditions are met: + // + // 1. mask B covers only a single bit that's not covered by mask D, that is, + // (B & (B ^ D)) is a power of 2 (in other words, B minus the intersection of + // B and D has only one bit set) and, + // + // 2. RHS (and E) indicates that the rest of B's bits are zero (in other + // words, the intersection of B and D is zero), that is, ((B & D) & E) == 0 + // + // then that single bit in B must be one and thus the whole expression can be + // folded to + // (A & (B | D)) == (B & (B ^ D)) | E. + // + // For example, + // (icmp ne (A & 12), 0) & (icmp eq (A & 7), 1) -> (icmp eq (A & 15), 9) + // (icmp ne (A & 15), 0) & (icmp eq (A & 7), 0) -> (icmp eq (A & 15), 8) + if ((((BCst->getValue() & DCst->getValue()) & ECst->getValue()) == 0) && + (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())).isPowerOf2()) { + APInt BorD = BCst->getValue() | DCst->getValue(); + APInt BandBxorDorE = (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())) | + ECst->getValue(); + Value *NewMask = ConstantInt::get(BCst->getType(), BorD); + Value *NewMaskedValue = ConstantInt::get(BCst->getType(), BandBxorDorE); + Value *NewAnd = Builder.CreateAnd(A, NewMask); + return Builder.CreateICmp(NewCC, NewAnd, NewMaskedValue); + } + + auto IsSubSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) { + return (C1->getValue() & C2->getValue()) == C1->getValue(); + }; + auto IsSuperSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) { + return (C1->getValue() & C2->getValue()) == C2->getValue(); + }; + + // In the following, we consider only the cases where B is a superset of D, B + // is a subset of D, or B == D because otherwise there's at least one bit + // covered by B but not D, in which case we can't deduce much from it, so + // no folding (aside from the single must-be-one bit case right above.) + // For example, + // (icmp ne (A & 14), 0) & (icmp eq (A & 3), 1) -> no folding. + if (!IsSubSetOrEqual(BCst, DCst) && !IsSuperSetOrEqual(BCst, DCst)) + return nullptr; + + // At this point, either B is a superset of D, B is a subset of D or B == D. 
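+  // (In bit terms, e.g.: B == 12 (0b1100) is a superset of D == 4 (0b0100),
+  // whereas B == 3 (0b0011) is a subset of D == 7 (0b0111).)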
+ + // If E is zero, if B is a subset of (or equal to) D, LHS and RHS contradict + // and the whole expression becomes false (or true if negated), otherwise, no + // folding. + // For example, + // (icmp ne (A & 3), 0) & (icmp eq (A & 7), 0) -> false. + // (icmp ne (A & 15), 0) & (icmp eq (A & 3), 0) -> no folding. + if (ECst->isZero()) { + if (IsSubSetOrEqual(BCst, DCst)) + return ConstantInt::get(LHS->getType(), !IsAnd); + return nullptr; + } + + // At this point, B, D, E aren't zero and (B & D) == B, (B & D) == D or B == + // D. If B is a superset of (or equal to) D, since E is not zero, LHS is + // subsumed by RHS (RHS implies LHS.) So the whole expression becomes + // RHS. For example, + // (icmp ne (A & 255), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8). + // (icmp ne (A & 15), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8). + if (IsSuperSetOrEqual(BCst, DCst)) + return RHS; + // Otherwise, B is a subset of D. If B and E have a common bit set, + // ie. (B & E) != 0, then LHS is subsumed by RHS. For example. + // (icmp ne (A & 12), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8). + assert(IsSubSetOrEqual(BCst, DCst) && "Precondition due to above code"); + if ((BCst->getValue() & ECst->getValue()) != 0) + return RHS; + // Otherwise, LHS and RHS contradict and the whole expression becomes false + // (or true if negated.) For example, + // (icmp ne (A & 7), 0) & (icmp eq (A & 15), 8) -> false. + // (icmp ne (A & 6), 0) & (icmp eq (A & 15), 8) -> false. + return ConstantInt::get(LHS->getType(), !IsAnd); +} + +/// Try to fold (icmp(A & B) ==/!= 0) &/| (icmp(A & D) ==/!= E) into a single +/// (icmp(A & X) ==/!= Y), where the left-hand side and the right hand side +/// aren't of the common mask pattern type. +static Value *foldLogOpOfMaskedICmpsAsymmetric( + ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, + Value *A, Value *B, Value *C, Value *D, Value *E, + ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, + unsigned LHSMask, unsigned RHSMask, + llvm::InstCombiner::BuilderTy &Builder) { + assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) && + "Expected equality predicates for masked type of icmps."); + // Handle Mask_NotAllZeros-BMask_Mixed cases. + // (icmp ne/eq (A & B), C) &/| (icmp eq/ne (A & D), E), or + // (icmp eq/ne (A & B), C) &/| (icmp ne/eq (A & D), E) + // which gets swapped to + // (icmp ne/eq (A & D), E) &/| (icmp eq/ne (A & B), C). + if (!IsAnd) { + LHSMask = conjugateICmpMask(LHSMask); + RHSMask = conjugateICmpMask(RHSMask); + } + if ((LHSMask & Mask_NotAllZeros) && (RHSMask & BMask_Mixed)) { + if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( + LHS, RHS, IsAnd, A, B, C, D, E, + PredL, PredR, Builder)) { + return V; + } + } else if ((LHSMask & BMask_Mixed) && (RHSMask & Mask_NotAllZeros)) { + if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( + RHS, LHS, IsAnd, A, D, E, B, C, + PredR, PredL, Builder)) { + return V; + } + } + return nullptr; +} + +/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// into a single (icmp(A & X) ==/!= Y). 
+static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, + llvm::InstCombiner::BuilderTy &Builder) { + Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; + ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); + Optional<std::pair<unsigned, unsigned>> MaskPair = + getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR); + if (!MaskPair) + return nullptr; + assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) && + "Expected equality predicates for masked type of icmps."); + unsigned LHSMask = MaskPair->first; + unsigned RHSMask = MaskPair->second; + unsigned Mask = LHSMask & RHSMask; + if (Mask == 0) { + // Even if the two sides don't share a common pattern, check if folding can + // still happen. + if (Value *V = foldLogOpOfMaskedICmpsAsymmetric( + LHS, RHS, IsAnd, A, B, C, D, E, PredL, PredR, LHSMask, RHSMask, + Builder)) + return V; + return nullptr; + } + + // In full generality: + // (icmp (A & B) Op C) | (icmp (A & D) Op E) + // == ![ (icmp (A & B) !Op C) & (icmp (A & D) !Op E) ] + // + // If the latter can be converted into (icmp (A & X) Op Y) then the former is + // equivalent to (icmp (A & X) !Op Y). + // + // Therefore, we can pretend for the rest of this function that we're dealing + // with the conjunction, provided we flip the sense of any comparisons (both + // input and output). + + // In most cases we're going to produce an EQ for the "&&" case. + ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE; + if (!IsAnd) { + // Convert the masking analysis into its equivalent with negated + // comparisons. + Mask = conjugateICmpMask(Mask); + } + + if (Mask & Mask_AllZeros) { + // (icmp eq (A & B), 0) & (icmp eq (A & D), 0) + // -> (icmp eq (A & (B|D)), 0) + Value *NewOr = Builder.CreateOr(B, D); + Value *NewAnd = Builder.CreateAnd(A, NewOr); + // We can't use C as zero because we might actually handle + // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // with B and D, having a single bit set. + Value *Zero = Constant::getNullValue(A->getType()); + return Builder.CreateICmp(NewCC, NewAnd, Zero); + } + if (Mask & BMask_AllOnes) { + // (icmp eq (A & B), B) & (icmp eq (A & D), D) + // -> (icmp eq (A & (B|D)), (B|D)) + Value *NewOr = Builder.CreateOr(B, D); + Value *NewAnd = Builder.CreateAnd(A, NewOr); + return Builder.CreateICmp(NewCC, NewAnd, NewOr); + } + if (Mask & AMask_AllOnes) { + // (icmp eq (A & B), A) & (icmp eq (A & D), A) + // -> (icmp eq (A & (B&D)), A) + Value *NewAnd1 = Builder.CreateAnd(B, D); + Value *NewAnd2 = Builder.CreateAnd(A, NewAnd1); + return Builder.CreateICmp(NewCC, NewAnd2, A); + } + + // Remaining cases assume at least that B and D are constant, and depend on + // their actual values. This isn't strictly necessary, just a "handle the + // easy cases for now" decision. + ConstantInt *BCst = dyn_cast<ConstantInt>(B); + if (!BCst) + return nullptr; + ConstantInt *DCst = dyn_cast<ConstantInt>(D); + if (!DCst) + return nullptr; + + if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) { + // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and + // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0) + // Only valid if one of the masks is a superset of the other (check "B&D" is + // the same as either B or D). 
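+    // For example (a sketch):
+    //   (icmp ne (A & 3), 0) & (icmp ne (A & 15), 0) --> (icmp ne (A & 3), 0)
+    // because (A & 3) != 0 already implies (A & 15) != 0.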
+ APInt NewMask = BCst->getValue() & DCst->getValue(); + + if (NewMask == BCst->getValue()) + return LHS; + else if (NewMask == DCst->getValue()) + return RHS; + } + + if (Mask & AMask_NotAllOnes) { + // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // -> (icmp ne (A & B), A) or (icmp ne (A & D), A) + // Only valid if one of the masks is a superset of the other (check "B|D" is + // the same as either B or D). + APInt NewMask = BCst->getValue() | DCst->getValue(); + + if (NewMask == BCst->getValue()) + return LHS; + else if (NewMask == DCst->getValue()) + return RHS; + } + + if (Mask & BMask_Mixed) { + // (icmp eq (A & B), C) & (icmp eq (A & D), E) + // We already know that B & C == C && D & E == E. + // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of + // C and E, which are shared by both the mask B and the mask D, don't + // contradict, then we can transform to + // -> (icmp eq (A & (B|D)), (C|E)) + // Currently, we only handle the case of B, C, D, and E being constant. + // We can't simply use C and E because we might actually handle + // (icmp ne (A & B), B) & (icmp eq (A & D), D) + // with B and D, having a single bit set. + ConstantInt *CCst = dyn_cast<ConstantInt>(C); + if (!CCst) + return nullptr; + ConstantInt *ECst = dyn_cast<ConstantInt>(E); + if (!ECst) + return nullptr; + if (PredL != NewCC) + CCst = cast<ConstantInt>(ConstantExpr::getXor(BCst, CCst)); + if (PredR != NewCC) + ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst)); + + // If there is a conflict, we should actually return a false for the + // whole construct. + if (((BCst->getValue() & DCst->getValue()) & + (CCst->getValue() ^ ECst->getValue())).getBoolValue()) + return ConstantInt::get(LHS->getType(), !IsAnd); + + Value *NewOr1 = Builder.CreateOr(B, D); + Value *NewOr2 = ConstantExpr::getOr(CCst, ECst); + Value *NewAnd = Builder.CreateAnd(A, NewOr1); + return Builder.CreateICmp(NewCC, NewAnd, NewOr2); + } + + return nullptr; +} + +/// Try to fold a signed range checked with lower bound 0 to an unsigned icmp. +/// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n +/// If \p Inverted is true then the check is for the inverted range, e.g. +/// (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n +Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, + bool Inverted) { + // Check the lower range comparison, e.g. x >= 0 + // InstCombine already ensured that if there is a constant it's on the RHS. + ConstantInt *RangeStart = dyn_cast<ConstantInt>(Cmp0->getOperand(1)); + if (!RangeStart) + return nullptr; + + ICmpInst::Predicate Pred0 = (Inverted ? Cmp0->getInversePredicate() : + Cmp0->getPredicate()); + + // Accept x > -1 or x >= 0 (after potentially inverting the predicate). + if (!((Pred0 == ICmpInst::ICMP_SGT && RangeStart->isMinusOne()) || + (Pred0 == ICmpInst::ICMP_SGE && RangeStart->isZero()))) + return nullptr; + + ICmpInst::Predicate Pred1 = (Inverted ? Cmp1->getInversePredicate() : + Cmp1->getPredicate()); + + Value *Input = Cmp0->getOperand(0); + Value *RangeEnd; + if (Cmp1->getOperand(0) == Input) { + // For the upper range compare we have: icmp x, n + RangeEnd = Cmp1->getOperand(1); + } else if (Cmp1->getOperand(1) == Input) { + // For the upper range compare we have: icmp n, x + RangeEnd = Cmp1->getOperand(0); + Pred1 = ICmpInst::getSwappedPredicate(Pred1); + } else { + return nullptr; + } + + // Check the upper range comparison, e.g. 
x < n + ICmpInst::Predicate NewPred; + switch (Pred1) { + case ICmpInst::ICMP_SLT: NewPred = ICmpInst::ICMP_ULT; break; + case ICmpInst::ICMP_SLE: NewPred = ICmpInst::ICMP_ULE; break; + default: return nullptr; + } + + // This simplification is only valid if the upper range is not negative. + KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1); + if (!Known.isNonNegative()) + return nullptr; + + if (Inverted) + NewPred = ICmpInst::getInversePredicate(NewPred); + + return Builder.CreateICmp(NewPred, Input, RangeEnd); +} + +static Value * +foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS, + bool JoinedByAnd, + InstCombiner::BuilderTy &Builder) { + Value *X = LHS->getOperand(0); + if (X != RHS->getOperand(0)) + return nullptr; + + const APInt *C1, *C2; + if (!match(LHS->getOperand(1), m_APInt(C1)) || + !match(RHS->getOperand(1), m_APInt(C2))) + return nullptr; + + // We only handle (X != C1 && X != C2) and (X == C1 || X == C2). + ICmpInst::Predicate Pred = LHS->getPredicate(); + if (Pred != RHS->getPredicate()) + return nullptr; + if (JoinedByAnd && Pred != ICmpInst::ICMP_NE) + return nullptr; + if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ) + return nullptr; + + // The larger unsigned constant goes on the right. + if (C1->ugt(*C2)) + std::swap(C1, C2); + + APInt Xor = *C1 ^ *C2; + if (Xor.isPowerOf2()) { + // If LHSC and RHSC differ by only one bit, then set that bit in X and + // compare against the larger constant: + // (X == C1 || X == C2) --> (X | (C1 ^ C2)) == C2 + // (X != C1 && X != C2) --> (X | (C1 ^ C2)) != C2 + // We choose an 'or' with a Pow2 constant rather than the inverse mask with + // 'and' because that may lead to smaller codegen from a smaller constant. + Value *Or = Builder.CreateOr(X, ConstantInt::get(X->getType(), Xor)); + return Builder.CreateICmp(Pred, Or, ConstantInt::get(X->getType(), *C2)); + } + + // Special case: get the ordering right when the values wrap around zero. + // Ie, we assumed the constants were unsigned when swapping earlier. + if (C1->isNullValue() && C2->isAllOnesValue()) + std::swap(C1, C2); + + if (*C1 == *C2 - 1) { + // (X == 13 || X == 14) --> X - 13 <=u 1 + // (X != 13 && X != 14) --> X - 13 >u 1 + // An 'add' is the canonical IR form, so favor that over a 'sub'. + Value *Add = Builder.CreateAdd(X, ConstantInt::get(X->getType(), -(*C1))); + auto NewPred = JoinedByAnd ? 
ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE; + return Builder.CreateICmp(NewPred, Add, ConstantInt::get(X->getType(), 1)); + } + + return nullptr; +} + +// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) +// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) +Value *InstCombiner::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS, + bool JoinedByAnd, + Instruction &CxtI) { + ICmpInst::Predicate Pred = LHS->getPredicate(); + if (Pred != RHS->getPredicate()) + return nullptr; + if (JoinedByAnd && Pred != ICmpInst::ICMP_NE) + return nullptr; + if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ) + return nullptr; + + // TODO support vector splats + ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1)); + ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1)); + if (!LHSC || !RHSC || !LHSC->isZero() || !RHSC->isZero()) + return nullptr; + + Value *A, *B, *C, *D; + if (match(LHS->getOperand(0), m_And(m_Value(A), m_Value(B))) && + match(RHS->getOperand(0), m_And(m_Value(C), m_Value(D)))) { + if (A == D || B == D) + std::swap(C, D); + if (B == C) + std::swap(A, B); + + if (A == C && + isKnownToBeAPowerOfTwo(B, false, 0, &CxtI) && + isKnownToBeAPowerOfTwo(D, false, 0, &CxtI)) { + Value *Mask = Builder.CreateOr(B, D); + Value *Masked = Builder.CreateAnd(A, Mask); + auto NewPred = JoinedByAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE; + return Builder.CreateICmp(NewPred, Masked, Mask); + } + } + + return nullptr; +} + +/// General pattern: +/// X & Y +/// +/// Where Y is checking that all the high bits (covered by a mask 4294967168) +/// are uniform, i.e. %arg & 4294967168 can be either 4294967168 or 0 +/// Pattern can be one of: +/// %t = add i32 %arg, 128 +/// %r = icmp ult i32 %t, 256 +/// Or +/// %t0 = shl i32 %arg, 24 +/// %t1 = ashr i32 %t0, 24 +/// %r = icmp eq i32 %t1, %arg +/// Or +/// %t0 = trunc i32 %arg to i8 +/// %t1 = sext i8 %t0 to i32 +/// %r = icmp eq i32 %t1, %arg +/// This pattern is a signed truncation check. +/// +/// And X is checking that some bit in that same mask is zero. +/// I.e. can be one of: +/// %r = icmp sgt i32 %arg, -1 +/// Or +/// %t = and i32 %arg, 2147483648 +/// %r = icmp eq i32 %t, 0 +/// +/// Since we are checking that all the bits in that mask are the same, +/// and a particular bit is zero, what we are really checking is that all the +/// masked bits are zero. +/// So this should be transformed to: +/// %r = icmp ult i32 %arg, 128 +static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1, + Instruction &CxtI, + InstCombiner::BuilderTy &Builder) { + assert(CxtI.getOpcode() == Instruction::And); + + // Match icmp ult (add %arg, C01), C1 (C1 == C01 << 1; powers of two) + auto tryToMatchSignedTruncationCheck = [](ICmpInst *ICmp, Value *&X, + APInt &SignBitMask) -> bool { + CmpInst::Predicate Pred; + const APInt *I01, *I1; // powers of two; I1 == I01 << 1 + if (!(match(ICmp, + m_ICmp(Pred, m_Add(m_Value(X), m_Power2(I01)), m_Power2(I1))) && + Pred == ICmpInst::ICMP_ULT && I1->ugt(*I01) && I01->shl(1) == *I1)) + return false; + // Which bit is the new sign bit as per the 'signed truncation' pattern? + SignBitMask = *I01; + return true; + }; + + // One icmp needs to be 'signed truncation check'. + // We need to match this first, else we will mismatch commutative cases. 
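+  // A worked instance of the overall fold (a sketch, using i8 for brevity):
+  //   %t  = add i8 %arg, 16
+  //   %c0 = icmp ult i8 %t, 32      ; the bits covered by mask 0xF0 are uniform
+  //   %c1 = icmp sgt i8 %arg, -1    ; the sign bit of %arg is zero
+  //   and i1 %c0, %c1               ; --> icmp ult i8 %arg, 16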
+ Value *X1; + APInt HighestBit; + ICmpInst *OtherICmp; + if (tryToMatchSignedTruncationCheck(ICmp1, X1, HighestBit)) + OtherICmp = ICmp0; + else if (tryToMatchSignedTruncationCheck(ICmp0, X1, HighestBit)) + OtherICmp = ICmp1; + else + return nullptr; + + assert(HighestBit.isPowerOf2() && "expected to be power of two (non-zero)"); + + // Try to match/decompose into: icmp eq (X & Mask), 0 + auto tryToDecompose = [](ICmpInst *ICmp, Value *&X, + APInt &UnsetBitsMask) -> bool { + CmpInst::Predicate Pred = ICmp->getPredicate(); + // Can it be decomposed into icmp eq (X & Mask), 0 ? + if (llvm::decomposeBitTestICmp(ICmp->getOperand(0), ICmp->getOperand(1), + Pred, X, UnsetBitsMask, + /*LookThroughTrunc=*/false) && + Pred == ICmpInst::ICMP_EQ) + return true; + // Is it icmp eq (X & Mask), 0 already? + const APInt *Mask; + if (match(ICmp, m_ICmp(Pred, m_And(m_Value(X), m_APInt(Mask)), m_Zero())) && + Pred == ICmpInst::ICMP_EQ) { + UnsetBitsMask = *Mask; + return true; + } + return false; + }; + + // And the other icmp needs to be decomposable into a bit test. + Value *X0; + APInt UnsetBitsMask; + if (!tryToDecompose(OtherICmp, X0, UnsetBitsMask)) + return nullptr; + + assert(!UnsetBitsMask.isNullValue() && "empty mask makes no sense."); + + // Are they working on the same value? + Value *X; + if (X1 == X0) { + // Ok as is. + X = X1; + } else if (match(X0, m_Trunc(m_Specific(X1)))) { + UnsetBitsMask = UnsetBitsMask.zext(X1->getType()->getScalarSizeInBits()); + X = X1; + } else + return nullptr; + + // So which bits should be uniform as per the 'signed truncation check'? + // (all the bits starting with (i.e. including) HighestBit) + APInt SignBitsMask = ~(HighestBit - 1U); + + // UnsetBitsMask must have some common bits with SignBitsMask, + if (!UnsetBitsMask.intersects(SignBitsMask)) + return nullptr; + + // Does UnsetBitsMask contain any bits outside of SignBitsMask? + if (!UnsetBitsMask.isSubsetOf(SignBitsMask)) { + APInt OtherHighestBit = (~UnsetBitsMask) + 1U; + if (!OtherHighestBit.isPowerOf2()) + return nullptr; + HighestBit = APIntOps::umin(HighestBit, OtherHighestBit); + } + // Else, if it does not, then all is ok as-is. + + // %r = icmp ult %X, SignBit + return Builder.CreateICmpULT(X, ConstantInt::get(X->getType(), HighestBit), + CxtI.getName() + ".simplified"); +} + +/// Reduce a pair of compares that check if a value has exactly 1 bit set. +static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, + InstCombiner::BuilderTy &Builder) { + // Handle 'and' / 'or' commutation: make the equality check the first operand. 
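+  // e.g. both commuted forms below are handled (a sketch):
+  //   (ctpop(X) u< 2) && (X != 0)  or  (X != 0) && (ctpop(X) u< 2)
+  //     --> ctpop(X) == 1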
+ if (JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_NE) + std::swap(Cmp0, Cmp1); + else if (!JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_EQ) + std::swap(Cmp0, Cmp1); + + // (X != 0) && (ctpop(X) u< 2) --> ctpop(X) == 1 + CmpInst::Predicate Pred0, Pred1; + Value *X; + if (JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) && + match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)), + m_SpecificInt(2))) && + Pred0 == ICmpInst::ICMP_NE && Pred1 == ICmpInst::ICMP_ULT) { + Value *CtPop = Cmp1->getOperand(0); + return Builder.CreateICmpEQ(CtPop, ConstantInt::get(CtPop->getType(), 1)); + } + // (X == 0) || (ctpop(X) u> 1) --> ctpop(X) != 1 + if (!JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) && + match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)), + m_SpecificInt(1))) && + Pred0 == ICmpInst::ICMP_EQ && Pred1 == ICmpInst::ICMP_UGT) { + Value *CtPop = Cmp1->getOperand(0); + return Builder.CreateICmpNE(CtPop, ConstantInt::get(CtPop->getType(), 1)); + } + return nullptr; +} + +/// Commuted variants are assumed to be handled by calling this function again +/// with the parameters swapped. +static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp, + ICmpInst *UnsignedICmp, bool IsAnd, + const SimplifyQuery &Q, + InstCombiner::BuilderTy &Builder) { + Value *ZeroCmpOp; + ICmpInst::Predicate EqPred; + if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(ZeroCmpOp), m_Zero())) || + !ICmpInst::isEquality(EqPred)) + return nullptr; + + auto IsKnownNonZero = [&](Value *V) { + return isKnownNonZero(V, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); + }; + + ICmpInst::Predicate UnsignedPred; + + Value *A, *B; + if (match(UnsignedICmp, + m_c_ICmp(UnsignedPred, m_Specific(ZeroCmpOp), m_Value(A))) && + match(ZeroCmpOp, m_c_Add(m_Specific(A), m_Value(B))) && + (ZeroICmp->hasOneUse() || UnsignedICmp->hasOneUse())) { + if (UnsignedICmp->getOperand(0) != ZeroCmpOp) + UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); + + auto GetKnownNonZeroAndOther = [&](Value *&NonZero, Value *&Other) { + if (!IsKnownNonZero(NonZero)) + std::swap(NonZero, Other); + return IsKnownNonZero(NonZero); + }; + + // Given ZeroCmpOp = (A + B) + // ZeroCmpOp <= A && ZeroCmpOp != 0 --> (0-B) < A + // ZeroCmpOp > A || ZeroCmpOp == 0 --> (0-B) >= A + // + // ZeroCmpOp < A && ZeroCmpOp != 0 --> (0-X) < Y iff + // ZeroCmpOp >= A || ZeroCmpOp == 0 --> (0-X) >= Y iff + // with X being the value (A/B) that is known to be non-zero, + // and Y being remaining value. 
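+    // e.g. the 'ule' case in i8 (a sketch):
+    //   (icmp ule i8 (add i8 %a, 200), %a) && (icmp ne i8 (add i8 %a, 200), 0)
+    //     --> icmp ult i8 56, %a     ; 56 == 0 - 200 (mod 256)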
+ if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE && + IsAnd) + return Builder.CreateICmpULT(Builder.CreateNeg(B), A); + if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE && + IsAnd && GetKnownNonZeroAndOther(B, A)) + return Builder.CreateICmpULT(Builder.CreateNeg(B), A); + if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ && + !IsAnd) + return Builder.CreateICmpUGE(Builder.CreateNeg(B), A); + if (UnsignedPred == ICmpInst::ICMP_UGE && EqPred == ICmpInst::ICMP_EQ && + !IsAnd && GetKnownNonZeroAndOther(B, A)) + return Builder.CreateICmpUGE(Builder.CreateNeg(B), A); + } + + Value *Base, *Offset; + if (!match(ZeroCmpOp, m_Sub(m_Value(Base), m_Value(Offset)))) + return nullptr; + + if (!match(UnsignedICmp, + m_c_ICmp(UnsignedPred, m_Specific(Base), m_Specific(Offset))) || + !ICmpInst::isUnsigned(UnsignedPred)) + return nullptr; + if (UnsignedICmp->getOperand(0) != Base) + UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); + + // Base >=/> Offset && (Base - Offset) != 0 <--> Base > Offset + // (no overflow and not null) + if ((UnsignedPred == ICmpInst::ICMP_UGE || + UnsignedPred == ICmpInst::ICMP_UGT) && + EqPred == ICmpInst::ICMP_NE && IsAnd) + return Builder.CreateICmpUGT(Base, Offset); + + // Base <=/< Offset || (Base - Offset) == 0 <--> Base <= Offset + // (overflow or null) + if ((UnsignedPred == ICmpInst::ICMP_ULE || + UnsignedPred == ICmpInst::ICMP_ULT) && + EqPred == ICmpInst::ICMP_EQ && !IsAnd) + return Builder.CreateICmpULE(Base, Offset); + + // Base <= Offset && (Base - Offset) != 0 --> Base < Offset + if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE && + IsAnd) + return Builder.CreateICmpULT(Base, Offset); + + // Base > Offset || (Base - Offset) == 0 --> Base >= Offset + if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ && + !IsAnd) + return Builder.CreateICmpUGE(Base, Offset); + + return nullptr; +} + +/// Fold (icmp)&(icmp) if possible. +Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, + Instruction &CxtI) { + const SimplifyQuery Q = SQ.getWithInstruction(&CxtI); + + // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) + // if K1 and K2 are a one-bit mask. + if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, true, CxtI)) + return V; + + ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); + + // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B) + if (predicatesFoldable(PredL, PredR)) { + if (LHS->getOperand(0) == RHS->getOperand(1) && + LHS->getOperand(1) == RHS->getOperand(0)) + LHS->swapOperands(); + if (LHS->getOperand(0) == RHS->getOperand(0) && + LHS->getOperand(1) == RHS->getOperand(1)) { + Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); + unsigned Code = getICmpCode(LHS) & getICmpCode(RHS); + bool IsSigned = LHS->isSigned() || RHS->isSigned(); + return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder); + } + } + + // handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E) + if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder)) + return V; + + // E.g. (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n + if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/false)) + return V; + + // E.g. 
(icmp slt x, n) & (icmp sge x, 0) --> icmp ult x, n + if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/false)) + return V; + + if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, true, Builder)) + return V; + + if (Value *V = foldSignedTruncationCheck(LHS, RHS, CxtI, Builder)) + return V; + + if (Value *V = foldIsPowerOf2(LHS, RHS, true /* JoinedByAnd */, Builder)) + return V; + + if (Value *X = + foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/true, Q, Builder)) + return X; + if (Value *X = + foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/true, Q, Builder)) + return X; + + // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2). + Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0); + ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1)); + ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1)); + if (!LHSC || !RHSC) + return nullptr; + + if (LHSC == RHSC && PredL == PredR) { + // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C) + // where C is a power of 2 or + // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0) + if ((PredL == ICmpInst::ICMP_ULT && LHSC->getValue().isPowerOf2()) || + (PredL == ICmpInst::ICMP_EQ && LHSC->isZero())) { + Value *NewOr = Builder.CreateOr(LHS0, RHS0); + return Builder.CreateICmp(PredL, NewOr, LHSC); + } + } + + // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2 + // where CMAX is the all ones value for the truncated type, + // iff the lower bits of C2 and CA are zero. + if (PredL == ICmpInst::ICMP_EQ && PredL == PredR && LHS->hasOneUse() && + RHS->hasOneUse()) { + Value *V; + ConstantInt *AndC, *SmallC = nullptr, *BigC = nullptr; + + // (trunc x) == C1 & (and x, CA) == C2 + // (and x, CA) == C2 & (trunc x) == C1 + if (match(RHS0, m_Trunc(m_Value(V))) && + match(LHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) { + SmallC = RHSC; + BigC = LHSC; + } else if (match(LHS0, m_Trunc(m_Value(V))) && + match(RHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) { + SmallC = LHSC; + BigC = RHSC; + } + + if (SmallC && BigC) { + unsigned BigBitSize = BigC->getType()->getBitWidth(); + unsigned SmallBitSize = SmallC->getType()->getBitWidth(); + + // Check that the low bits are zero. + APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize); + if ((Low & AndC->getValue()).isNullValue() && + (Low & BigC->getValue()).isNullValue()) { + Value *NewAnd = Builder.CreateAnd(V, Low | AndC->getValue()); + APInt N = SmallC->getValue().zext(BigBitSize) | BigC->getValue(); + Value *NewVal = ConstantInt::get(AndC->getType()->getContext(), N); + return Builder.CreateICmp(PredL, NewAnd, NewVal); + } + } + } + + // From here on, we only handle: + // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler. + if (LHS0 != RHS0) + return nullptr; + + // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere. + if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE || + PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE || + PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE || + PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE) + return nullptr; + + // We can't fold (ugt x, C) & (sgt x, C2). + if (!predicatesFoldable(PredL, PredR)) + return nullptr; + + // Ensure that the larger constant is on the RHS. 
+ bool ShouldSwap; + if (CmpInst::isSigned(PredL) || + (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR))) + ShouldSwap = LHSC->getValue().sgt(RHSC->getValue()); + else + ShouldSwap = LHSC->getValue().ugt(RHSC->getValue()); + + if (ShouldSwap) { + std::swap(LHS, RHS); + std::swap(LHSC, RHSC); + std::swap(PredL, PredR); + } + + // At this point, we know we have two icmp instructions + // comparing a value against two constants and and'ing the result + // together. Because of the above check, we know that we only have + // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know + // (from the icmp folding check above), that the two constants + // are not equal and that the larger constant is on the RHS + assert(LHSC != RHSC && "Compares not folded above?"); + + switch (PredL) { + default: + llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_NE: + switch (PredR) { + default: + llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_ULT: + // (X != 13 & X u< 14) -> X < 13 + if (LHSC->getValue() == (RHSC->getValue() - 1)) + return Builder.CreateICmpULT(LHS0, LHSC); + if (LHSC->isZero()) // (X != 0 & X u< C) -> X-1 u< C-1 + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + false, true); + break; // (X != 13 & X u< 15) -> no change + case ICmpInst::ICMP_SLT: + // (X != 13 & X s< 14) -> X < 13 + if (LHSC->getValue() == (RHSC->getValue() - 1)) + return Builder.CreateICmpSLT(LHS0, LHSC); + // (X != INT_MIN & X s< C) -> X-(INT_MIN+1) u< (C-(INT_MIN+1)) + if (LHSC->isMinValue(true)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + true, true); + break; // (X != 13 & X s< 15) -> no change + case ICmpInst::ICMP_NE: + // Potential folds for this case should already be handled. + break; + } + break; + case ICmpInst::ICMP_UGT: + switch (PredR) { + default: + llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_NE: + // (X u> 13 & X != 14) -> X u> 14 + if (RHSC->getValue() == (LHSC->getValue() + 1)) + return Builder.CreateICmp(PredL, LHS0, RHSC); + // X u> C & X != UINT_MAX -> (X-(C+1)) u< UINT_MAX-(C+1) + if (RHSC->isMaxValue(false)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + false, true); + break; // (X u> 13 & X != 15) -> no change + case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) u< 1 + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + false, true); + } + break; + case ICmpInst::ICMP_SGT: + switch (PredR) { + default: + llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_NE: + // (X s> 13 & X != 14) -> X s> 14 + if (RHSC->getValue() == (LHSC->getValue() + 1)) + return Builder.CreateICmp(PredL, LHS0, RHSC); + // X s> C & X != INT_MAX -> (X-(C+1)) u< INT_MAX-(C+1) + if (RHSC->isMaxValue(true)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + true, true); + break; // (X s> 13 & X != 15) -> no change + case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) u< 1 + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true, + true); + } + break; + } + + return nullptr; +} + +Value *InstCombiner::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd) { + Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1); + Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1); + FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); + + if (LHS0 == RHS1 && RHS0 == LHS1) { + // Swap RHS operands to match LHS. 
+    PredR = FCmpInst::getSwappedPredicate(PredR);
+    std::swap(RHS0, RHS1);
+  }
+
+  // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y).
+  // Suppose the relation between x and y is R, where R is one of
+  // U(1000), L(0100), G(0010) or E(0001), and CC0 and CC1 are the bitmasks for
+  // testing the desired relations.
+  //
+  // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
+  //    bool(R & CC0) && bool(R & CC1)
+  //  = bool((R & CC0) & (R & CC1))
+  //  = bool(R & (CC0 & CC1)) <= by re-association, commutation, and idempotency
+  //
+  // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
+  //    bool(R & CC0) || bool(R & CC1)
+  //  = bool((R & CC0) | (R & CC1))
+  //  = bool(R & (CC0 | CC1)) <= by reversed distribution (contribution? ;)
+  if (LHS0 == RHS0 && LHS1 == RHS1) {
+    unsigned FCmpCodeL = getFCmpCode(PredL);
+    unsigned FCmpCodeR = getFCmpCode(PredR);
+    unsigned NewPred = IsAnd ? FCmpCodeL & FCmpCodeR : FCmpCodeL | FCmpCodeR;
+    return getFCmpValue(NewPred, LHS0, LHS1, Builder);
+  }
+
+  if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) ||
+      (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) {
+    if (LHS0->getType() != RHS0->getType())
+      return nullptr;
+
+    // FCmp canonicalization ensures that (fcmp ord/uno X, X) and
+    // (fcmp ord/uno X, C) will be transformed to (fcmp X, +0.0).
+    if (match(LHS1, m_PosZeroFP()) && match(RHS1, m_PosZeroFP()))
+      // Ignore the constants because they are obviously not NANs:
+      // (fcmp ord x, 0.0) & (fcmp ord y, 0.0) -> (fcmp ord x, y)
+      // (fcmp uno x, 0.0) | (fcmp uno y, 0.0) -> (fcmp uno x, y)
+      return Builder.CreateFCmp(PredL, LHS0, RHS0);
+  }
+
+  return nullptr;
+}
+
+/// This is a limited reassociation for a special case (see above) where we are
+/// checking if two values are either both NAN (unordered) or not-NAN (ordered).
+/// This could be handled more generally in '-reassociation', but it seems like
+/// an unlikely pattern for a large number of logic ops and fcmps.
+static Instruction *reassociateFCmps(BinaryOperator &BO,
+                                     InstCombiner::BuilderTy &Builder) {
+  Instruction::BinaryOps Opcode = BO.getOpcode();
+  assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
+         "Expecting and/or op for fcmp transform");
+
+  // There are 4 commuted variants of the pattern. Canonicalize operands of this
+  // logic op so an fcmp is operand 0 and a matching logic op is operand 1.
+  Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1), *X;
+  FCmpInst::Predicate Pred;
+  if (match(Op1, m_FCmp(Pred, m_Value(), m_AnyZeroFP())))
+    std::swap(Op0, Op1);
+
+  // Match inner binop and the predicate for combining 2 NAN checks into 1.
+  BinaryOperator *BO1;
+  FCmpInst::Predicate NanPred = Opcode == Instruction::And ? FCmpInst::FCMP_ORD
+                                                           : FCmpInst::FCMP_UNO;
+  if (!match(Op0, m_FCmp(Pred, m_Value(X), m_AnyZeroFP())) || Pred != NanPred ||
+      !match(Op1, m_BinOp(BO1)) || BO1->getOpcode() != Opcode)
+    return nullptr;
+
+  // The inner logic op must have a matching fcmp operand.
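+  // As a concrete sketch of the overall reassociation (hypothetical IR, with
+  // %z standing for the extra logic operand):
+  //   %ordx = fcmp ord float %x, 0.0
+  //   %ordy = fcmp ord float %y, 0.0
+  //   %t    = and i1 %ordy, %z
+  //   %r    = and i1 %ordx, %t
+  // is expected to become
+  //   %ordxy = fcmp ord float %x, %y
+  //   %r     = and i1 %ordxy, %z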
+ Value *BO10 = BO1->getOperand(0), *BO11 = BO1->getOperand(1), *Y; + if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) || + Pred != NanPred || X->getType() != Y->getType()) + std::swap(BO10, BO11); + + if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) || + Pred != NanPred || X->getType() != Y->getType()) + return nullptr; + + // and (fcmp ord X, 0), (and (fcmp ord Y, 0), Z) --> and (fcmp ord X, Y), Z + // or (fcmp uno X, 0), (or (fcmp uno Y, 0), Z) --> or (fcmp uno X, Y), Z + Value *NewFCmp = Builder.CreateFCmp(Pred, X, Y); + if (auto *NewFCmpInst = dyn_cast<FCmpInst>(NewFCmp)) { + // Intersect FMF from the 2 source fcmps. + NewFCmpInst->copyIRFlags(Op0); + NewFCmpInst->andIRFlags(BO10); + } + return BinaryOperator::Create(Opcode, NewFCmp, BO11); +} + +/// Match De Morgan's Laws: +/// (~A & ~B) == (~(A | B)) +/// (~A | ~B) == (~(A & B)) +static Instruction *matchDeMorgansLaws(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + auto Opcode = I.getOpcode(); + assert((Opcode == Instruction::And || Opcode == Instruction::Or) && + "Trying to match De Morgan's Laws with something other than and/or"); + + // Flip the logic operation. + Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And; + + Value *A, *B; + if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) && + match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) && + !isFreeToInvert(A, A->hasOneUse()) && + !isFreeToInvert(B, B->hasOneUse())) { + Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan"); + return BinaryOperator::CreateNot(AndOr); + } + + return nullptr; +} + +bool InstCombiner::shouldOptimizeCast(CastInst *CI) { + Value *CastSrc = CI->getOperand(0); + + // Noop casts and casts of constants should be eliminated trivially. + if (CI->getSrcTy() == CI->getDestTy() || isa<Constant>(CastSrc)) + return false; + + // If this cast is paired with another cast that can be eliminated, we prefer + // to have it eliminated. + if (const auto *PrecedingCI = dyn_cast<CastInst>(CastSrc)) + if (isEliminableCastPair(PrecedingCI, CI)) + return false; + + return true; +} + +/// Fold {and,or,xor} (cast X), C. +static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast, + InstCombiner::BuilderTy &Builder) { + Constant *C = dyn_cast<Constant>(Logic.getOperand(1)); + if (!C) + return nullptr; + + auto LogicOpc = Logic.getOpcode(); + Type *DestTy = Logic.getType(); + Type *SrcTy = Cast->getSrcTy(); + + // Move the logic operation ahead of a zext or sext if the constant is + // unchanged in the smaller source type. Performing the logic in a smaller + // type may provide more information to later folds, and the smaller logic + // instruction may be cheaper (particularly in the case of vectors). + Value *X; + if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) { + Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy); + Constant *ZextTruncC = ConstantExpr::getZExt(TruncC, DestTy); + if (ZextTruncC == C) { + // LogicOpc (zext X), C --> zext (LogicOpc X, C) + Value *NewOp = Builder.CreateBinOp(LogicOpc, X, TruncC); + return new ZExtInst(NewOp, DestTy); + } + } + + if (match(Cast, m_OneUse(m_SExt(m_Value(X))))) { + Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy); + Constant *SextTruncC = ConstantExpr::getSExt(TruncC, DestTy); + if (SextTruncC == C) { + // LogicOpc (sext X), C --> sext (LogicOpc X, C) + Value *NewOp = Builder.CreateBinOp(LogicOpc, X, TruncC); + return new SExtInst(NewOp, DestTy); + } + } + + return nullptr; +} + +/// Fold {and,or,xor} (cast X), Y. 
+Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) { + auto LogicOpc = I.getOpcode(); + assert(I.isBitwiseLogicOp() && "Unexpected opcode for bitwise logic folding"); + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + CastInst *Cast0 = dyn_cast<CastInst>(Op0); + if (!Cast0) + return nullptr; + + // This must be a cast from an integer or integer vector source type to allow + // transformation of the logic operation to the source type. + Type *DestTy = I.getType(); + Type *SrcTy = Cast0->getSrcTy(); + if (!SrcTy->isIntOrIntVectorTy()) + return nullptr; + + if (Instruction *Ret = foldLogicCastConstant(I, Cast0, Builder)) + return Ret; + + CastInst *Cast1 = dyn_cast<CastInst>(Op1); + if (!Cast1) + return nullptr; + + // Both operands of the logic operation are casts. The casts must be of the + // same type for reduction. + auto CastOpcode = Cast0->getOpcode(); + if (CastOpcode != Cast1->getOpcode() || SrcTy != Cast1->getSrcTy()) + return nullptr; + + Value *Cast0Src = Cast0->getOperand(0); + Value *Cast1Src = Cast1->getOperand(0); + + // fold logic(cast(A), cast(B)) -> cast(logic(A, B)) + if (shouldOptimizeCast(Cast0) && shouldOptimizeCast(Cast1)) { + Value *NewOp = Builder.CreateBinOp(LogicOpc, Cast0Src, Cast1Src, + I.getName()); + return CastInst::Create(CastOpcode, NewOp, DestTy); + } + + // For now, only 'and'/'or' have optimizations after this. + if (LogicOpc == Instruction::Xor) + return nullptr; + + // If this is logic(cast(icmp), cast(icmp)), try to fold this even if the + // cast is otherwise not optimizable. This happens for vector sexts. + ICmpInst *ICmp0 = dyn_cast<ICmpInst>(Cast0Src); + ICmpInst *ICmp1 = dyn_cast<ICmpInst>(Cast1Src); + if (ICmp0 && ICmp1) { + Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1, I) + : foldOrOfICmps(ICmp0, ICmp1, I); + if (Res) + return CastInst::Create(CastOpcode, Res, DestTy); + return nullptr; + } + + // If this is logic(cast(fcmp), cast(fcmp)), try to fold this even if the + // cast is otherwise not optimizable. This happens for vector sexts. + FCmpInst *FCmp0 = dyn_cast<FCmpInst>(Cast0Src); + FCmpInst *FCmp1 = dyn_cast<FCmpInst>(Cast1Src); + if (FCmp0 && FCmp1) + if (Value *R = foldLogicOfFCmps(FCmp0, FCmp1, LogicOpc == Instruction::And)) + return CastInst::Create(CastOpcode, R, DestTy); + + return nullptr; +} + +static Instruction *foldAndToXor(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + assert(I.getOpcode() == Instruction::And); + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + Value *A, *B; + + // Operand complexity canonicalization guarantees that the 'or' is Op0. + // (A | B) & ~(A & B) --> A ^ B + // (A | B) & ~(B & A) --> A ^ B + if (match(&I, m_BinOp(m_Or(m_Value(A), m_Value(B)), + m_Not(m_c_And(m_Deferred(A), m_Deferred(B)))))) + return BinaryOperator::CreateXor(A, B); + + // (A | ~B) & (~A | B) --> ~(A ^ B) + // (A | ~B) & (B | ~A) --> ~(A ^ B) + // (~B | A) & (~A | B) --> ~(A ^ B) + // (~B | A) & (B | ~A) --> ~(A ^ B) + if (Op0->hasOneUse() || Op1->hasOneUse()) + if (match(&I, m_BinOp(m_c_Or(m_Value(A), m_Not(m_Value(B))), + m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B))))) + return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); + + return nullptr; +} + +static Instruction *foldOrToXor(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + assert(I.getOpcode() == Instruction::Or); + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + Value *A, *B; + + // Operand complexity canonicalization guarantees that the 'and' is Op0. 
+ // (A & B) | ~(A | B) --> ~(A ^ B) + // (A & B) | ~(B | A) --> ~(A ^ B) + if (Op0->hasOneUse() || Op1->hasOneUse()) + if (match(Op0, m_And(m_Value(A), m_Value(B))) && + match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) + return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); + + // (A & ~B) | (~A & B) --> A ^ B + // (A & ~B) | (B & ~A) --> A ^ B + // (~B & A) | (~A & B) --> A ^ B + // (~B & A) | (B & ~A) --> A ^ B + if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))) + return BinaryOperator::CreateXor(A, B); + + return nullptr; +} + +/// Return true if a constant shift amount is always less than the specified +/// bit-width. If not, the shift could create poison in the narrower type. +static bool canNarrowShiftAmt(Constant *C, unsigned BitWidth) { + if (auto *ScalarC = dyn_cast<ConstantInt>(C)) + return ScalarC->getZExtValue() < BitWidth; + + if (C->getType()->isVectorTy()) { + // Check each element of a constant vector. + unsigned NumElts = C->getType()->getVectorNumElements(); + for (unsigned i = 0; i != NumElts; ++i) { + Constant *Elt = C->getAggregateElement(i); + if (!Elt) + return false; + if (isa<UndefValue>(Elt)) + continue; + auto *CI = dyn_cast<ConstantInt>(Elt); + if (!CI || CI->getZExtValue() >= BitWidth) + return false; + } + return true; + } + + // The constant is a constant expression or unknown. + return false; +} + +/// Try to use narrower ops (sink zext ops) for an 'and' with binop operand and +/// a common zext operand: and (binop (zext X), C), (zext X). +Instruction *InstCombiner::narrowMaskedBinOp(BinaryOperator &And) { + // This transform could also apply to {or, and, xor}, but there are better + // folds for those cases, so we don't expect those patterns here. AShr is not + // handled because it should always be transformed to LShr in this sequence. + // The subtract transform is different because it has a constant on the left. + // Add/mul commute the constant to RHS; sub with constant RHS becomes add. + Value *Op0 = And.getOperand(0), *Op1 = And.getOperand(1); + Constant *C; + if (!match(Op0, m_OneUse(m_Add(m_Specific(Op1), m_Constant(C)))) && + !match(Op0, m_OneUse(m_Mul(m_Specific(Op1), m_Constant(C)))) && + !match(Op0, m_OneUse(m_LShr(m_Specific(Op1), m_Constant(C)))) && + !match(Op0, m_OneUse(m_Shl(m_Specific(Op1), m_Constant(C)))) && + !match(Op0, m_OneUse(m_Sub(m_Constant(C), m_Specific(Op1))))) + return nullptr; + + Value *X; + if (!match(Op1, m_ZExt(m_Value(X))) || Op1->hasNUsesOrMore(3)) + return nullptr; + + Type *Ty = And.getType(); + if (!isa<VectorType>(Ty) && !shouldChangeType(Ty, X->getType())) + return nullptr; + + // If we're narrowing a shift, the shift amount must be safe (less than the + // width) in the narrower type. If the shift amount is greater, instsimplify + // usually handles that case, but we can't guarantee/assert it. + Instruction::BinaryOps Opc = cast<BinaryOperator>(Op0)->getOpcode(); + if (Opc == Instruction::LShr || Opc == Instruction::Shl) + if (!canNarrowShiftAmt(C, X->getType()->getScalarSizeInBits())) + return nullptr; + + // and (sub C, (zext X)), (zext X) --> zext (and (sub C', X), X) + // and (binop (zext X), C), (zext X) --> zext (and (binop X, C'), X) + Value *NewC = ConstantExpr::getTrunc(C, X->getType()); + Value *NewBO = Opc == Instruction::Sub ? 
Builder.CreateBinOp(Opc, NewC, X) + : Builder.CreateBinOp(Opc, X, NewC); + return new ZExtInst(Builder.CreateAnd(NewBO, X), Ty); +} + +// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches +// here. We should standardize that construct where it is needed or choose some +// other way to ensure that commutated variants of patterns are not missed. +Instruction *InstCombiner::visitAnd(BinaryOperator &I) { + if (Value *V = SimplifyAndInst(I.getOperand(0), I.getOperand(1), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (SimplifyAssociativeOrCommutative(I)) + return &I; + + if (Instruction *X = foldVectorBinop(I)) + return X; + + // See if we can simplify any instructions used by the instruction whose sole + // purpose is to compute bits we don't care about. + if (SimplifyDemandedInstructionBits(I)) + return &I; + + // Do this before using distributive laws to catch simple and/or/not patterns. + if (Instruction *Xor = foldAndToXor(I, Builder)) + return Xor; + + // (A|B)&(A|C) -> A|(B&C) etc + if (Value *V = SimplifyUsingDistributiveLaws(I)) + return replaceInstUsesWith(I, V); + + if (Value *V = SimplifyBSwap(I, Builder)) + return replaceInstUsesWith(I, V); + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + const APInt *C; + if (match(Op1, m_APInt(C))) { + Value *X, *Y; + if (match(Op0, m_OneUse(m_LogicalShift(m_One(), m_Value(X)))) && + C->isOneValue()) { + // (1 << X) & 1 --> zext(X == 0) + // (1 >> X) & 1 --> zext(X == 0) + Value *IsZero = Builder.CreateICmpEQ(X, ConstantInt::get(I.getType(), 0)); + return new ZExtInst(IsZero, I.getType()); + } + + const APInt *XorC; + if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_APInt(XorC))))) { + // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2) + Constant *NewC = ConstantInt::get(I.getType(), *C & *XorC); + Value *And = Builder.CreateAnd(X, Op1); + And->takeName(Op0); + return BinaryOperator::CreateXor(And, NewC); + } + + const APInt *OrC; + if (match(Op0, m_OneUse(m_Or(m_Value(X), m_APInt(OrC))))) { + // (X | C1) & C2 --> (X & C2^(C1&C2)) | (C1&C2) + // NOTE: This reduces the number of bits set in the & mask, which + // can expose opportunities for store narrowing for scalars. + // NOTE: SimplifyDemandedBits should have already removed bits from C1 + // that aren't set in C2. Meaning we can replace (C1&C2) with C1 in + // above, but this feels safer. + APInt Together = *C & *OrC; + Value *And = Builder.CreateAnd(X, ConstantInt::get(I.getType(), + Together ^ *C)); + And->takeName(Op0); + return BinaryOperator::CreateOr(And, ConstantInt::get(I.getType(), + Together)); + } + + // If the mask is only needed on one incoming arm, push the 'and' op up. + if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_Value(Y)))) || + match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) { + APInt NotAndMask(~(*C)); + BinaryOperator::BinaryOps BinOp = cast<BinaryOperator>(Op0)->getOpcode(); + if (MaskedValueIsZero(X, NotAndMask, 0, &I)) { + // Not masking anything out for the LHS, move mask to RHS. + // and ({x}or X, Y), C --> {x}or X, (and Y, C) + Value *NewRHS = Builder.CreateAnd(Y, Op1, Y->getName() + ".masked"); + return BinaryOperator::Create(BinOp, X, NewRHS); + } + if (!isa<Constant>(Y) && MaskedValueIsZero(Y, NotAndMask, 0, &I)) { + // Not masking anything out for the RHS, move mask to LHS. 
+        // and ({x}or X, Y), C --> {x}or (and X, C), Y
+        Value *NewLHS = Builder.CreateAnd(X, Op1, X->getName() + ".masked");
+        return BinaryOperator::Create(BinOp, NewLHS, Y);
+      }
+    }
+
+  }
+
+  if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
+    const APInt &AndRHSMask = AndRHS->getValue();
+
+    // Optimize a variety of ((val OP C1) & C2) combinations...
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+      // ((C1 OP zext(X)) & C2) -> zext((C1 OP X) & C2) if C2 fits in the
+      // bitwidth of X and OP behaves well when given trunc(C1) and X.
+      // TODO: Do this for vectors by using m_APInt instead of m_ConstantInt.
+      switch (Op0I->getOpcode()) {
+      default:
+        break;
+      case Instruction::Xor:
+      case Instruction::Or:
+      case Instruction::Mul:
+      case Instruction::Add:
+      case Instruction::Sub:
+        Value *X;
+        ConstantInt *C1;
+        // TODO: The one use restrictions could be relaxed a little if the AND
+        // is going to be removed.
+        if (match(Op0I, m_OneUse(m_c_BinOp(m_OneUse(m_ZExt(m_Value(X))),
+                                           m_ConstantInt(C1))))) {
+          if (AndRHSMask.isIntN(X->getType()->getScalarSizeInBits())) {
+            auto *TruncC1 = ConstantExpr::getTrunc(C1, X->getType());
+            Value *BinOp;
+            Value *Op0LHS = Op0I->getOperand(0);
+            if (isa<ZExtInst>(Op0LHS))
+              BinOp = Builder.CreateBinOp(Op0I->getOpcode(), X, TruncC1);
+            else
+              BinOp = Builder.CreateBinOp(Op0I->getOpcode(), TruncC1, X);
+            auto *TruncC2 = ConstantExpr::getTrunc(AndRHS, X->getType());
+            auto *And = Builder.CreateAnd(BinOp, TruncC2);
+            return new ZExtInst(And, I.getType());
+          }
+        }
+      }
+
+      if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1)))
+        if (Instruction *Res = OptAndOp(Op0I, Op0CI, AndRHS, I))
+          return Res;
+    }
+
+    // If this is an integer truncation, and if the source is an 'and' with
+    // immediate, transform it. This frequently occurs for bitfield accesses.
+    {
+      Value *X = nullptr; ConstantInt *YC = nullptr;
+      if (match(Op0, m_Trunc(m_And(m_Value(X), m_ConstantInt(YC))))) {
+        // Change: and (trunc (and X, YC) to T), C2
+        // into  : and (trunc X to T), trunc(YC) & C2
+        // This will fold the two constants together, which may allow
+        // other simplifications.
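+        // Illustrative instance (hypothetical IR; YC = 255, C2 = 15):
+        //   %a = and i32 %x, 255
+        //   %t = trunc i32 %a to i8
+        //   %r = and i8 %t, 15
+        // should become
+        //   %t = trunc i32 %x to i8
+        //   %r = and i8 %t, 15      ; trunc(255) & 15 == 15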
+ Value *NewCast = Builder.CreateTrunc(X, I.getType(), "and.shrunk"); + Constant *C3 = ConstantExpr::getTrunc(YC, I.getType()); + C3 = ConstantExpr::getAnd(C3, AndRHS); + return BinaryOperator::CreateAnd(NewCast, C3); + } + } + } + + if (Instruction *Z = narrowMaskedBinOp(I)) + return Z; + + if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I)) + return FoldedLogic; + + if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + return DeMorgan; + + { + Value *A, *B, *C; + // A & (A ^ B) --> A & ~B + if (match(Op1, m_OneUse(m_c_Xor(m_Specific(Op0), m_Value(B))))) + return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(B)); + // (A ^ B) & A --> A & ~B + if (match(Op0, m_OneUse(m_c_Xor(m_Specific(Op1), m_Value(B))))) + return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(B)); + + // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C + if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) + if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) + if (Op1->hasOneUse() || isFreeToInvert(C, C->hasOneUse())) + return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(C)); + + // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C + if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B)))) + if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) + if (Op0->hasOneUse() || isFreeToInvert(C, C->hasOneUse())) + return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C)); + + // (A | B) & ((~A) ^ B) -> (A & B) + // (A | B) & (B ^ (~A)) -> (A & B) + // (B | A) & ((~A) ^ B) -> (A & B) + // (B | A) & (B ^ (~A)) -> (A & B) + if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && + match(Op0, m_c_Or(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateAnd(A, B); + + // ((~A) ^ B) & (A | B) -> (A & B) + // ((~A) ^ B) & (B | A) -> (A & B) + // (B ^ (~A)) & (A | B) -> (A & B) + // (B ^ (~A)) & (B | A) -> (A & B) + if (match(Op0, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && + match(Op1, m_c_Or(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateAnd(A, B); + } + + { + ICmpInst *LHS = dyn_cast<ICmpInst>(Op0); + ICmpInst *RHS = dyn_cast<ICmpInst>(Op1); + if (LHS && RHS) + if (Value *Res = foldAndOfICmps(LHS, RHS, I)) + return replaceInstUsesWith(I, Res); + + // TODO: Make this recursive; it's a little tricky because an arbitrary + // number of 'and' instructions might have to be created. + Value *X, *Y; + if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) { + if (auto *Cmp = dyn_cast<ICmpInst>(X)) + if (Value *Res = foldAndOfICmps(LHS, Cmp, I)) + return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y)); + if (auto *Cmp = dyn_cast<ICmpInst>(Y)) + if (Value *Res = foldAndOfICmps(LHS, Cmp, I)) + return replaceInstUsesWith(I, Builder.CreateAnd(Res, X)); + } + if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) { + if (auto *Cmp = dyn_cast<ICmpInst>(X)) + if (Value *Res = foldAndOfICmps(Cmp, RHS, I)) + return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y)); + if (auto *Cmp = dyn_cast<ICmpInst>(Y)) + if (Value *Res = foldAndOfICmps(Cmp, RHS, I)) + return replaceInstUsesWith(I, Builder.CreateAnd(Res, X)); + } + } + + if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) + if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) + if (Value *Res = foldLogicOfFCmps(LHS, RHS, true)) + return replaceInstUsesWith(I, Res); + + if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) + return FoldedFCmps; + + if (Instruction *CastedAnd = foldCastedBitwiseLogic(I)) + return CastedAnd; + + // and(sext(A), B) / and(B, sext(A)) --> A ? B : 0, where A is i1 or <N x i1>. 
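+  // For example (hypothetical IR):
+  //   %m = sext i1 %a to i32
+  //   %r = and i32 %m, %b
+  // is turned into
+  //   %r = select i1 %a, i32 %b, i32 0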
+ Value *A; + if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) && + A->getType()->isIntOrIntVectorTy(1)) + return SelectInst::Create(A, Op1, Constant::getNullValue(I.getType())); + if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) && + A->getType()->isIntOrIntVectorTy(1)) + return SelectInst::Create(A, Op0, Constant::getNullValue(I.getType())); + + // and(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? X : 0. + { + Value *X, *Y; + const APInt *ShAmt; + Type *Ty = I.getType(); + if (match(&I, m_c_And(m_OneUse(m_AShr(m_NSWSub(m_Value(Y), m_Value(X)), + m_APInt(ShAmt))), + m_Deferred(X))) && + *ShAmt == Ty->getScalarSizeInBits() - 1) { + Value *NewICmpInst = Builder.CreateICmpSGT(X, Y); + return SelectInst::Create(NewICmpInst, X, ConstantInt::getNullValue(Ty)); + } + } + + return nullptr; +} + +Instruction *InstCombiner::matchBSwap(BinaryOperator &Or) { + assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'"); + Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1); + + // Look through zero extends. + if (Instruction *Ext = dyn_cast<ZExtInst>(Op0)) + Op0 = Ext->getOperand(0); + + if (Instruction *Ext = dyn_cast<ZExtInst>(Op1)) + Op1 = Ext->getOperand(0); + + // (A | B) | C and A | (B | C) -> bswap if possible. + bool OrOfOrs = match(Op0, m_Or(m_Value(), m_Value())) || + match(Op1, m_Or(m_Value(), m_Value())); + + // (A >> B) | (C << D) and (A << B) | (B >> C) -> bswap if possible. + bool OrOfShifts = match(Op0, m_LogicalShift(m_Value(), m_Value())) && + match(Op1, m_LogicalShift(m_Value(), m_Value())); + + // (A & B) | (C & D) -> bswap if possible. + bool OrOfAnds = match(Op0, m_And(m_Value(), m_Value())) && + match(Op1, m_And(m_Value(), m_Value())); + + // (A << B) | (C & D) -> bswap if possible. + // The bigger pattern here is ((A & C1) << C2) | ((B >> C2) & C1), which is a + // part of the bswap idiom for specific values of C1, C2 (e.g. C1 = 16711935, + // C2 = 8 for i32). + // This pattern can occur when the operands of the 'or' are not canonicalized + // for some reason (not having only one use, for example). + bool OrOfAndAndSh = (match(Op0, m_LogicalShift(m_Value(), m_Value())) && + match(Op1, m_And(m_Value(), m_Value()))) || + (match(Op0, m_And(m_Value(), m_Value())) && + match(Op1, m_LogicalShift(m_Value(), m_Value()))); + + if (!OrOfOrs && !OrOfShifts && !OrOfAnds && !OrOfAndAndSh) + return nullptr; + + SmallVector<Instruction*, 4> Insts; + if (!recognizeBSwapOrBitReverseIdiom(&Or, true, false, Insts)) + return nullptr; + Instruction *LastInst = Insts.pop_back_val(); + LastInst->removeFromParent(); + + for (auto *Inst : Insts) + Worklist.Add(Inst); + return LastInst; +} + +/// Transform UB-safe variants of bitwise rotate to the funnel shift intrinsic. +static Instruction *matchRotate(Instruction &Or) { + // TODO: Can we reduce the code duplication between this and the related + // rotate matching code under visitSelect and visitTrunc? 
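+  // A typical rotate-left idiom this is meant to catch (hypothetical i32 IR,
+  // Width = 32):
+  //   %m   = and i32 %amt, 31
+  //   %neg = sub i32 0, %amt
+  //   %nm  = and i32 %neg, 31
+  //   %hi  = shl i32 %x, %m
+  //   %lo  = lshr i32 %x, %nm
+  //   %or  = or i32 %hi, %lo
+  // which should become
+  //   %or = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %amt)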
+ unsigned Width = Or.getType()->getScalarSizeInBits(); + if (!isPowerOf2_32(Width)) + return nullptr; + + // First, find an or'd pair of opposite shifts with the same shifted operand: + // or (lshr ShVal, ShAmt0), (shl ShVal, ShAmt1) + BinaryOperator *Or0, *Or1; + if (!match(Or.getOperand(0), m_BinOp(Or0)) || + !match(Or.getOperand(1), m_BinOp(Or1))) + return nullptr; + + Value *ShVal, *ShAmt0, *ShAmt1; + if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(ShVal), m_Value(ShAmt0)))) || + !match(Or1, m_OneUse(m_LogicalShift(m_Specific(ShVal), m_Value(ShAmt1))))) + return nullptr; + + BinaryOperator::BinaryOps ShiftOpcode0 = Or0->getOpcode(); + BinaryOperator::BinaryOps ShiftOpcode1 = Or1->getOpcode(); + if (ShiftOpcode0 == ShiftOpcode1) + return nullptr; + + // Match the shift amount operands for a rotate pattern. This always matches + // a subtraction on the R operand. + auto matchShiftAmount = [](Value *L, Value *R, unsigned Width) -> Value * { + // The shift amount may be masked with negation: + // (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1))) + Value *X; + unsigned Mask = Width - 1; + if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) && + match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask)))) + return X; + + // Similar to above, but the shift amount may be extended after masking, + // so return the extended value as the parameter for the intrinsic. + if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) && + match(R, m_And(m_Neg(m_ZExt(m_And(m_Specific(X), m_SpecificInt(Mask)))), + m_SpecificInt(Mask)))) + return L; + + return nullptr; + }; + + Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, Width); + bool SubIsOnLHS = false; + if (!ShAmt) { + ShAmt = matchShiftAmount(ShAmt1, ShAmt0, Width); + SubIsOnLHS = true; + } + if (!ShAmt) + return nullptr; + + bool IsFshl = (!SubIsOnLHS && ShiftOpcode0 == BinaryOperator::Shl) || + (SubIsOnLHS && ShiftOpcode1 == BinaryOperator::Shl); + Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr; + Function *F = Intrinsic::getDeclaration(Or.getModule(), IID, Or.getType()); + return IntrinsicInst::Create(F, { ShVal, ShVal, ShAmt }); +} + +/// If all elements of two constant vectors are 0/-1 and inverses, return true. +static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) { + unsigned NumElts = C1->getType()->getVectorNumElements(); + for (unsigned i = 0; i != NumElts; ++i) { + Constant *EltC1 = C1->getAggregateElement(i); + Constant *EltC2 = C2->getAggregateElement(i); + if (!EltC1 || !EltC2) + return false; + + // One element must be all ones, and the other must be all zeros. + if (!((match(EltC1, m_Zero()) && match(EltC2, m_AllOnes())) || + (match(EltC2, m_Zero()) && match(EltC1, m_AllOnes())))) + return false; + } + return true; +} + +/// We have an expression of the form (A & C) | (B & D). If A is a scalar or +/// vector composed of all-zeros or all-ones values and is the bitwise 'not' of +/// B, it can be used as the condition operand of a select instruction. +Value *InstCombiner::getSelectCondition(Value *A, Value *B) { + // Step 1: We may have peeked through bitcasts in the caller. + // Exit immediately if we don't have (vector) integer types. + Type *Ty = A->getType(); + if (!Ty->isIntOrIntVectorTy() || !B->getType()->isIntOrIntVectorTy()) + return nullptr; + + // Step 2: We need 0 or all-1's bitmasks. + if (ComputeNumSignBits(A) != Ty->getScalarSizeInBits()) + return nullptr; + + // Step 3: If B is the 'not' value of A, we have our answer. 
+ if (match(A, m_Not(m_Specific(B)))) { + // If these are scalars or vectors of i1, A can be used directly. + if (Ty->isIntOrIntVectorTy(1)) + return A; + return Builder.CreateTrunc(A, CmpInst::makeCmpResultType(Ty)); + } + + // If both operands are constants, see if the constants are inverse bitmasks. + Constant *AConst, *BConst; + if (match(A, m_Constant(AConst)) && match(B, m_Constant(BConst))) + if (AConst == ConstantExpr::getNot(BConst)) + return Builder.CreateZExtOrTrunc(A, CmpInst::makeCmpResultType(Ty)); + + // Look for more complex patterns. The 'not' op may be hidden behind various + // casts. Look through sexts and bitcasts to find the booleans. + Value *Cond; + Value *NotB; + if (match(A, m_SExt(m_Value(Cond))) && + Cond->getType()->isIntOrIntVectorTy(1) && + match(B, m_OneUse(m_Not(m_Value(NotB))))) { + NotB = peekThroughBitcast(NotB, true); + if (match(NotB, m_SExt(m_Specific(Cond)))) + return Cond; + } + + // All scalar (and most vector) possibilities should be handled now. + // Try more matches that only apply to non-splat constant vectors. + if (!Ty->isVectorTy()) + return nullptr; + + // If both operands are xor'd with constants using the same sexted boolean + // operand, see if the constants are inverse bitmasks. + // TODO: Use ConstantExpr::getNot()? + if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AConst)))) && + match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BConst)))) && + Cond->getType()->isIntOrIntVectorTy(1) && + areInverseVectorBitmasks(AConst, BConst)) { + AConst = ConstantExpr::getTrunc(AConst, CmpInst::makeCmpResultType(Ty)); + return Builder.CreateXor(Cond, AConst); + } + return nullptr; +} + +/// We have an expression of the form (A & C) | (B & D). Try to simplify this +/// to "A' ? C : D", where A' is a boolean or vector of booleans. +Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B, + Value *D) { + // The potential condition of the select may be bitcasted. In that case, look + // through its bitcast and the corresponding bitcast of the 'not' condition. + Type *OrigType = A->getType(); + A = peekThroughBitcast(A, true); + B = peekThroughBitcast(B, true); + if (Value *Cond = getSelectCondition(A, B)) { + // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D)) + // The bitcasts will either all exist or all not exist. The builder will + // not create unnecessary casts if the types already match. + Value *BitcastC = Builder.CreateBitCast(C, A->getType()); + Value *BitcastD = Builder.CreateBitCast(D, A->getType()); + Value *Select = Builder.CreateSelect(Cond, BitcastC, BitcastD); + return Builder.CreateBitCast(Select, OrigType); + } + + return nullptr; +} + +/// Fold (icmp)|(icmp) if possible. +Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, + Instruction &CxtI) { + const SimplifyQuery Q = SQ.getWithInstruction(&CxtI); + + // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) + // if K1 and K2 are a one-bit mask. 
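+  // Illustrative instance (hypothetical IR; K1 = 1, K2 = 8):
+  //   %a1 = and i32 %a, 1
+  //   %c1 = icmp eq i32 %a1, 0
+  //   %a2 = and i32 %a, 8
+  //   %c2 = icmp eq i32 %a2, 0
+  //   %or = or i1 %c1, %c2
+  // should become
+  //   %m  = and i32 %a, 9
+  //   %or = icmp ne i32 %m, 9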
+  if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, false, CxtI))
+    return V;
+
+  ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+
+  ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+
+  // Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3)
+  // --> (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3)
+  // The original condition actually refers to the following two ranges:
+  // [MAX_UINT-C1+1, MAX_UINT-C1+1+C3] and [MAX_UINT-C2+1, MAX_UINT-C2+1+C3]
+  // We can fold these two ranges if:
+  // 1) C1 and C2 are unsigned greater than C3.
+  // 2) The two ranges are separated.
+  // 3) C1 ^ C2 is a one-bit mask.
+  // 4) LowRange1 ^ LowRange2 and HighRange1 ^ HighRange2 are one-bit masks.
+  // This implies all values in the two ranges differ by exactly one bit.
+
+  if ((PredL == ICmpInst::ICMP_ULT || PredL == ICmpInst::ICMP_ULE) &&
+      PredL == PredR && LHSC && RHSC && LHS->hasOneUse() && RHS->hasOneUse() &&
+      LHSC->getType() == RHSC->getType() &&
+      LHSC->getValue() == (RHSC->getValue())) {
+
+    Value *LAdd = LHS->getOperand(0);
+    Value *RAdd = RHS->getOperand(0);
+
+    Value *LAddOpnd, *RAddOpnd;
+    ConstantInt *LAddC, *RAddC;
+    if (match(LAdd, m_Add(m_Value(LAddOpnd), m_ConstantInt(LAddC))) &&
+        match(RAdd, m_Add(m_Value(RAddOpnd), m_ConstantInt(RAddC))) &&
+        LAddC->getValue().ugt(LHSC->getValue()) &&
+        RAddC->getValue().ugt(LHSC->getValue())) {
+
+      APInt DiffC = LAddC->getValue() ^ RAddC->getValue();
+      if (LAddOpnd == RAddOpnd && DiffC.isPowerOf2()) {
+        ConstantInt *MaxAddC = nullptr;
+        if (LAddC->getValue().ult(RAddC->getValue()))
+          MaxAddC = RAddC;
+        else
+          MaxAddC = LAddC;
+
+        APInt RRangeLow = -RAddC->getValue();
+        APInt RRangeHigh = RRangeLow + LHSC->getValue();
+        APInt LRangeLow = -LAddC->getValue();
+        APInt LRangeHigh = LRangeLow + LHSC->getValue();
+        APInt LowRangeDiff = RRangeLow ^ LRangeLow;
+        APInt HighRangeDiff = RRangeHigh ^ LRangeHigh;
+        APInt RangeDiff = LRangeLow.sgt(RRangeLow) ?
LRangeLow - RRangeLow + : RRangeLow - LRangeLow; + + if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff && + RangeDiff.ugt(LHSC->getValue())) { + Value *MaskC = ConstantInt::get(LAddC->getType(), ~DiffC); + + Value *NewAnd = Builder.CreateAnd(LAddOpnd, MaskC); + Value *NewAdd = Builder.CreateAdd(NewAnd, MaxAddC); + return Builder.CreateICmp(LHS->getPredicate(), NewAdd, LHSC); + } + } + } + } + + // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B) + if (predicatesFoldable(PredL, PredR)) { + if (LHS->getOperand(0) == RHS->getOperand(1) && + LHS->getOperand(1) == RHS->getOperand(0)) + LHS->swapOperands(); + if (LHS->getOperand(0) == RHS->getOperand(0) && + LHS->getOperand(1) == RHS->getOperand(1)) { + Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); + unsigned Code = getICmpCode(LHS) | getICmpCode(RHS); + bool IsSigned = LHS->isSigned() || RHS->isSigned(); + return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder); + } + } + + // handle (roughly): + // (icmp ne (A & B), C) | (icmp ne (A & D), E) + if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder)) + return V; + + Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0); + if (LHS->hasOneUse() || RHS->hasOneUse()) { + // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1) + // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1) + Value *A = nullptr, *B = nullptr; + if (PredL == ICmpInst::ICMP_EQ && LHSC && LHSC->isZero()) { + B = LHS0; + if (PredR == ICmpInst::ICMP_ULT && LHS0 == RHS->getOperand(1)) + A = RHS0; + else if (PredR == ICmpInst::ICMP_UGT && LHS0 == RHS0) + A = RHS->getOperand(1); + } + // (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1) + // (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1) + else if (PredR == ICmpInst::ICMP_EQ && RHSC && RHSC->isZero()) { + B = RHS0; + if (PredL == ICmpInst::ICMP_ULT && RHS0 == LHS->getOperand(1)) + A = LHS0; + else if (PredL == ICmpInst::ICMP_UGT && LHS0 == RHS0) + A = LHS->getOperand(1); + } + if (A && B) + return Builder.CreateICmp( + ICmpInst::ICMP_UGE, + Builder.CreateAdd(B, ConstantInt::getSigned(B->getType(), -1)), A); + } + + // E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n + if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/true)) + return V; + + // E.g. (icmp sgt x, n) | (icmp slt x, 0) --> icmp ugt x, n + if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true)) + return V; + + if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, false, Builder)) + return V; + + if (Value *V = foldIsPowerOf2(LHS, RHS, false /* JoinedByAnd */, Builder)) + return V; + + if (Value *X = + foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/false, Q, Builder)) + return X; + if (Value *X = + foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/false, Q, Builder)) + return X; + + // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2). + if (!LHSC || !RHSC) + return nullptr; + + if (LHSC == RHSC && PredL == PredR) { + // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0) + if (PredL == ICmpInst::ICMP_NE && LHSC->isZero()) { + Value *NewOr = Builder.CreateOr(LHS0, RHS0); + return Builder.CreateICmp(PredL, NewOr, LHSC); + } + } + + // (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1) + // iff C2 + CA == C1. 
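+  // For example (hypothetical IR; CA = 1, C1 = 14, C2 = 13, so C2 + CA == C1):
+  //   %add = add i32 %x, 1
+  //   %c1  = icmp ult i32 %add, 14
+  //   %c2  = icmp eq i32 %x, 13
+  //   %or  = or i1 %c1, %c2
+  // should become
+  //   %add = add i32 %x, 1
+  //   %or  = icmp ule i32 %add, 14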
+ if (PredL == ICmpInst::ICMP_ULT && PredR == ICmpInst::ICMP_EQ) { + ConstantInt *AddC; + if (match(LHS0, m_Add(m_Specific(RHS0), m_ConstantInt(AddC)))) + if (RHSC->getValue() + AddC->getValue() == LHSC->getValue()) + return Builder.CreateICmpULE(LHS0, LHSC); + } + + // From here on, we only handle: + // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler. + if (LHS0 != RHS0) + return nullptr; + + // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere. + if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE || + PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE || + PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE || + PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE) + return nullptr; + + // We can't fold (ugt x, C) | (sgt x, C2). + if (!predicatesFoldable(PredL, PredR)) + return nullptr; + + // Ensure that the larger constant is on the RHS. + bool ShouldSwap; + if (CmpInst::isSigned(PredL) || + (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR))) + ShouldSwap = LHSC->getValue().sgt(RHSC->getValue()); + else + ShouldSwap = LHSC->getValue().ugt(RHSC->getValue()); + + if (ShouldSwap) { + std::swap(LHS, RHS); + std::swap(LHSC, RHSC); + std::swap(PredL, PredR); + } + + // At this point, we know we have two icmp instructions + // comparing a value against two constants and or'ing the result + // together. Because of the above check, we know that we only have + // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the + // icmp folding check above), that the two constants are not + // equal. + assert(LHSC != RHSC && "Compares not folded above?"); + + switch (PredL) { + default: + llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: + switch (PredR) { + default: + llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: + // Potential folds for this case should already be handled. 
+ break; + case ICmpInst::ICMP_UGT: + // (X == 0 || X u> C) -> (X-1) u>= C + if (LHSC->isMinValue(false)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1, + false, false); + // (X == 13 | X u> 14) -> no change + break; + case ICmpInst::ICMP_SGT: + // (X == INT_MIN || X s> C) -> (X-(INT_MIN+1)) u>= C-INT_MIN + if (LHSC->isMinValue(true)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1, + true, false); + // (X == 13 | X s> 14) -> no change + break; + } + break; + case ICmpInst::ICMP_ULT: + switch (PredR) { + default: + llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change + // (X u< C || X == UINT_MAX) => (X-C) u>= UINT_MAX-C + if (RHSC->isMaxValue(false)) + return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(), + false, false); + break; + case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2 + assert(!RHSC->isMaxValue(false) && "Missed icmp simplification"); + return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, + false, false); + } + break; + case ICmpInst::ICMP_SLT: + switch (PredR) { + default: + llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: + // (X s< C || X == INT_MAX) => (X-C) u>= INT_MAX-C + if (RHSC->isMaxValue(true)) + return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(), + true, false); + // (X s< 13 | X == 14) -> no change + break; + case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) u> 2 + assert(!RHSC->isMaxValue(true) && "Missed icmp simplification"); + return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true, + false); + } + break; + } + return nullptr; +} + +// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches +// here. We should standardize that construct where it is needed or choose some +// other way to ensure that commutated variants of patterns are not missed. +Instruction *InstCombiner::visitOr(BinaryOperator &I) { + if (Value *V = SimplifyOrInst(I.getOperand(0), I.getOperand(1), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (SimplifyAssociativeOrCommutative(I)) + return &I; + + if (Instruction *X = foldVectorBinop(I)) + return X; + + // See if we can simplify any instructions used by the instruction whose sole + // purpose is to compute bits we don't care about. + if (SimplifyDemandedInstructionBits(I)) + return &I; + + // Do this before using distributive laws to catch simple and/or/not patterns. + if (Instruction *Xor = foldOrToXor(I, Builder)) + return Xor; + + // (A&B)|(A&C) -> A&(B|C) etc + if (Value *V = SimplifyUsingDistributiveLaws(I)) + return replaceInstUsesWith(I, V); + + if (Value *V = SimplifyBSwap(I, Builder)) + return replaceInstUsesWith(I, V); + + if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I)) + return FoldedLogic; + + if (Instruction *BSwap = matchBSwap(I)) + return BSwap; + + if (Instruction *Rotate = matchRotate(I)) + return Rotate; + + Value *X, *Y; + const APInt *CV; + if (match(&I, m_c_Or(m_OneUse(m_Xor(m_Value(X), m_APInt(CV))), m_Value(Y))) && + !CV->isAllOnesValue() && MaskedValueIsZero(Y, *CV, 0, &I)) { + // (X ^ C) | Y -> (X | Y) ^ C iff Y & C == 0 + // The check for a 'not' op is for efficiency (if Y is known zero --> ~X). 
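+    // For example (hypothetical IR; C = 4 and bit 2 of %y is known zero):
+    //   %x1 = xor i32 %x, 4
+    //   %or = or i32 %x1, %y
+    // should become
+    //   %t  = or i32 %x, %y
+    //   %or = xor i32 %t, 4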
+ Value *Or = Builder.CreateOr(X, Y); + return BinaryOperator::CreateXor(Or, ConstantInt::get(I.getType(), *CV)); + } + + // (A & C)|(B & D) + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + Value *A, *B, *C, *D; + if (match(Op0, m_And(m_Value(A), m_Value(C))) && + match(Op1, m_And(m_Value(B), m_Value(D)))) { + ConstantInt *C1 = dyn_cast<ConstantInt>(C); + ConstantInt *C2 = dyn_cast<ConstantInt>(D); + if (C1 && C2) { // (A & C1)|(B & C2) + Value *V1 = nullptr, *V2 = nullptr; + if ((C1->getValue() & C2->getValue()).isNullValue()) { + // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2) + // iff (C1&C2) == 0 and (N&~C1) == 0 + if (match(A, m_Or(m_Value(V1), m_Value(V2))) && + ((V1 == B && + MaskedValueIsZero(V2, ~C1->getValue(), 0, &I)) || // (V|N) + (V2 == B && + MaskedValueIsZero(V1, ~C1->getValue(), 0, &I)))) // (N|V) + return BinaryOperator::CreateAnd(A, + Builder.getInt(C1->getValue()|C2->getValue())); + // Or commutes, try both ways. + if (match(B, m_Or(m_Value(V1), m_Value(V2))) && + ((V1 == A && + MaskedValueIsZero(V2, ~C2->getValue(), 0, &I)) || // (V|N) + (V2 == A && + MaskedValueIsZero(V1, ~C2->getValue(), 0, &I)))) // (N|V) + return BinaryOperator::CreateAnd(B, + Builder.getInt(C1->getValue()|C2->getValue())); + + // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2) + // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0. + ConstantInt *C3 = nullptr, *C4 = nullptr; + if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) && + (C3->getValue() & ~C1->getValue()).isNullValue() && + match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) && + (C4->getValue() & ~C2->getValue()).isNullValue()) { + V2 = Builder.CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield"); + return BinaryOperator::CreateAnd(V2, + Builder.getInt(C1->getValue()|C2->getValue())); + } + } + + if (C1->getValue() == ~C2->getValue()) { + Value *X; + + // ((X|B)&C1)|(B&C2) -> (X&C1) | B iff C1 == ~C2 + if (match(A, m_c_Or(m_Value(X), m_Specific(B)))) + return BinaryOperator::CreateOr(Builder.CreateAnd(X, C1), B); + // (A&C2)|((X|A)&C1) -> (X&C2) | A iff C1 == ~C2 + if (match(B, m_c_Or(m_Specific(A), m_Value(X)))) + return BinaryOperator::CreateOr(Builder.CreateAnd(X, C2), A); + + // ((X^B)&C1)|(B&C2) -> (X&C1) ^ B iff C1 == ~C2 + if (match(A, m_c_Xor(m_Value(X), m_Specific(B)))) + return BinaryOperator::CreateXor(Builder.CreateAnd(X, C1), B); + // (A&C2)|((X^A)&C1) -> (X&C2) ^ A iff C1 == ~C2 + if (match(B, m_c_Xor(m_Specific(A), m_Value(X)))) + return BinaryOperator::CreateXor(Builder.CreateAnd(X, C2), A); + } + } + + // Don't try to form a select if it's unlikely that we'll get rid of at + // least one of the operands. A select is generally more expensive than the + // 'or' that it is replacing. + if (Op0->hasOneUse() || Op1->hasOneUse()) { + // (Cond & C) | (~Cond & D) -> Cond ? C : D, and commuted variants. 
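+      // For example (hypothetical IR; the condition masks are sign-extensions
+      // of an i1):
+      //   %m    = sext i1 %cond to i32
+      //   %notm = xor i32 %m, -1
+      //   %t0   = and i32 %m, %c
+      //   %t1   = and i32 %notm, %d
+      //   %or   = or i32 %t0, %t1
+      // should become
+      //   %or = select i1 %cond, i32 %c, i32 %d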
+ if (Value *V = matchSelectFromAndOr(A, C, B, D)) + return replaceInstUsesWith(I, V); + if (Value *V = matchSelectFromAndOr(A, C, D, B)) + return replaceInstUsesWith(I, V); + if (Value *V = matchSelectFromAndOr(C, A, B, D)) + return replaceInstUsesWith(I, V); + if (Value *V = matchSelectFromAndOr(C, A, D, B)) + return replaceInstUsesWith(I, V); + if (Value *V = matchSelectFromAndOr(B, D, A, C)) + return replaceInstUsesWith(I, V); + if (Value *V = matchSelectFromAndOr(B, D, C, A)) + return replaceInstUsesWith(I, V); + if (Value *V = matchSelectFromAndOr(D, B, A, C)) + return replaceInstUsesWith(I, V); + if (Value *V = matchSelectFromAndOr(D, B, C, A)) + return replaceInstUsesWith(I, V); + } + } + + // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C + if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) + if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) + return BinaryOperator::CreateOr(Op0, C); + + // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C + if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B)))) + if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) + return BinaryOperator::CreateOr(Op1, C); + + // ((B | C) & A) | B -> B | (A & C) + if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A)))) + return BinaryOperator::CreateOr(Op1, Builder.CreateAnd(A, C)); + + if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + return DeMorgan; + + // Canonicalize xor to the RHS. + bool SwappedForXor = false; + if (match(Op0, m_Xor(m_Value(), m_Value()))) { + std::swap(Op0, Op1); + SwappedForXor = true; + } + + // A | ( A ^ B) -> A | B + // A | (~A ^ B) -> A | ~B + // (A & B) | (A ^ B) + if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) { + if (Op0 == A || Op0 == B) + return BinaryOperator::CreateOr(A, B); + + if (match(Op0, m_And(m_Specific(A), m_Specific(B))) || + match(Op0, m_And(m_Specific(B), m_Specific(A)))) + return BinaryOperator::CreateOr(A, B); + + if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) { + Value *Not = Builder.CreateNot(B, B->getName() + ".not"); + return BinaryOperator::CreateOr(Not, Op0); + } + if (Op1->hasOneUse() && match(B, m_Not(m_Specific(Op0)))) { + Value *Not = Builder.CreateNot(A, A->getName() + ".not"); + return BinaryOperator::CreateOr(Not, Op0); + } + } + + // A | ~(A | B) -> A | ~B + // A | ~(A ^ B) -> A | ~B + if (match(Op1, m_Not(m_Value(A)))) + if (BinaryOperator *B = dyn_cast<BinaryOperator>(A)) + if ((Op0 == B->getOperand(0) || Op0 == B->getOperand(1)) && + Op1->hasOneUse() && (B->getOpcode() == Instruction::Or || + B->getOpcode() == Instruction::Xor)) { + Value *NotOp = Op0 == B->getOperand(0) ? B->getOperand(1) : + B->getOperand(0); + Value *Not = Builder.CreateNot(NotOp, NotOp->getName() + ".not"); + return BinaryOperator::CreateOr(Not, Op0); + } + + if (SwappedForXor) + std::swap(Op0, Op1); + + { + ICmpInst *LHS = dyn_cast<ICmpInst>(Op0); + ICmpInst *RHS = dyn_cast<ICmpInst>(Op1); + if (LHS && RHS) + if (Value *Res = foldOrOfICmps(LHS, RHS, I)) + return replaceInstUsesWith(I, Res); + + // TODO: Make this recursive; it's a little tricky because an arbitrary + // number of 'or' instructions might have to be created. 
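+    // For example (hypothetical IR; %z is some unrelated i1):
+    //   %c1 = icmp ne i32 %a, 0
+    //   %c2 = icmp ne i32 %b, 0
+    //   %t  = or i1 %c2, %z
+    //   %or = or i1 %c1, %t
+    // should become
+    //   %ab = or i32 %a, %b
+    //   %c  = icmp ne i32 %ab, 0
+    //   %or = or i1 %c, %z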
+ Value *X, *Y; + if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) { + if (auto *Cmp = dyn_cast<ICmpInst>(X)) + if (Value *Res = foldOrOfICmps(LHS, Cmp, I)) + return replaceInstUsesWith(I, Builder.CreateOr(Res, Y)); + if (auto *Cmp = dyn_cast<ICmpInst>(Y)) + if (Value *Res = foldOrOfICmps(LHS, Cmp, I)) + return replaceInstUsesWith(I, Builder.CreateOr(Res, X)); + } + if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) { + if (auto *Cmp = dyn_cast<ICmpInst>(X)) + if (Value *Res = foldOrOfICmps(Cmp, RHS, I)) + return replaceInstUsesWith(I, Builder.CreateOr(Res, Y)); + if (auto *Cmp = dyn_cast<ICmpInst>(Y)) + if (Value *Res = foldOrOfICmps(Cmp, RHS, I)) + return replaceInstUsesWith(I, Builder.CreateOr(Res, X)); + } + } + + if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) + if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) + if (Value *Res = foldLogicOfFCmps(LHS, RHS, false)) + return replaceInstUsesWith(I, Res); + + if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) + return FoldedFCmps; + + if (Instruction *CastedOr = foldCastedBitwiseLogic(I)) + return CastedOr; + + // or(sext(A), B) / or(B, sext(A)) --> A ? -1 : B, where A is i1 or <N x i1>. + if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) && + A->getType()->isIntOrIntVectorTy(1)) + return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1); + if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) && + A->getType()->isIntOrIntVectorTy(1)) + return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0); + + // Note: If we've gotten to the point of visiting the outer OR, then the + // inner one couldn't be simplified. If it was a constant, then it won't + // be simplified by a later pass either, so we try swapping the inner/outer + // ORs in the hopes that we'll be able to simplify it this way. + // (X|C) | V --> (X|V) | C + ConstantInt *CI; + if (Op0->hasOneUse() && !isa<ConstantInt>(Op1) && + match(Op0, m_Or(m_Value(A), m_ConstantInt(CI)))) { + Value *Inner = Builder.CreateOr(A, Op1); + Inner->takeName(Op0); + return BinaryOperator::CreateOr(Inner, CI); + } + + // Change (or (bool?A:B),(bool?C:D)) --> (bool?(or A,C):(or B,D)) + // Since this OR statement hasn't been optimized further yet, we hope + // that this transformation will allow the new ORs to be optimized. + { + Value *X = nullptr, *Y = nullptr; + if (Op0->hasOneUse() && Op1->hasOneUse() && + match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) && + match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) { + Value *orTrue = Builder.CreateOr(A, C); + Value *orFalse = Builder.CreateOr(B, D); + return SelectInst::Create(X, orTrue, orFalse); + } + } + + // or(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? -1 : X. + { + Value *X, *Y; + const APInt *ShAmt; + Type *Ty = I.getType(); + if (match(&I, m_c_Or(m_OneUse(m_AShr(m_NSWSub(m_Value(Y), m_Value(X)), + m_APInt(ShAmt))), + m_Deferred(X))) && + *ShAmt == Ty->getScalarSizeInBits() - 1) { + Value *NewICmpInst = Builder.CreateICmpSGT(X, Y); + return SelectInst::Create(NewICmpInst, ConstantInt::getAllOnesValue(Ty), + X); + } + } + + if (Instruction *V = + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) + return V; + + return nullptr; +} + +/// A ^ B can be specified using other logic ops in a variety of patterns. We +/// can fold these early and efficiently by morphing an existing instruction. 
+static Instruction *foldXorToXor(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + assert(I.getOpcode() == Instruction::Xor); + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + Value *A, *B; + + // There are 4 commuted variants for each of the basic patterns. + + // (A & B) ^ (A | B) -> A ^ B + // (A & B) ^ (B | A) -> A ^ B + // (A | B) ^ (A & B) -> A ^ B + // (A | B) ^ (B & A) -> A ^ B + if (match(&I, m_c_Xor(m_And(m_Value(A), m_Value(B)), + m_c_Or(m_Deferred(A), m_Deferred(B))))) { + I.setOperand(0, A); + I.setOperand(1, B); + return &I; + } + + // (A | ~B) ^ (~A | B) -> A ^ B + // (~B | A) ^ (~A | B) -> A ^ B + // (~A | B) ^ (A | ~B) -> A ^ B + // (B | ~A) ^ (A | ~B) -> A ^ B + if (match(&I, m_Xor(m_c_Or(m_Value(A), m_Not(m_Value(B))), + m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B))))) { + I.setOperand(0, A); + I.setOperand(1, B); + return &I; + } + + // (A & ~B) ^ (~A & B) -> A ^ B + // (~B & A) ^ (~A & B) -> A ^ B + // (~A & B) ^ (A & ~B) -> A ^ B + // (B & ~A) ^ (A & ~B) -> A ^ B + if (match(&I, m_Xor(m_c_And(m_Value(A), m_Not(m_Value(B))), + m_c_And(m_Not(m_Deferred(A)), m_Deferred(B))))) { + I.setOperand(0, A); + I.setOperand(1, B); + return &I; + } + + // For the remaining cases we need to get rid of one of the operands. + if (!Op0->hasOneUse() && !Op1->hasOneUse()) + return nullptr; + + // (A | B) ^ ~(A & B) -> ~(A ^ B) + // (A | B) ^ ~(B & A) -> ~(A ^ B) + // (A & B) ^ ~(A | B) -> ~(A ^ B) + // (A & B) ^ ~(B | A) -> ~(A ^ B) + // Complexity sorting ensures the not will be on the right side. + if ((match(Op0, m_Or(m_Value(A), m_Value(B))) && + match(Op1, m_Not(m_c_And(m_Specific(A), m_Specific(B))))) || + (match(Op0, m_And(m_Value(A), m_Value(B))) && + match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))) + return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); + + return nullptr; +} + +Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, + BinaryOperator &I) { + assert(I.getOpcode() == Instruction::Xor && I.getOperand(0) == LHS && + I.getOperand(1) == RHS && "Should be 'xor' with these operands"); + + if (predicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) { + if (LHS->getOperand(0) == RHS->getOperand(1) && + LHS->getOperand(1) == RHS->getOperand(0)) + LHS->swapOperands(); + if (LHS->getOperand(0) == RHS->getOperand(0) && + LHS->getOperand(1) == RHS->getOperand(1)) { + // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B) + Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); + unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS); + bool IsSigned = LHS->isSigned() || RHS->isSigned(); + return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder); + } + } + + // TODO: This can be generalized to compares of non-signbits using + // decomposeBitTestICmp(). It could be enhanced more by using (something like) + // foldLogOpOfMaskedICmps(). 
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); + Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1); + Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1); + if ((LHS->hasOneUse() || RHS->hasOneUse()) && + LHS0->getType() == RHS0->getType() && + LHS0->getType()->isIntOrIntVectorTy()) { + // (X > -1) ^ (Y > -1) --> (X ^ Y) < 0 + // (X < 0) ^ (Y < 0) --> (X ^ Y) < 0 + if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) && + PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes())) || + (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) && + PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero()))) { + Value *Zero = ConstantInt::getNullValue(LHS0->getType()); + return Builder.CreateICmpSLT(Builder.CreateXor(LHS0, RHS0), Zero); + } + // (X > -1) ^ (Y < 0) --> (X ^ Y) > -1 + // (X < 0) ^ (Y > -1) --> (X ^ Y) > -1 + if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) && + PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero())) || + (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) && + PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes()))) { + Value *MinusOne = ConstantInt::getAllOnesValue(LHS0->getType()); + return Builder.CreateICmpSGT(Builder.CreateXor(LHS0, RHS0), MinusOne); + } + } + + // Instead of trying to imitate the folds for and/or, decompose this 'xor' + // into those logic ops. That is, try to turn this into an and-of-icmps + // because we have many folds for that pattern. + // + // This is based on a truth table definition of xor: + // X ^ Y --> (X | Y) & !(X & Y) + if (Value *OrICmp = SimplifyBinOp(Instruction::Or, LHS, RHS, SQ)) { + // TODO: If OrICmp is true, then the definition of xor simplifies to !(X&Y). + // TODO: If OrICmp is false, the whole thing is false (InstSimplify?). + if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) { + // TODO: Independently handle cases where the 'and' side is a constant. + ICmpInst *X = nullptr, *Y = nullptr; + if (OrICmp == LHS && AndICmp == RHS) { + // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS --> X & !Y + X = LHS; + Y = RHS; + } + if (OrICmp == RHS && AndICmp == LHS) { + // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS --> !Y & X + X = RHS; + Y = LHS; + } + if (X && Y && (Y->hasOneUse() || canFreelyInvertAllUsersOf(Y, &I))) { + // Invert the predicate of 'Y', thus inverting its output. + Y->setPredicate(Y->getInversePredicate()); + // So, are there other uses of Y? + if (!Y->hasOneUse()) { + // We need to adapt other uses of Y though. Get a value that matches + // the original value of Y before inversion. While this increases + // immediate instruction count, we have just ensured that all the + // users are freely-invertible, so that 'not' *will* get folded away. + BuilderTy::InsertPointGuard Guard(Builder); + // Set insertion point to right after the Y. + Builder.SetInsertPoint(Y->getParent(), ++(Y->getIterator())); + Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not"); + // Replace all uses of Y (excluding the one in NotY!) with NotY. + Y->replaceUsesWithIf(NotY, + [NotY](Use &U) { return U.getUser() != NotY; }); + } + // All done. + return Builder.CreateAnd(LHS, RHS); + } + } + } + + return nullptr; +} + +/// If we have a masked merge, in the canonical form of: +/// (assuming that A only has one use.) +/// | A | |B| +/// ((x ^ y) & M) ^ y +/// | D | +/// * If M is inverted: +/// | D | +/// ((x ^ y) & ~M) ^ y +/// We can canonicalize by swapping the final xor operand +/// to eliminate the 'not' of the mask. 
+/// ((x ^ y) & M) ^ x +/// * If M is a constant, and D has one use, we transform to 'and' / 'or' ops +/// because that shortens the dependency chain and improves analysis: +/// (x & M) | (y & ~M) +static Instruction *visitMaskedMerge(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *B, *X, *D; + Value *M; + if (!match(&I, m_c_Xor(m_Value(B), + m_OneUse(m_c_And( + m_CombineAnd(m_c_Xor(m_Deferred(B), m_Value(X)), + m_Value(D)), + m_Value(M)))))) + return nullptr; + + Value *NotM; + if (match(M, m_Not(m_Value(NotM)))) { + // De-invert the mask and swap the value in B part. + Value *NewA = Builder.CreateAnd(D, NotM); + return BinaryOperator::CreateXor(NewA, X); + } + + Constant *C; + if (D->hasOneUse() && match(M, m_Constant(C))) { + // Unfold. + Value *LHS = Builder.CreateAnd(X, C); + Value *NotC = Builder.CreateNot(C); + Value *RHS = Builder.CreateAnd(B, NotC); + return BinaryOperator::CreateOr(LHS, RHS); + } + + return nullptr; +} + +// Transform +// ~(x ^ y) +// into: +// (~x) ^ y +// or into +// x ^ (~y) +static Instruction *sinkNotIntoXor(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *X, *Y; + // FIXME: one-use check is not needed in general, but currently we are unable + // to fold 'not' into 'icmp', if that 'icmp' has multiple uses. (D35182) + if (!match(&I, m_Not(m_OneUse(m_Xor(m_Value(X), m_Value(Y)))))) + return nullptr; + + // We only want to do the transform if it is free to do. + if (isFreeToInvert(X, X->hasOneUse())) { + // Ok, good. + } else if (isFreeToInvert(Y, Y->hasOneUse())) { + std::swap(X, Y); + } else + return nullptr; + + Value *NotX = Builder.CreateNot(X, X->getName() + ".not"); + return BinaryOperator::CreateXor(NotX, Y, I.getName() + ".demorgan"); +} + +// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches +// here. We should standardize that construct where it is needed or choose some +// other way to ensure that commutated variants of patterns are not missed. +Instruction *InstCombiner::visitXor(BinaryOperator &I) { + if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (SimplifyAssociativeOrCommutative(I)) + return &I; + + if (Instruction *X = foldVectorBinop(I)) + return X; + + if (Instruction *NewXor = foldXorToXor(I, Builder)) + return NewXor; + + // (A&B)^(A&C) -> A&(B^C) etc + if (Value *V = SimplifyUsingDistributiveLaws(I)) + return replaceInstUsesWith(I, V); + + // See if we can simplify any instructions used by the instruction whose sole + // purpose is to compute bits we don't care about. + if (SimplifyDemandedInstructionBits(I)) + return &I; + + if (Value *V = SimplifyBSwap(I, Builder)) + return replaceInstUsesWith(I, V); + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + // Fold (X & M) ^ (Y & ~M) -> (X & M) | (Y & ~M) + // This it a special case in haveNoCommonBitsSet, but the computeKnownBits + // calls in there are unnecessary as SimplifyDemandedInstructionBits should + // have already taken care of those cases. + Value *M; + if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(M)), m_Value()), + m_c_And(m_Deferred(M), m_Value())))) + return BinaryOperator::CreateOr(Op0, Op1); + + // Apply DeMorgan's Law for 'nand' / 'nor' logic with an inverted operand. + Value *X, *Y; + + // We must eliminate the and/or (one-use) for these transforms to not increase + // the instruction count. 
+ // ~(~X & Y) --> (X | ~Y) + // ~(Y & ~X) --> (X | ~Y) + if (match(&I, m_Not(m_OneUse(m_c_And(m_Not(m_Value(X)), m_Value(Y)))))) { + Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not"); + return BinaryOperator::CreateOr(X, NotY); + } + // ~(~X | Y) --> (X & ~Y) + // ~(Y | ~X) --> (X & ~Y) + if (match(&I, m_Not(m_OneUse(m_c_Or(m_Not(m_Value(X)), m_Value(Y)))))) { + Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not"); + return BinaryOperator::CreateAnd(X, NotY); + } + + if (Instruction *Xor = visitMaskedMerge(I, Builder)) + return Xor; + + // Is this a 'not' (~) fed by a binary operator? + BinaryOperator *NotVal; + if (match(&I, m_Not(m_BinOp(NotVal)))) { + if (NotVal->getOpcode() == Instruction::And || + NotVal->getOpcode() == Instruction::Or) { + // Apply DeMorgan's Law when inverts are free: + // ~(X & Y) --> (~X | ~Y) + // ~(X | Y) --> (~X & ~Y) + if (isFreeToInvert(NotVal->getOperand(0), + NotVal->getOperand(0)->hasOneUse()) && + isFreeToInvert(NotVal->getOperand(1), + NotVal->getOperand(1)->hasOneUse())) { + Value *NotX = Builder.CreateNot(NotVal->getOperand(0), "notlhs"); + Value *NotY = Builder.CreateNot(NotVal->getOperand(1), "notrhs"); + if (NotVal->getOpcode() == Instruction::And) + return BinaryOperator::CreateOr(NotX, NotY); + return BinaryOperator::CreateAnd(NotX, NotY); + } + } + + // ~(X - Y) --> ~X + Y + if (match(NotVal, m_Sub(m_Value(X), m_Value(Y)))) + if (isa<Constant>(X) || NotVal->hasOneUse()) + return BinaryOperator::CreateAdd(Builder.CreateNot(X), Y); + + // ~(~X >>s Y) --> (X >>s Y) + if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y)))) + return BinaryOperator::CreateAShr(X, Y); + + // If we are inverting a right-shifted constant, we may be able to eliminate + // the 'not' by inverting the constant and using the opposite shift type. + // Canonicalization rules ensure that only a negative constant uses 'ashr', + // but we must check that in case that transform has not fired yet. + + // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits) + Constant *C; + if (match(NotVal, m_AShr(m_Constant(C), m_Value(Y))) && + match(C, m_Negative())) + return BinaryOperator::CreateLShr(ConstantExpr::getNot(C), Y); + + // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits) + if (match(NotVal, m_LShr(m_Constant(C), m_Value(Y))) && + match(C, m_NonNegative())) + return BinaryOperator::CreateAShr(ConstantExpr::getNot(C), Y); + + // ~(X + C) --> -(C + 1) - X + if (match(Op0, m_Add(m_Value(X), m_Constant(C)))) + return BinaryOperator::CreateSub(ConstantExpr::getNeg(AddOne(C)), X); + } + + // Use DeMorgan and reassociation to eliminate a 'not' op. 
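+  // For example (illustrative i8 constants): (~X | 15) ^ -16 --> (X & -16) ^ 15.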
+ Constant *C1; + if (match(Op1, m_Constant(C1))) { + Constant *C2; + if (match(Op0, m_OneUse(m_Or(m_Not(m_Value(X)), m_Constant(C2))))) { + // (~X | C2) ^ C1 --> ((X & ~C2) ^ -1) ^ C1 --> (X & ~C2) ^ ~C1 + Value *And = Builder.CreateAnd(X, ConstantExpr::getNot(C2)); + return BinaryOperator::CreateXor(And, ConstantExpr::getNot(C1)); + } + if (match(Op0, m_OneUse(m_And(m_Not(m_Value(X)), m_Constant(C2))))) { + // (~X & C2) ^ C1 --> ((X | ~C2) ^ -1) ^ C1 --> (X | ~C2) ^ ~C1 + Value *Or = Builder.CreateOr(X, ConstantExpr::getNot(C2)); + return BinaryOperator::CreateXor(Or, ConstantExpr::getNot(C1)); + } + } + + // not (cmp A, B) = !cmp A, B + CmpInst::Predicate Pred; + if (match(&I, m_Not(m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))))) { + cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred)); + return replaceInstUsesWith(I, Op0); + } + + { + const APInt *RHSC; + if (match(Op1, m_APInt(RHSC))) { + Value *X; + const APInt *C; + if (RHSC->isSignMask() && match(Op0, m_Sub(m_APInt(C), m_Value(X)))) { + // (C - X) ^ signmask -> (C + signmask - X) + Constant *NewC = ConstantInt::get(I.getType(), *C + *RHSC); + return BinaryOperator::CreateSub(NewC, X); + } + if (RHSC->isSignMask() && match(Op0, m_Add(m_Value(X), m_APInt(C)))) { + // (X + C) ^ signmask -> (X + C + signmask) + Constant *NewC = ConstantInt::get(I.getType(), *C + *RHSC); + return BinaryOperator::CreateAdd(X, NewC); + } + + // (X|C1)^C2 -> X^(C1^C2) iff X&~C1 == 0 + if (match(Op0, m_Or(m_Value(X), m_APInt(C))) && + MaskedValueIsZero(X, *C, 0, &I)) { + Constant *NewC = ConstantInt::get(I.getType(), *C ^ *RHSC); + Worklist.Add(cast<Instruction>(Op0)); + I.setOperand(0, X); + I.setOperand(1, NewC); + return &I; + } + } + } + + if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1)) { + if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) { + if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) { + if (Op0I->getOpcode() == Instruction::LShr) { + // ((X^C1) >> C2) ^ C3 -> (X>>C2) ^ ((C1>>C2)^C3) + // E1 = "X ^ C1" + BinaryOperator *E1; + ConstantInt *C1; + if (Op0I->hasOneUse() && + (E1 = dyn_cast<BinaryOperator>(Op0I->getOperand(0))) && + E1->getOpcode() == Instruction::Xor && + (C1 = dyn_cast<ConstantInt>(E1->getOperand(1)))) { + // fold (C1 >> C2) ^ C3 + ConstantInt *C2 = Op0CI, *C3 = RHSC; + APInt FoldConst = C1->getValue().lshr(C2->getValue()); + FoldConst ^= C3->getValue(); + // Prepare the two operands. + Value *Opnd0 = Builder.CreateLShr(E1->getOperand(0), C2); + Opnd0->takeName(Op0I); + cast<Instruction>(Opnd0)->setDebugLoc(I.getDebugLoc()); + Value *FoldVal = ConstantInt::get(Opnd0->getType(), FoldConst); + + return BinaryOperator::CreateXor(Opnd0, FoldVal); + } + } + } + } + } + + if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I)) + return FoldedLogic; + + // Y ^ (X | Y) --> X & ~Y + // Y ^ (Y | X) --> X & ~Y + if (match(Op1, m_OneUse(m_c_Or(m_Value(X), m_Specific(Op0))))) + return BinaryOperator::CreateAnd(X, Builder.CreateNot(Op0)); + // (X | Y) ^ Y --> X & ~Y + // (Y | X) ^ Y --> X & ~Y + if (match(Op0, m_OneUse(m_c_Or(m_Value(X), m_Specific(Op1))))) + return BinaryOperator::CreateAnd(X, Builder.CreateNot(Op1)); + + // Y ^ (X & Y) --> ~X & Y + // Y ^ (Y & X) --> ~X & Y + if (match(Op1, m_OneUse(m_c_And(m_Value(X), m_Specific(Op0))))) + return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(X)); + // (X & Y) ^ Y --> ~X & Y + // (Y & X) ^ Y --> ~X & Y + // Canonical form is (X & C) ^ C; don't touch that. 
+ // TODO: A 'not' op is better for analysis and codegen, but demanded bits must + // be fixed to prefer that (otherwise we get infinite looping). + if (!match(Op1, m_Constant()) && + match(Op0, m_OneUse(m_c_And(m_Value(X), m_Specific(Op1))))) + return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(X)); + + Value *A, *B, *C; + // (A ^ B) ^ (A | C) --> (~A & C) ^ B -- There are 4 commuted variants. + if (match(&I, m_c_Xor(m_OneUse(m_Xor(m_Value(A), m_Value(B))), + m_OneUse(m_c_Or(m_Deferred(A), m_Value(C)))))) + return BinaryOperator::CreateXor( + Builder.CreateAnd(Builder.CreateNot(A), C), B); + + // (A ^ B) ^ (B | C) --> (~B & C) ^ A -- There are 4 commuted variants. + if (match(&I, m_c_Xor(m_OneUse(m_Xor(m_Value(A), m_Value(B))), + m_OneUse(m_c_Or(m_Deferred(B), m_Value(C)))))) + return BinaryOperator::CreateXor( + Builder.CreateAnd(Builder.CreateNot(B), C), A); + + // (A & B) ^ (A ^ B) -> (A | B) + if (match(Op0, m_And(m_Value(A), m_Value(B))) && + match(Op1, m_c_Xor(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateOr(A, B); + // (A ^ B) ^ (A & B) -> (A | B) + if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && + match(Op1, m_c_And(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateOr(A, B); + + // (A & ~B) ^ ~A -> ~(A & B) + // (~B & A) ^ ~A -> ~(A & B) + if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_Not(m_Specific(A)))) + return BinaryOperator::CreateNot(Builder.CreateAnd(A, B)); + + if (auto *LHS = dyn_cast<ICmpInst>(I.getOperand(0))) + if (auto *RHS = dyn_cast<ICmpInst>(I.getOperand(1))) + if (Value *V = foldXorOfICmps(LHS, RHS, I)) + return replaceInstUsesWith(I, V); + + if (Instruction *CastedXor = foldCastedBitwiseLogic(I)) + return CastedXor; + + // Canonicalize a shifty way to code absolute value to the common pattern. + // There are 4 potential commuted variants. Move the 'ashr' candidate to Op1. + // We're relying on the fact that we only do this transform when the shift has + // exactly 2 uses and the add has exactly 1 use (otherwise, we might increase + // instructions). + if (Op0->hasNUses(2)) + std::swap(Op0, Op1); + + const APInt *ShAmt; + Type *Ty = I.getType(); + if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) && + Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 && + match(Op0, m_OneUse(m_c_Add(m_Specific(A), m_Specific(Op1))))) { + // B = ashr i32 A, 31 ; smear the sign bit + // xor (add A, B), B ; add -1 and flip bits if negative + // --> (A < 0) ? -A : A + Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty)); + // Copy the nuw/nsw flags from the add to the negate. + auto *Add = cast<BinaryOperator>(Op0); + Value *Neg = Builder.CreateNeg(A, "", Add->hasNoUnsignedWrap(), + Add->hasNoSignedWrap()); + return SelectInst::Create(Cmp, Neg, A); + } + + // Eliminate a bitwise 'not' op of 'not' min/max by inverting the min/max: + // + // %notx = xor i32 %x, -1 + // %cmp1 = icmp sgt i32 %notx, %y + // %smax = select i1 %cmp1, i32 %notx, i32 %y + // %res = xor i32 %smax, -1 + // => + // %noty = xor i32 %y, -1 + // %cmp2 = icmp slt %x, %noty + // %res = select i1 %cmp2, i32 %x, i32 %noty + // + // Same is applicable for smin/umax/umin. + if (match(Op1, m_AllOnes()) && Op0->hasOneUse()) { + Value *LHS, *RHS; + SelectPatternFlavor SPF = matchSelectPattern(Op0, LHS, RHS).Flavor; + if (SelectPatternResult::isMinOrMax(SPF)) { + // It's possible we get here before the not has been simplified, so make + // sure the input to the not isn't freely invertible. 
+ if (match(LHS, m_Not(m_Value(X))) && !isFreeToInvert(X, X->hasOneUse())) { + Value *NotY = Builder.CreateNot(RHS); + return SelectInst::Create( + Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY); + } + + // It's possible we get here before the not has been simplified, so make + // sure the input to the not isn't freely invertible. + if (match(RHS, m_Not(m_Value(Y))) && !isFreeToInvert(Y, Y->hasOneUse())) { + Value *NotX = Builder.CreateNot(LHS); + return SelectInst::Create( + Builder.CreateICmp(getInverseMinMaxPred(SPF), NotX, Y), NotX, Y); + } + + // If both sides are freely invertible, then we can get rid of the xor + // completely. + if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && + isFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) { + Value *NotLHS = Builder.CreateNot(LHS); + Value *NotRHS = Builder.CreateNot(RHS); + return SelectInst::Create( + Builder.CreateICmp(getInverseMinMaxPred(SPF), NotLHS, NotRHS), + NotLHS, NotRHS); + } + } + } + + if (Instruction *NewXor = sinkNotIntoXor(I, Builder)) + return NewXor; + + return nullptr; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp new file mode 100644 index 000000000000..825f4b468b0a --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -0,0 +1,159 @@ +//===- InstCombineAtomicRMW.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the visit functions for atomic rmw instructions. +// +//===----------------------------------------------------------------------===// +#include "InstCombineInternal.h" +#include "llvm/IR/Instructions.h" + +using namespace llvm; + +namespace { +/// Return true if and only if the given instruction does not modify the memory +/// location referenced. Note that an idemptent atomicrmw may still have +/// ordering effects on nearby instructions, or be volatile. +/// TODO: Common w/ the version in AtomicExpandPass, and change the term used. +/// Idemptotent is confusing in this context. +bool isIdempotentRMW(AtomicRMWInst& RMWI) { + if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand())) + switch(RMWI.getOperation()) { + case AtomicRMWInst::FAdd: // -0.0 + return CF->isZero() && CF->isNegative(); + case AtomicRMWInst::FSub: // +0.0 + return CF->isZero() && !CF->isNegative(); + default: + return false; + }; + + auto C = dyn_cast<ConstantInt>(RMWI.getValOperand()); + if(!C) + return false; + + switch(RMWI.getOperation()) { + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + case AtomicRMWInst::Or: + case AtomicRMWInst::Xor: + return C->isZero(); + case AtomicRMWInst::And: + return C->isMinusOne(); + case AtomicRMWInst::Min: + return C->isMaxValue(true); + case AtomicRMWInst::Max: + return C->isMinValue(true); + case AtomicRMWInst::UMin: + return C->isMaxValue(false); + case AtomicRMWInst::UMax: + return C->isMinValue(false); + default: + return false; + } +} + +/// Return true if the given instruction always produces a value in memory +/// equivalent to its value operand. 
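+/// For example, 'atomicrmw or %p, -1' always leaves all-ones in memory and
+/// 'atomicrmw umin %p, 0' always leaves zero, whatever the prior value was.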
+bool isSaturating(AtomicRMWInst& RMWI) { + if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand())) + switch(RMWI.getOperation()) { + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: + return CF->isNaN(); + default: + return false; + }; + + auto C = dyn_cast<ConstantInt>(RMWI.getValOperand()); + if(!C) + return false; + + switch(RMWI.getOperation()) { + default: + return false; + case AtomicRMWInst::Xchg: + return true; + case AtomicRMWInst::Or: + return C->isAllOnesValue(); + case AtomicRMWInst::And: + return C->isZero(); + case AtomicRMWInst::Min: + return C->isMinValue(true); + case AtomicRMWInst::Max: + return C->isMaxValue(true); + case AtomicRMWInst::UMin: + return C->isMinValue(false); + case AtomicRMWInst::UMax: + return C->isMaxValue(false); + }; +} +} + +Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) { + + // Volatile RMWs perform a load and a store, we cannot replace this by just a + // load or just a store. We chose not to canonicalize out of general paranoia + // about user expectations around volatile. + if (RMWI.isVolatile()) + return nullptr; + + // Any atomicrmw op which produces a known result in memory can be + // replaced w/an atomicrmw xchg. + if (isSaturating(RMWI) && + RMWI.getOperation() != AtomicRMWInst::Xchg) { + RMWI.setOperation(AtomicRMWInst::Xchg); + return &RMWI; + } + + AtomicOrdering Ordering = RMWI.getOrdering(); + assert(Ordering != AtomicOrdering::NotAtomic && + Ordering != AtomicOrdering::Unordered && + "AtomicRMWs don't make sense with Unordered or NotAtomic"); + + // Any atomicrmw xchg with no uses can be converted to a atomic store if the + // ordering is compatible. + if (RMWI.getOperation() == AtomicRMWInst::Xchg && + RMWI.use_empty()) { + if (Ordering != AtomicOrdering::Release && + Ordering != AtomicOrdering::Monotonic) + return nullptr; + auto *SI = new StoreInst(RMWI.getValOperand(), + RMWI.getPointerOperand(), &RMWI); + SI->setAtomic(Ordering, RMWI.getSyncScopeID()); + SI->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType()))); + return eraseInstFromFunction(RMWI); + } + + if (!isIdempotentRMW(RMWI)) + return nullptr; + + // We chose to canonicalize all idempotent operations to an single + // operation code and constant. This makes it easier for the rest of the + // optimizer to match easily. The choices of or w/0 and fadd w/-0.0 are + // arbitrary. + if (RMWI.getType()->isIntegerTy() && + RMWI.getOperation() != AtomicRMWInst::Or) { + RMWI.setOperation(AtomicRMWInst::Or); + RMWI.setOperand(1, ConstantInt::get(RMWI.getType(), 0)); + return &RMWI; + } else if (RMWI.getType()->isFloatingPointTy() && + RMWI.getOperation() != AtomicRMWInst::FAdd) { + RMWI.setOperation(AtomicRMWInst::FAdd); + RMWI.setOperand(1, ConstantFP::getNegativeZero(RMWI.getType())); + return &RMWI; + } + + // Check if the required ordering is compatible with an atomic load. 
+ if (Ordering != AtomicOrdering::Acquire && + Ordering != AtomicOrdering::Monotonic) + return nullptr; + + LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand()); + Load->setAtomic(Ordering, RMWI.getSyncScopeID()); + Load->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType()))); + return Load; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp new file mode 100644 index 000000000000..c650d242cd50 --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -0,0 +1,4826 @@ +//===- InstCombineCalls.cpp -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the visitCall, visitInvoke, and visitCallBr functions. +// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Statepoint.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <utility> +#include <vector> + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "instcombine" + +STATISTIC(NumSimplified, "Number of library calls simplified"); + +static cl::opt<unsigned> GuardWideningWindow( + "instcombine-guard-widening-window", + cl::init(3), + cl::desc("How wide an instruction window to bypass looking for " + "another guard")); + +/// Return the specified type promoted as it would be to pass though a va_arg +/// area. 
+static Type *getPromotedType(Type *Ty) { + if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) { + if (ITy->getBitWidth() < 32) + return Type::getInt32Ty(Ty->getContext()); + } + return Ty; +} + +/// Return a constant boolean vector that has true elements in all positions +/// where the input constant data vector has an element with the sign bit set. +static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { + SmallVector<Constant *, 32> BoolVec; + IntegerType *BoolTy = Type::getInt1Ty(V->getContext()); + for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) { + Constant *Elt = V->getElementAsConstant(I); + assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) && + "Unexpected constant data vector element type"); + bool Sign = V->getElementType()->isIntegerTy() + ? cast<ConstantInt>(Elt)->isNegative() + : cast<ConstantFP>(Elt)->isNegative(); + BoolVec.push_back(ConstantInt::get(BoolTy, Sign)); + } + return ConstantVector::get(BoolVec); +} + +Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { + unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT); + unsigned CopyDstAlign = MI->getDestAlignment(); + if (CopyDstAlign < DstAlign){ + MI->setDestAlignment(DstAlign); + return MI; + } + + unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT); + unsigned CopySrcAlign = MI->getSourceAlignment(); + if (CopySrcAlign < SrcAlign) { + MI->setSourceAlignment(SrcAlign); + return MI; + } + + // If we have a store to a location which is known constant, we can conclude + // that the store must be storing the constant value (else the memory + // wouldn't be constant), and this must be a noop. + if (AA->pointsToConstantMemory(MI->getDest())) { + // Set the size of the copy to 0, it will be deleted on the next iteration. + MI->setLength(Constant::getNullValue(MI->getLength()->getType())); + return MI; + } + + // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with + // load/store. + ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength()); + if (!MemOpLength) return nullptr; + + // Source and destination pointer types are always "i8*" for intrinsic. See + // if the size is something we can handle with a single primitive load/store. + // A single load+store correctly handles overlapping memory in the memmove + // case. + uint64_t Size = MemOpLength->getLimitedValue(); + assert(Size && "0-sized memory transferring should be removed already."); + + if (Size > 8 || (Size&(Size-1))) + return nullptr; // If not 1/2/4/8 bytes, exit. + + // If it is an atomic and alignment is less than the size then we will + // introduce the unaligned memory access which will be later transformed + // into libcall in CodeGen. This is not evident performance gain so disable + // it now. + if (isa<AtomicMemTransferInst>(MI)) + if (CopyDstAlign < Size || CopySrcAlign < Size) + return nullptr; + + // Use an integer load+store unless we can find something better. + unsigned SrcAddrSp = + cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace(); + unsigned DstAddrSp = + cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace(); + + IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3); + Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp); + Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp); + + // If the memcpy has metadata describing the members, see if we can get the + // TBAA tag describing our copy. 
+ MDNode *CopyMD = nullptr; + if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) { + CopyMD = M; + } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) { + if (M->getNumOperands() == 3 && M->getOperand(0) && + mdconst::hasa<ConstantInt>(M->getOperand(0)) && + mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() && + M->getOperand(1) && + mdconst::hasa<ConstantInt>(M->getOperand(1)) && + mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() == + Size && + M->getOperand(2) && isa<MDNode>(M->getOperand(2))) + CopyMD = cast<MDNode>(M->getOperand(2)); + } + + Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy); + Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); + LoadInst *L = Builder.CreateLoad(IntType, Src); + // Alignment from the mem intrinsic will be better, so use it. + L->setAlignment( + MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead. + if (CopyMD) + L->setMetadata(LLVMContext::MD_tbaa, CopyMD); + MDNode *LoopMemParallelMD = + MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access); + if (LoopMemParallelMD) + L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); + MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group); + if (AccessGroupMD) + L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD); + + StoreInst *S = Builder.CreateStore(L, Dest); + // Alignment from the mem intrinsic will be better, so use it. + S->setAlignment( + MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead. + if (CopyMD) + S->setMetadata(LLVMContext::MD_tbaa, CopyMD); + if (LoopMemParallelMD) + S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); + if (AccessGroupMD) + S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD); + + if (auto *MT = dyn_cast<MemTransferInst>(MI)) { + // non-atomics can be volatile + L->setVolatile(MT->isVolatile()); + S->setVolatile(MT->isVolatile()); + } + if (isa<AtomicMemTransferInst>(MI)) { + // atomics have to be unordered + L->setOrdering(AtomicOrdering::Unordered); + S->setOrdering(AtomicOrdering::Unordered); + } + + // Set the size of the copy to 0, it will be deleted on the next iteration. + MI->setLength(Constant::getNullValue(MemOpLength->getType())); + return MI; +} + +Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { + const unsigned KnownAlignment = + getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); + if (MI->getDestAlignment() < KnownAlignment) { + MI->setDestAlignment(KnownAlignment); + return MI; + } + + // If we have a store to a location which is known constant, we can conclude + // that the store must be storing the constant value (else the memory + // wouldn't be constant), and this must be a noop. + if (AA->pointsToConstantMemory(MI->getDest())) { + // Set the size of the copy to 0, it will be deleted on the next iteration. + MI->setLength(Constant::getNullValue(MI->getLength()->getType())); + return MI; + } + + // Extract the length and alignment and fill if they are constant. 
+ ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength()); + ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue()); + if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) + return nullptr; + const uint64_t Len = LenC->getLimitedValue(); + assert(Len && "0-sized memory setting should be removed already."); + const Align Alignment = assumeAligned(MI->getDestAlignment()); + + // If it is an atomic and alignment is less than the size then we will + // introduce the unaligned memory access which will be later transformed + // into libcall in CodeGen. This is not evident performance gain so disable + // it now. + if (isa<AtomicMemSetInst>(MI)) + if (Alignment < Len) + return nullptr; + + // memset(s,c,n) -> store s, c (for n=1,2,4,8) + if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) { + Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8. + + Value *Dest = MI->getDest(); + unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace(); + Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp); + Dest = Builder.CreateBitCast(Dest, NewDstPtrTy); + + // Extract the fill value and store. + uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL; + StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest, + MI->isVolatile()); + S->setAlignment(Alignment); + if (isa<AtomicMemSetInst>(MI)) + S->setOrdering(AtomicOrdering::Unordered); + + // Set the size of the copy to 0, it will be deleted on the next iteration. + MI->setLength(Constant::getNullValue(LenC->getType())); + return MI; + } + + return nullptr; +} + +static Value *simplifyX86immShift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + + switch (II.getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psrai_w_512: + LogicalShift = false; ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrli_w_512: + LogicalShift = true; ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_psll_d: + case 
Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_pslli_w_512: + LogicalShift = true; ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); + + // Simplify if count is constant. + auto Arg1 = II.getArgOperand(1); + auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1); + auto CDV = dyn_cast<ConstantDataVector>(Arg1); + auto CInt = dyn_cast<ConstantInt>(Arg1); + if (!CAZ && !CDV && !CInt) + return nullptr; + + APInt Count(64, 0); + if (CDV) { + // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector + // operand to compute the shift amount. + auto VT = cast<VectorType>(CDV->getType()); + unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits(); + assert((64 % BitWidth) == 0 && "Unexpected packed shift size"); + unsigned NumSubElts = 64 / BitWidth; + + // Concatenate the sub-elements to create the 64-bit value. + for (unsigned i = 0; i != NumSubElts; ++i) { + unsigned SubEltIdx = (NumSubElts - 1) - i; + auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); + Count <<= BitWidth; + Count |= SubElt->getValue().zextOrTrunc(64); + } + } + else if (CInt) + Count = CInt->getValue(); + + auto Vec = II.getArgOperand(0); + auto VT = cast<VectorType>(Vec->getType()); + auto SVT = VT->getElementType(); + unsigned VWidth = VT->getNumElements(); + unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + + // If shift-by-zero then just return the original value. + if (Count.isNullValue()) + return Vec; + + // Handle cases when Shift >= BitWidth. + if (Count.uge(BitWidth)) { + // If LogicalShift - just return zero. + if (LogicalShift) + return ConstantAggregateZero::get(VT); + + // If ArithmeticShift - clamp Shift to (BitWidth - 1). + Count = APInt(64, BitWidth - 1); + } + + // Get a constant vector of the same type as the first operand. + auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); + auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. +// Unlike the generic IR shifts, the intrinsics have defined behaviour for out +// of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 
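+// For example (illustrative), llvm.x86.avx2.psrlv.d on <4 x i32> %x with
+// constant amounts <1, 2, 3, 4> becomes 'lshr <4 x i32> %x, <i32 1, i32 2,
+// i32 3, i32 4>'; a logical shift with only some lanes out of range is left
+// untouched below.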
+static Value *simplifyX86varShift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + + switch (II.getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_512: + case Intrinsic::x86_avx512_psrav_w_128: + case Intrinsic::x86_avx512_psrav_w_256: + case Intrinsic::x86_avx512_psrav_w_512: + LogicalShift = false; + ShiftLeft = false; + break; + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx512_psrlv_w_128: + case Intrinsic::x86_avx512_psrlv_w_256: + case Intrinsic::x86_avx512_psrlv_w_512: + LogicalShift = true; + ShiftLeft = false; + break; + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx512_psllv_w_128: + case Intrinsic::x86_avx512_psllv_w_256: + case Intrinsic::x86_avx512_psllv_w_512: + LogicalShift = true; + ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); + + // Simplify if all shift amounts are constant/undef. + auto *CShift = dyn_cast<Constant>(II.getArgOperand(1)); + if (!CShift) + return nullptr; + + auto Vec = II.getArgOperand(0); + auto VT = cast<VectorType>(II.getType()); + auto SVT = VT->getVectorElementType(); + int NumElts = VT->getNumElements(); + int BitWidth = SVT->getIntegerBitWidth(); + + // Collect each element's shift amount. + // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. + bool AnyOutOfRange = false; + SmallVector<int, 8> ShiftAmts; + for (int I = 0; I < NumElts; ++I) { + auto *CElt = CShift->getAggregateElement(I); + if (CElt && isa<UndefValue>(CElt)) { + ShiftAmts.push_back(-1); + continue; + } + + auto *COp = dyn_cast_or_null<ConstantInt>(CElt); + if (!COp) + return nullptr; + + // Handle out of range shifts. + // If LogicalShift - set to BitWidth (special case). + // If ArithmeticShift - set to (BitWidth - 1) (sign splat). + APInt ShiftVal = COp->getValue(); + if (ShiftVal.uge(BitWidth)) { + AnyOutOfRange = LogicalShift; + ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); + continue; + } + + ShiftAmts.push_back((int)ShiftVal.getZExtValue()); + } + + // If all elements out of range or UNDEF, return vector of zeros/undefs. + // ArithmeticShift should only hit this if they are all UNDEF. + auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; + if (llvm::all_of(ShiftAmts, OutOfRange)) { + SmallVector<Constant *, 8> ConstantVec; + for (int Idx : ShiftAmts) { + if (Idx < 0) { + ConstantVec.push_back(UndefValue::get(SVT)); + } else { + assert(LogicalShift && "Logical shift expected"); + ConstantVec.push_back(ConstantInt::getNullValue(SVT)); + } + } + return ConstantVector::get(ConstantVec); + } + + // We can't handle only some out of range values with generic logical shifts. + if (AnyOutOfRange) + return nullptr; + + // Build the shift amount constant vector. 
+ SmallVector<Constant *, 8> ShiftVecAmts; + for (int Idx : ShiftAmts) { + if (Idx < 0) + ShiftVecAmts.push_back(UndefValue::get(SVT)); + else + ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); + } + auto ShiftVec = ConstantVector::get(ShiftVecAmts); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +static Value *simplifyX86pack(IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, bool IsSigned) { + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + Type *ResTy = II.getType(); + + // Fast all undef handling. + if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) + return UndefValue::get(ResTy); + + Type *ArgTy = Arg0->getType(); + unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; + unsigned NumSrcElts = ArgTy->getVectorNumElements(); + assert(ResTy->getVectorNumElements() == (2 * NumSrcElts) && + "Unexpected packing types"); + + unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; + unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); + unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); + assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && + "Unexpected packing types"); + + // Constant folding. + if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) + return nullptr; + + // Clamp Values - signed/unsigned both use signed clamp values, but they + // differ on the min/max values. + APInt MinValue, MaxValue; + if (IsSigned) { + // PACKSS: Truncate signed value with signed saturation. + // Source values less than dst minint are saturated to minint. + // Source values greater than dst maxint are saturated to maxint. + MinValue = + APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); + MaxValue = + APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); + } else { + // PACKUS: Truncate signed value with unsigned saturation. + // Source values less than zero are saturated to zero. + // Source values greater than dst maxuint are saturated to maxuint. + MinValue = APInt::getNullValue(SrcScalarSizeInBits); + MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); + } + + auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); + auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); + Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); + Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); + Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); + Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); + + // Shuffle clamped args together at the lane level. + SmallVector<unsigned, 32> PackMask; + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) + PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); + for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) + PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); + } + auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); + + // Truncate to dst size. + return Builder.CreateTrunc(Shuffle, ResTy); +} + +static Value *simplifyX86movmsk(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Value *Arg = II.getArgOperand(0); + Type *ResTy = II.getType(); + Type *ArgTy = Arg->getType(); + + // movmsk(undef) -> zero as we must ensure the upper bits are zero. 
+ if (isa<UndefValue>(Arg)) + return Constant::getNullValue(ResTy); + + // We can't easily peek through x86_mmx types. + if (!ArgTy->isVectorTy()) + return nullptr; + + // Expand MOVMSK to compare/bitcast/zext: + // e.g. PMOVMSKB(v16i8 x): + // %cmp = icmp slt <16 x i8> %x, zeroinitializer + // %int = bitcast <16 x i1> %cmp to i16 + // %res = zext i16 %int to i32 + unsigned NumElts = ArgTy->getVectorNumElements(); + Type *IntegerVecTy = VectorType::getInteger(cast<VectorType>(ArgTy)); + Type *IntegerTy = Builder.getIntNTy(NumElts); + + Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); + Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); + Res = Builder.CreateBitCast(Res, IntegerTy); + Res = Builder.CreateZExtOrTrunc(Res, ResTy); + return Res; +} + +static Value *simplifyX86addcarry(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Value *CarryIn = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + Value *Op2 = II.getArgOperand(2); + Type *RetTy = II.getType(); + Type *OpTy = Op1->getType(); + assert(RetTy->getStructElementType(0)->isIntegerTy(8) && + RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && + "Unexpected types for x86 addcarry"); + + // If carry-in is zero, this is just an unsigned add with overflow. + if (match(CarryIn, m_ZeroInt())) { + Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, + { Op1, Op2 }); + // The types have to be adjusted to match the x86 call types. + Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); + Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), + Builder.getInt8Ty()); + Value *Res = UndefValue::get(RetTy); + Res = Builder.CreateInsertValue(Res, UAddOV, 0); + return Builder.CreateInsertValue(Res, UAddResult, 1); + } + + return nullptr; +} + +static Value *simplifyX86insertps(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); + if (!CInt) + return nullptr; + + VectorType *VecTy = cast<VectorType>(II.getType()); + assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); + + // The immediate permute control byte looks like this: + // [3:0] - zero mask for each 32-bit lane + // [5:4] - select one 32-bit destination lane + // [7:6] - select one 32-bit source lane + + uint8_t Imm = CInt->getZExtValue(); + uint8_t ZMask = Imm & 0xf; + uint8_t DestLane = (Imm >> 4) & 0x3; + uint8_t SourceLane = (Imm >> 6) & 0x3; + + ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); + + // If all zero mask bits are set, this was just a weird way to + // generate a zero vector. + if (ZMask == 0xf) + return ZeroVector; + + // Initialize by passing all of the first source bits through. + uint32_t ShuffleMask[4] = { 0, 1, 2, 3 }; + + // We may replace the second operand with the zero vector. + Value *V1 = II.getArgOperand(1); + + if (ZMask) { + // If the zero mask is being used with a single input or the zero mask + // overrides the destination lane, this is a shuffle with the zero vector. + if ((II.getArgOperand(0) == II.getArgOperand(1)) || + (ZMask & (1 << DestLane))) { + V1 = ZeroVector; + // We may still move 32-bits of the first source vector from one lane + // to another. + ShuffleMask[DestLane] = SourceLane; + // The zero mask may override the previous insert operation. + for (unsigned i = 0; i < 4; ++i) + if ((ZMask >> i) & 0x1) + ShuffleMask[i] = i + 4; + } else { + // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? 
+ return nullptr; + } + } else { + // Replace the selected destination lane with the selected source lane. + ShuffleMask[DestLane] = SourceLane + 4; + } + + return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); +} + +/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding +/// or conversion to a shuffle vector. +static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, + ConstantInt *CILength, ConstantInt *CIIndex, + InstCombiner::BuilderTy &Builder) { + auto LowConstantHighUndef = [&](uint64_t Val) { + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + }; + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + ConstantInt *CI0 = + C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + + // Attempt to constant fold. + if (CILength && CIIndex) { + // From AMD documentation: "The bit index and field length are each six + // bits in length other bits of the field are ignored." + APInt APIndex = CIIndex->getValue().zextOrTrunc(6); + APInt APLength = CILength->getValue().zextOrTrunc(6); + + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize EXTRQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + VectorType *ShufTy = VectorType::get(IntTy8, 16); + + SmallVector<Constant *, 16> ShuffleMask; + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + Index))); + for (int i = Length; i != 8; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(UndefValue::get(IntTy32)); + + Value *SV = Builder.CreateShuffleVector( + Builder.CreateBitCast(Op0, ShufTy), + ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask)); + return Builder.CreateBitCast(SV, II.getType()); + } + + // Constant Fold - shift Index'th bit to lowest position and mask off + // Length bits. + if (CI0) { + APInt Elt = CI0->getValue(); + Elt.lshrInPlace(Index); + Elt = Elt.zextOrTrunc(Length); + return LowConstantHighUndef(Elt.getZExtValue()); + } + + // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { + Value *Args[] = {Op0, CILength, CIIndex}; + Module *M = II.getModule(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + return Builder.CreateCall(F, Args); + } + } + + // Constant Fold - extraction from zero is always {zero, undef}. 
+ if (CI0 && CI0->isZero()) + return LowConstantHighUndef(0); + + return nullptr; +} + +/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant +/// folding or conversion to a shuffle vector. +static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, + APInt APLength, APInt APIndex, + InstCombiner::BuilderTy &Builder) { + // From AMD documentation: "The bit index and field length are each six bits + // in length other bits of the field are ignored." + APIndex = APIndex.zextOrTrunc(6); + APLength = APLength.zextOrTrunc(6); + + // Attempt to constant fold. + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize INSERTQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + VectorType *ShufTy = VectorType::get(IntTy8, 16); + + SmallVector<Constant *, 16> ShuffleMask; + for (int i = 0; i != (int)Index; ++i) + ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); + for (int i = Index + Length; i != 8; ++i) + ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(UndefValue::get(IntTy32)); + + Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), + Builder.CreateBitCast(Op1, ShufTy), + ConstantVector::get(ShuffleMask)); + return Builder.CreateBitCast(SV, II.getType()); + } + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI00 = + C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CI10 = + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + + // Constant Fold - insert bottom Length bits starting at the Index'th bit. + if (CI00 && CI10) { + APInt V00 = CI00->getValue(); + APInt V10 = CI10->getValue(); + APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); + V00 = V00 & ~Mask; + V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); + APInt Val = V00 | V10; + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + } + + // If we were an INSERTQ call, we'll save demanded elements if we convert to + // INSERTQI. 
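+  // Converting to the immediate form means the length/index no longer have to
+  // be read from the second source operand, which is what saves demanded bits.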
+ if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Constant *CILength = ConstantInt::get(IntTy8, Length, false); + Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); + + Value *Args[] = {Op0, Op1, CILength, CIIndex}; + Module *M = II.getModule(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return Builder.CreateCall(F, Args); + } + + return nullptr; +} + +/// Attempt to convert pshufb* to shufflevector if the mask is constant. +static Value *simplifyX86pshufb(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast<VectorType>(II.getType()); + auto *MaskEltTy = Type::getInt32Ty(II.getContext()); + unsigned NumElts = VecTy->getNumElements(); + assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && + "Unexpected number of elements in shuffle mask!"); + + // Construct a shuffle mask from constant integers or UNDEFs. + Constant *Indexes[64] = {nullptr}; + + // Each byte in the shuffle control mask forms an index to permute the + // corresponding byte in the destination operand. + for (unsigned I = 0; I < NumElts; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) + return nullptr; + + if (isa<UndefValue>(COp)) { + Indexes[I] = UndefValue::get(MaskEltTy); + continue; + } + + int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); + + // If the most significant bit (bit[7]) of each byte of the shuffle + // control mask is set, then zero is written in the result byte. + // The zero vector is in the right-hand side of the resulting + // shufflevector. + + // The value of each index for the high 128-bit lane is the least + // significant 4 bits of the respective shuffle control byte. + Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0); + Indexes[I] = ConstantInt::get(MaskEltTy, Index); + } + + auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts)); + auto V1 = II.getArgOperand(0); + auto V2 = Constant::getNullValue(VecTy); + return Builder.CreateShuffleVector(V1, V2, ShuffleMask); +} + +/// Attempt to convert vpermilvar* to shufflevector if the mask is constant. +static Value *simplifyX86vpermilvar(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast<VectorType>(II.getType()); + auto *MaskEltTy = Type::getInt32Ty(II.getContext()); + unsigned NumElts = VecTy->getVectorNumElements(); + bool IsPD = VecTy->getScalarType()->isDoubleTy(); + unsigned NumLaneElts = IsPD ? 2 : 4; + assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); + + // Construct a shuffle mask from constant integers or UNDEFs. + Constant *Indexes[16] = {nullptr}; + + // The intrinsics only read one or two bits, clear the rest. + for (unsigned I = 0; I < NumElts; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) + return nullptr; + + if (isa<UndefValue>(COp)) { + Indexes[I] = UndefValue::get(MaskEltTy); + continue; + } + + APInt Index = cast<ConstantInt>(COp)->getValue(); + Index = Index.zextOrTrunc(32).getLoBits(2); + + // The PD variants uses bit 1 to select per-lane element index, so + // shift down to convert to generic shuffle mask index. 
+ if (IsPD) + Index.lshrInPlace(1); + + // The _256 variants are a bit trickier since the mask bits always index + // into the corresponding 128 half. In order to convert to a generic + // shuffle, we have to make that explicit. + Index += APInt(32, (I / NumLaneElts) * NumLaneElts); + + Indexes[I] = ConstantInt::get(MaskEltTy, Index); + } + + auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts)); + auto V1 = II.getArgOperand(0); + auto V2 = UndefValue::get(V1->getType()); + return Builder.CreateShuffleVector(V1, V2, ShuffleMask); +} + +/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. +static Value *simplifyX86vpermv(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + auto *V = dyn_cast<Constant>(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast<VectorType>(II.getType()); + auto *MaskEltTy = Type::getInt32Ty(II.getContext()); + unsigned Size = VecTy->getNumElements(); + assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && + "Unexpected shuffle mask size"); + + // Construct a shuffle mask from constant integers or UNDEFs. + Constant *Indexes[64] = {nullptr}; + + for (unsigned I = 0; I < Size; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) + return nullptr; + + if (isa<UndefValue>(COp)) { + Indexes[I] = UndefValue::get(MaskEltTy); + continue; + } + + uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); + Index &= Size - 1; + Indexes[I] = ConstantInt::get(MaskEltTy, Index); + } + + auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size)); + auto V1 = II.getArgOperand(0); + auto V2 = UndefValue::get(VecTy); + return Builder.CreateShuffleVector(V1, V2, ShuffleMask); +} + +// TODO, Obvious Missing Transforms: +// * Narrow width by halfs excluding zero/undef lanes +Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) { + Value *LoadPtr = II.getArgOperand(0); + unsigned Alignment = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue(); + + // If the mask is all ones or undefs, this is a plain vector load of the 1st + // argument. + if (maskIsAllOneOrUndef(II.getArgOperand(2))) + return Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment, + "unmaskedload"); + + // If we can unconditionally load from this address, replace with a + // load/select idiom. TODO: use DT for context sensitive query + if (isDereferenceableAndAlignedPointer( + LoadPtr, II.getType(), MaybeAlign(Alignment), + II.getModule()->getDataLayout(), &II, nullptr)) { + Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment, + "unmaskedload"); + return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3)); + } + + return nullptr; +} + +// TODO, Obvious Missing Transforms: +// * Single constant active lane -> store +// * Narrow width by halfs excluding zero/undef lanes +Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) { + auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); + if (!ConstMask) + return nullptr; + + // If the mask is all zeros, this instruction does nothing. + if (ConstMask->isNullValue()) + return eraseInstFromFunction(II); + + // If the mask is all ones, this is a plain vector store of the 1st argument. 
+ if (ConstMask->isAllOnesValue()) { + Value *StorePtr = II.getArgOperand(1); + MaybeAlign Alignment( + cast<ConstantInt>(II.getArgOperand(2))->getZExtValue()); + return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment); + } + + // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts + APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask); + APInt UndefElts(DemandedElts.getBitWidth(), 0); + if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0), + DemandedElts, UndefElts)) { + II.setOperand(0, V); + return &II; + } + + return nullptr; +} + +// TODO, Obvious Missing Transforms: +// * Single constant active lane load -> load +// * Dereferenceable address & few lanes -> scalarize speculative load/selects +// * Adjacent vector addresses -> masked.load +// * Narrow width by halfs excluding zero/undef lanes +// * Vector splat address w/known mask -> scalar load +// * Vector incrementing address -> vector masked load +Instruction *InstCombiner::simplifyMaskedGather(IntrinsicInst &II) { + return nullptr; +} + +// TODO, Obvious Missing Transforms: +// * Single constant active lane -> store +// * Adjacent vector addresses -> masked.store +// * Narrow store width by halfs excluding zero/undef lanes +// * Vector splat address w/known mask -> scalar store +// * Vector incrementing address -> vector masked store +Instruction *InstCombiner::simplifyMaskedScatter(IntrinsicInst &II) { + auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); + if (!ConstMask) + return nullptr; + + // If the mask is all zeros, a scatter does nothing. + if (ConstMask->isNullValue()) + return eraseInstFromFunction(II); + + // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts + APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask); + APInt UndefElts(DemandedElts.getBitWidth(), 0); + if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0), + DemandedElts, UndefElts)) { + II.setOperand(0, V); + return &II; + } + if (Value *V = SimplifyDemandedVectorElts(II.getOperand(1), + DemandedElts, UndefElts)) { + II.setOperand(1, V); + return &II; + } + + return nullptr; +} + +/// This function transforms launder.invariant.group and strip.invariant.group +/// like: +/// launder(launder(%x)) -> launder(%x) (the result is not the argument) +/// launder(strip(%x)) -> launder(%x) +/// strip(strip(%x)) -> strip(%x) (the result is not the argument) +/// strip(launder(%x)) -> strip(%x) +/// This is legal because it preserves the most recent information about +/// the presence or absence of invariant.group. +static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II, + InstCombiner &IC) { + auto *Arg = II.getArgOperand(0); + auto *StrippedArg = Arg->stripPointerCasts(); + auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups(); + if (StrippedArg == StrippedInvariantGroupsArg) + return nullptr; // No launders/strips to remove. 
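+ // Otherwise rebuild a single launder/strip over the fully stripped pointer;
+ // the casts below reconcile any address-space or pointer-type difference
+ // with II's type.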
+ + Value *Result = nullptr; + + if (II.getIntrinsicID() == Intrinsic::launder_invariant_group) + Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg); + else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group) + Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg); + else + llvm_unreachable( + "simplifyInvariantGroupIntrinsic only handles launder and strip"); + if (Result->getType()->getPointerAddressSpace() != + II.getType()->getPointerAddressSpace()) + Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType()); + if (Result->getType() != II.getType()) + Result = IC.Builder.CreateBitCast(Result, II.getType()); + + return cast<Instruction>(Result); +} + +static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { + assert((II.getIntrinsicID() == Intrinsic::cttz || + II.getIntrinsicID() == Intrinsic::ctlz) && + "Expected cttz or ctlz intrinsic"); + bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz; + Value *Op0 = II.getArgOperand(0); + Value *X; + // ctlz(bitreverse(x)) -> cttz(x) + // cttz(bitreverse(x)) -> ctlz(x) + if (match(Op0, m_BitReverse(m_Value(X)))) { + Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz; + Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType()); + return CallInst::Create(F, {X, II.getArgOperand(1)}); + } + + if (IsTZ) { + // cttz(-x) -> cttz(x) + if (match(Op0, m_Neg(m_Value(X)))) { + II.setOperand(0, X); + return &II; + } + + // cttz(abs(x)) -> cttz(x) + // cttz(nabs(x)) -> cttz(x) + Value *Y; + SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor; + if (SPF == SPF_ABS || SPF == SPF_NABS) { + II.setOperand(0, X); + return &II; + } + } + + KnownBits Known = IC.computeKnownBits(Op0, 0, &II); + + // Create a mask for bits above (ctlz) or below (cttz) the first known one. + unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros() + : Known.countMaxLeadingZeros(); + unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros() + : Known.countMinLeadingZeros(); + + // If all bits above (ctlz) or below (cttz) the first known one are known + // zero, this value is constant. + // FIXME: This should be in InstSimplify because we're replacing an + // instruction with a constant. + if (PossibleZeros == DefiniteZeros) { + auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros); + return IC.replaceInstUsesWith(II, C); + } + + // If the input to cttz/ctlz is known to be non-zero, + // then change the 'ZeroIsUndef' parameter to 'true' + // because we know the zero behavior can't affect the result. + if (!Known.One.isNullValue() || + isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II, + &IC.getDominatorTree())) { + if (!match(II.getArgOperand(1), m_One())) { + II.setOperand(1, IC.Builder.getTrue()); + return &II; + } + } + + // Add range metadata since known bits can't completely reflect what we know. + // TODO: Handle splat vectors. 
+ auto *IT = dyn_cast<IntegerType>(Op0->getType());
+ if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
+ ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
+ II.setMetadata(LLVMContext::MD_range,
+ MDNode::get(II.getContext(), LowAndHigh));
+ return &II;
+ }
+
+ return nullptr;
+}
+
+static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) {
+ assert(II.getIntrinsicID() == Intrinsic::ctpop &&
+ "Expected ctpop intrinsic");
+ Value *Op0 = II.getArgOperand(0);
+ Value *X;
+ // ctpop(bitreverse(x)) -> ctpop(x)
+ // ctpop(bswap(x)) -> ctpop(x)
+ if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X)))) {
+ II.setOperand(0, X);
+ return &II;
+ }
+
+ // FIXME: Try to simplify vectors of integers.
+ auto *IT = dyn_cast<IntegerType>(Op0->getType());
+ if (!IT)
+ return nullptr;
+
+ unsigned BitWidth = IT->getBitWidth();
+ KnownBits Known(BitWidth);
+ IC.computeKnownBits(Op0, Known, 0, &II);
+
+ unsigned MinCount = Known.countMinPopulation();
+ unsigned MaxCount = Known.countMaxPopulation();
+
+ // Add range metadata since known bits can't completely reflect what we know.
+ if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
+ ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
+ II.setMetadata(LLVMContext::MD_range,
+ MDNode::get(II.getContext(), LowAndHigh));
+ return &II;
+ }
+
+ return nullptr;
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
+static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Constant *ZeroVec = Constant::getNullValue(II.getType());
+
+ // Special case a zero mask since that's not a ConstantDataVector.
+ // This masked load instruction creates a zero vector.
+ if (isa<ConstantAggregateZero>(Mask))
+ return IC.replaceInstUsesWith(II, ZeroVec);
+
+ auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
+ if (!ConstMask)
+ return nullptr;
+
+ // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
+ // to allow target-independent optimizations.
+
+ // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
+ // the LLVM intrinsic definition for the pointer argument.
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
+ Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ // Second, convert the x86 XMM integer vector mask to a vector of bools based
+ // on each element's most significant bit (the sign bit).
+ Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
+
+ // The pass-through vector for an x86 masked load is a zero vector.
+ CallInst *NewMaskedLoad =
+ IC.Builder.CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec);
+ return IC.replaceInstUsesWith(II, NewMaskedLoad);
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
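+// Sketch of the rewrite performed below, assuming a constant mask:
+//   x86 maskstore(%ptr, <N x iM> %mask, <N x T> %v)
+//     --> llvm.masked.store(%v, bitcast %ptr, align 1, sign-bits-of(%mask))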
+static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Value *Vec = II.getOperand(2);
+
+ // Special case a zero mask since that's not a ConstantDataVector:
+ // this masked store instruction does nothing.
+ if (isa<ConstantAggregateZero>(Mask)) {
+ IC.eraseInstFromFunction(II);
+ return true;
+ }
+
+ // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
+ // anything else at this level.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
+ return false;
+
+ auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
+ if (!ConstMask)
+ return false;
+
+ // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
+ // to allow target-independent optimizations.
+
+ // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
+ // the LLVM intrinsic definition for the pointer argument.
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
+ Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ // Second, convert the x86 XMM integer vector mask to a vector of bools based
+ // on each element's most significant bit (the sign bit).
+ Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
+
+ IC.Builder.CreateMaskedStore(Vec, PtrCast, 1, BoolMask);
+
+ // 'Replace uses' doesn't work for stores. Erase the original masked store.
+ IC.eraseInstFromFunction(II);
+ return true;
+}
+
+// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
+//
+// A single NaN input is folded to minnum, so we rely on that folding for
+// handling NaNs.
+static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
+ const APFloat &Src2) {
+ APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
+
+ APFloat::cmpResult Cmp0 = Max3.compare(Src0);
+ assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
+ if (Cmp0 == APFloat::cmpEqual)
+ return maxnum(Src1, Src2);
+
+ APFloat::cmpResult Cmp1 = Max3.compare(Src1);
+ assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
+ if (Cmp1 == APFloat::cmpEqual)
+ return maxnum(Src0, Src2);
+
+ return maxnum(Src0, Src1);
+}
+
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+static Value *simplifyNeonTbl1(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ // Bail out if the mask is not a constant.
+ auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!C)
+ return nullptr;
+
+ auto *VecTy = cast<VectorType>(II.getType());
+ unsigned NumElts = VecTy->getNumElements();
+
+ // Only perform this transformation for <8 x i8> vector types.
+ if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+ return nullptr;
+
+ uint32_t Indexes[8];
+
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = C->getAggregateElement(I);
+
+ if (!COp || !isa<ConstantInt>(COp))
+ return nullptr;
+
+ Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+
+ // Make sure the mask indices are in range.
+ if (Indexes[I] >= NumElts) + return nullptr; + } + + auto *ShuffleMask = ConstantDataVector::get(II.getContext(), + makeArrayRef(Indexes)); + auto *V1 = II.getArgOperand(0); + auto *V2 = Constant::getNullValue(V1->getType()); + return Builder.CreateShuffleVector(V1, V2, ShuffleMask); +} + +/// Convert a vector load intrinsic into a simple llvm load instruction. +/// This is beneficial when the underlying object being addressed comes +/// from a constant, since we get constant-folding for free. +static Value *simplifyNeonVld1(const IntrinsicInst &II, + unsigned MemAlign, + InstCombiner::BuilderTy &Builder) { + auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1)); + + if (!IntrAlign) + return nullptr; + + unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ? + MemAlign : IntrAlign->getLimitedValue(); + + if (!isPowerOf2_32(Alignment)) + return nullptr; + + auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), + PointerType::get(II.getType(), 0)); + return Builder.CreateAlignedLoad(II.getType(), BCastInst, Alignment); +} + +// Returns true iff the 2 intrinsics have the same operands, limiting the +// comparison to the first NumOperands. +static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E, + unsigned NumOperands) { + assert(I.getNumArgOperands() >= NumOperands && "Not enough operands"); + assert(E.getNumArgOperands() >= NumOperands && "Not enough operands"); + for (unsigned i = 0; i < NumOperands; i++) + if (I.getArgOperand(i) != E.getArgOperand(i)) + return false; + return true; +} + +// Remove trivially empty start/end intrinsic ranges, i.e. a start +// immediately followed by an end (ignoring debuginfo or other +// start/end intrinsics in between). As this handles only the most trivial +// cases, tracking the nesting level is not needed: +// +// call @llvm.foo.start(i1 0) ; &I +// call @llvm.foo.start(i1 0) +// call @llvm.foo.end(i1 0) ; This one will not be skipped: it will be removed +// call @llvm.foo.end(i1 0) +static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID, + unsigned EndID, InstCombiner &IC) { + assert(I.getIntrinsicID() == StartID && + "Start intrinsic does not have expected ID"); + BasicBlock::iterator BI(I), BE(I.getParent()->end()); + for (++BI; BI != BE; ++BI) { + if (auto *E = dyn_cast<IntrinsicInst>(BI)) { + if (isa<DbgInfoIntrinsic>(E) || E->getIntrinsicID() == StartID) + continue; + if (E->getIntrinsicID() == EndID && + haveSameOperands(I, *E, E->getNumArgOperands())) { + IC.eraseInstFromFunction(*E); + IC.eraseInstFromFunction(I); + return true; + } + } + break; + } + + return false; +} + +// Convert NVVM intrinsics to target-generic LLVM code where possible. +static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) { + // Each NVVM intrinsic we can simplify can be replaced with one of: + // + // * an LLVM intrinsic, + // * an LLVM cast operation, + // * an LLVM binary operation, or + // * ad-hoc LLVM IR for the particular operation. + + // Some transformations are only valid when the module's + // flush-denormals-to-zero (ftz) setting is true/false, whereas other + // transformations are valid regardless of the module's ftz setting. + enum FtzRequirementTy { + FTZ_Any, // Any ftz setting is ok. + FTZ_MustBeOn, // Transformation is valid only if ftz is on. + FTZ_MustBeOff, // Transformation is valid only if ftz is off. 
+ }; + // Classes of NVVM intrinsics that can't be replaced one-to-one with a + // target-generic intrinsic, cast op, or binary op but that we can nonetheless + // simplify. + enum SpecialCase { + SPC_Reciprocal, + }; + + // SimplifyAction is a poor-man's variant (plus an additional flag) that + // represents how to replace an NVVM intrinsic with target-generic LLVM IR. + struct SimplifyAction { + // Invariant: At most one of these Optionals has a value. + Optional<Intrinsic::ID> IID; + Optional<Instruction::CastOps> CastOp; + Optional<Instruction::BinaryOps> BinaryOp; + Optional<SpecialCase> Special; + + FtzRequirementTy FtzRequirement = FTZ_Any; + + SimplifyAction() = default; + + SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) + : IID(IID), FtzRequirement(FtzReq) {} + + // Cast operations don't have anything to do with FTZ, so we skip that + // argument. + SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} + + SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) + : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} + + SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) + : Special(Special), FtzRequirement(FtzReq) {} + }; + + // Try to generate a SimplifyAction describing how to replace our + // IntrinsicInstr with target-generic LLVM IR. + const SimplifyAction Action = [II]() -> SimplifyAction { + switch (II->getIntrinsicID()) { + // NVVM intrinsics that map directly to LLVM intrinsics. + case Intrinsic::nvvm_ceil_d: + return {Intrinsic::ceil, FTZ_Any}; + case Intrinsic::nvvm_ceil_f: + return {Intrinsic::ceil, FTZ_MustBeOff}; + case Intrinsic::nvvm_ceil_ftz_f: + return {Intrinsic::ceil, FTZ_MustBeOn}; + case Intrinsic::nvvm_fabs_d: + return {Intrinsic::fabs, FTZ_Any}; + case Intrinsic::nvvm_fabs_f: + return {Intrinsic::fabs, FTZ_MustBeOff}; + case Intrinsic::nvvm_fabs_ftz_f: + return {Intrinsic::fabs, FTZ_MustBeOn}; + case Intrinsic::nvvm_floor_d: + return {Intrinsic::floor, FTZ_Any}; + case Intrinsic::nvvm_floor_f: + return {Intrinsic::floor, FTZ_MustBeOff}; + case Intrinsic::nvvm_floor_ftz_f: + return {Intrinsic::floor, FTZ_MustBeOn}; + case Intrinsic::nvvm_fma_rn_d: + return {Intrinsic::fma, FTZ_Any}; + case Intrinsic::nvvm_fma_rn_f: + return {Intrinsic::fma, FTZ_MustBeOff}; + case Intrinsic::nvvm_fma_rn_ftz_f: + return {Intrinsic::fma, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmax_d: + return {Intrinsic::maxnum, FTZ_Any}; + case Intrinsic::nvvm_fmax_f: + return {Intrinsic::maxnum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmax_ftz_f: + return {Intrinsic::maxnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmin_d: + return {Intrinsic::minnum, FTZ_Any}; + case Intrinsic::nvvm_fmin_f: + return {Intrinsic::minnum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmin_ftz_f: + return {Intrinsic::minnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_round_d: + return {Intrinsic::round, FTZ_Any}; + case Intrinsic::nvvm_round_f: + return {Intrinsic::round, FTZ_MustBeOff}; + case Intrinsic::nvvm_round_ftz_f: + return {Intrinsic::round, FTZ_MustBeOn}; + case Intrinsic::nvvm_sqrt_rn_d: + return {Intrinsic::sqrt, FTZ_Any}; + case Intrinsic::nvvm_sqrt_f: + // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the + // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts + // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are + // the versions with explicit ftz-ness. 
+ return {Intrinsic::sqrt, FTZ_Any}; + case Intrinsic::nvvm_sqrt_rn_f: + return {Intrinsic::sqrt, FTZ_MustBeOff}; + case Intrinsic::nvvm_sqrt_rn_ftz_f: + return {Intrinsic::sqrt, FTZ_MustBeOn}; + case Intrinsic::nvvm_trunc_d: + return {Intrinsic::trunc, FTZ_Any}; + case Intrinsic::nvvm_trunc_f: + return {Intrinsic::trunc, FTZ_MustBeOff}; + case Intrinsic::nvvm_trunc_ftz_f: + return {Intrinsic::trunc, FTZ_MustBeOn}; + + // NVVM intrinsics that map to LLVM cast operations. + // + // Note that llvm's target-generic conversion operators correspond to the rz + // (round to zero) versions of the nvvm conversion intrinsics, even though + // most everything else here uses the rn (round to nearest even) nvvm ops. + case Intrinsic::nvvm_d2i_rz: + case Intrinsic::nvvm_f2i_rz: + case Intrinsic::nvvm_d2ll_rz: + case Intrinsic::nvvm_f2ll_rz: + return {Instruction::FPToSI}; + case Intrinsic::nvvm_d2ui_rz: + case Intrinsic::nvvm_f2ui_rz: + case Intrinsic::nvvm_d2ull_rz: + case Intrinsic::nvvm_f2ull_rz: + return {Instruction::FPToUI}; + case Intrinsic::nvvm_i2d_rz: + case Intrinsic::nvvm_i2f_rz: + case Intrinsic::nvvm_ll2d_rz: + case Intrinsic::nvvm_ll2f_rz: + return {Instruction::SIToFP}; + case Intrinsic::nvvm_ui2d_rz: + case Intrinsic::nvvm_ui2f_rz: + case Intrinsic::nvvm_ull2d_rz: + case Intrinsic::nvvm_ull2f_rz: + return {Instruction::UIToFP}; + + // NVVM intrinsics that map to LLVM binary ops. + case Intrinsic::nvvm_add_rn_d: + return {Instruction::FAdd, FTZ_Any}; + case Intrinsic::nvvm_add_rn_f: + return {Instruction::FAdd, FTZ_MustBeOff}; + case Intrinsic::nvvm_add_rn_ftz_f: + return {Instruction::FAdd, FTZ_MustBeOn}; + case Intrinsic::nvvm_mul_rn_d: + return {Instruction::FMul, FTZ_Any}; + case Intrinsic::nvvm_mul_rn_f: + return {Instruction::FMul, FTZ_MustBeOff}; + case Intrinsic::nvvm_mul_rn_ftz_f: + return {Instruction::FMul, FTZ_MustBeOn}; + case Intrinsic::nvvm_div_rn_d: + return {Instruction::FDiv, FTZ_Any}; + case Intrinsic::nvvm_div_rn_f: + return {Instruction::FDiv, FTZ_MustBeOff}; + case Intrinsic::nvvm_div_rn_ftz_f: + return {Instruction::FDiv, FTZ_MustBeOn}; + + // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but + // need special handling. + // + // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just + // as well. + case Intrinsic::nvvm_rcp_rn_d: + return {SPC_Reciprocal, FTZ_Any}; + case Intrinsic::nvvm_rcp_rn_f: + return {SPC_Reciprocal, FTZ_MustBeOff}; + case Intrinsic::nvvm_rcp_rn_ftz_f: + return {SPC_Reciprocal, FTZ_MustBeOn}; + + // We do not currently simplify intrinsics that give an approximate answer. + // These include: + // + // - nvvm_cos_approx_{f,ftz_f} + // - nvvm_ex2_approx_{d,f,ftz_f} + // - nvvm_lg2_approx_{d,f,ftz_f} + // - nvvm_sin_approx_{f,ftz_f} + // - nvvm_sqrt_approx_{f,ftz_f} + // - nvvm_rsqrt_approx_{d,f,ftz_f} + // - nvvm_div_approx_{ftz_d,ftz_f,f} + // - nvvm_rcp_approx_ftz_d + // + // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" + // means that fastmath is enabled in the intrinsic. Unfortunately only + // binary operators (currently) have a fastmath bit in SelectionDAG, so this + // information gets lost and we can't select on it. + // + // TODO: div and rcp are lowered to a binary op, so these we could in theory + // lower them to "fast fdiv". + + default: + return {}; + } + }(); + + // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we + // can bail out now. 
(Notice that in the case that IID is not an NVVM + // intrinsic, we don't have to look up any module metadata, as + // FtzRequirementTy will be FTZ_Any.) + if (Action.FtzRequirement != FTZ_Any) { + bool FtzEnabled = + II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() == + "true"; + + if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) + return nullptr; + } + + // Simplify to target-generic intrinsic. + if (Action.IID) { + SmallVector<Value *, 4> Args(II->arg_operands()); + // All the target-generic intrinsics currently of interest to us have one + // type argument, equal to that of the nvvm intrinsic's argument. + Type *Tys[] = {II->getArgOperand(0)->getType()}; + return CallInst::Create( + Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); + } + + // Simplify to target-generic binary op. + if (Action.BinaryOp) + return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), + II->getArgOperand(1), II->getName()); + + // Simplify to target-generic cast op. + if (Action.CastOp) + return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), + II->getName()); + + // All that's left are the special cases. + if (!Action.Special) + return nullptr; + + switch (*Action.Special) { + case SPC_Reciprocal: + // Simplify reciprocal. + return BinaryOperator::Create( + Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), + II->getArgOperand(0), II->getName()); + } + llvm_unreachable("All SpecialCase enumerators should be handled in switch."); +} + +Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) { + removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this); + return nullptr; +} + +Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) { + removeTriviallyEmptyRange(I, Intrinsic::vacopy, Intrinsic::vaend, *this); + return nullptr; +} + +static Instruction *canonicalizeConstantArg0ToArg1(CallInst &Call) { + assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap"); + Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1); + if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) { + Call.setArgOperand(0, Arg1); + Call.setArgOperand(1, Arg0); + return &Call; + } + return nullptr; +} + +Instruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) { + WithOverflowInst *WO = cast<WithOverflowInst>(II); + Value *OperationResult = nullptr; + Constant *OverflowResult = nullptr; + if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(), + WO->getRHS(), *WO, OperationResult, OverflowResult)) + return CreateOverflowTuple(WO, OperationResult, OverflowResult); + return nullptr; +} + +/// CallInst simplification. This mostly only handles folding of intrinsic +/// instructions. For normal calls, it allows visitCallBase to do the heavy +/// lifting. +Instruction *InstCombiner::visitCallInst(CallInst &CI) { + if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI))) + return replaceInstUsesWith(CI, V); + + if (isFreeCall(&CI, &TLI)) + return visitFree(CI); + + // If the caller function is nounwind, mark the call as nounwind, even if the + // callee isn't. + if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) { + CI.setDoesNotThrow(); + return &CI; + } + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI); + if (!II) return visitCallBase(CI); + + // Intrinsics cannot occur in an invoke or a callbr, so handle them here + // instead of in visitCallBase. 
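+ // Handle memset/memcpy/memmove (including their element-atomic variants)
+ // before the intrinsic-specific switch below.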
+ if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) { + bool Changed = false; + + // memmove/cpy/set of zero bytes is a noop. + if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) { + if (NumBytes->isNullValue()) + return eraseInstFromFunction(CI); + + if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) + if (CI->getZExtValue() == 1) { + // Replace the instruction with just byte operations. We would + // transform other cases to loads/stores, but we don't know if + // alignment is sufficient. + } + } + + // No other transformations apply to volatile transfers. + if (auto *M = dyn_cast<MemIntrinsic>(MI)) + if (M->isVolatile()) + return nullptr; + + // If we have a memmove and the source operation is a constant global, + // then the source and dest pointers can't alias, so we can change this + // into a call to memcpy. + if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) { + if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) + if (GVSrc->isConstant()) { + Module *M = CI.getModule(); + Intrinsic::ID MemCpyID = + isa<AtomicMemMoveInst>(MMI) + ? Intrinsic::memcpy_element_unordered_atomic + : Intrinsic::memcpy; + Type *Tys[3] = { CI.getArgOperand(0)->getType(), + CI.getArgOperand(1)->getType(), + CI.getArgOperand(2)->getType() }; + CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys)); + Changed = true; + } + } + + if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) { + // memmove(x,x,size) -> noop. + if (MTI->getSource() == MTI->getDest()) + return eraseInstFromFunction(CI); + } + + // If we can determine a pointer alignment that is bigger than currently + // set, update the alignment. + if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) { + if (Instruction *I = SimplifyAnyMemTransfer(MTI)) + return I; + } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) { + if (Instruction *I = SimplifyAnyMemSet(MSI)) + return I; + } + + if (Changed) return II; + } + + // For vector result intrinsics, use the generic demanded vector support. 
+ if (II->getType()->isVectorTy()) { + auto VWidth = II->getType()->getVectorNumElements(); + APInt UndefElts(VWidth, 0); + APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); + if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) { + if (V != II) + return replaceInstUsesWith(*II, V); + return II; + } + } + + if (Instruction *I = SimplifyNVVMIntrinsic(II, *this)) + return I; + + auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, + unsigned DemandedWidth) { + APInt UndefElts(Width, 0); + APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); + return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); + }; + + Intrinsic::ID IID = II->getIntrinsicID(); + switch (IID) { + default: break; + case Intrinsic::objectsize: + if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false)) + return replaceInstUsesWith(CI, V); + return nullptr; + case Intrinsic::bswap: { + Value *IIOperand = II->getArgOperand(0); + Value *X = nullptr; + + // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) + if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { + unsigned C = X->getType()->getPrimitiveSizeInBits() - + IIOperand->getType()->getPrimitiveSizeInBits(); + Value *CV = ConstantInt::get(X->getType(), C); + Value *V = Builder.CreateLShr(X, CV); + return new TruncInst(V, IIOperand->getType()); + } + break; + } + case Intrinsic::masked_load: + if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II)) + return replaceInstUsesWith(CI, SimplifiedMaskedOp); + break; + case Intrinsic::masked_store: + return simplifyMaskedStore(*II); + case Intrinsic::masked_gather: + return simplifyMaskedGather(*II); + case Intrinsic::masked_scatter: + return simplifyMaskedScatter(*II); + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this)) + return replaceInstUsesWith(*II, SkippedBarrier); + break; + case Intrinsic::powi: + if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + // 0 and 1 are handled in instsimplify + + // powi(x, -1) -> 1/x + if (Power->isMinusOne()) + return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0), + II->getArgOperand(0)); + // powi(x, 2) -> x*x + if (Power->equalsInt(2)) + return BinaryOperator::CreateFMul(II->getArgOperand(0), + II->getArgOperand(0)); + } + break; + + case Intrinsic::cttz: + case Intrinsic::ctlz: + if (auto *I = foldCttzCtlz(*II, *this)) + return I; + break; + + case Intrinsic::ctpop: + if (auto *I = foldCtpop(*II, *this)) + return I; + break; + + case Intrinsic::fshl: + case Intrinsic::fshr: { + Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1); + Type *Ty = II->getType(); + unsigned BitWidth = Ty->getScalarSizeInBits(); + Constant *ShAmtC; + if (match(II->getArgOperand(2), m_Constant(ShAmtC)) && + !isa<ConstantExpr>(ShAmtC) && !ShAmtC->containsConstantExpression()) { + // Canonicalize a shift amount constant operand to modulo the bit-width. + Constant *WidthC = ConstantInt::get(Ty, BitWidth); + Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC); + if (ModuloC != ShAmtC) { + II->setArgOperand(2, ModuloC); + return II; + } + assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) == + ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) && + "Shift amount expected to be modulo bitwidth"); + + // Canonicalize funnel shift right by constant to funnel shift left. This + // is not entirely arbitrary. 
For historical reasons, the backend may + // recognize rotate left patterns but miss rotate right patterns. + if (IID == Intrinsic::fshr) { + // fshr X, Y, C --> fshl X, Y, (BitWidth - C) + Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC); + Module *Mod = II->getModule(); + Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty); + return CallInst::Create(Fshl, { Op0, Op1, LeftShiftC }); + } + assert(IID == Intrinsic::fshl && + "All funnel shifts by simple constants should go left"); + + // fshl(X, 0, C) --> shl X, C + // fshl(X, undef, C) --> shl X, C + if (match(Op1, m_ZeroInt()) || match(Op1, m_Undef())) + return BinaryOperator::CreateShl(Op0, ShAmtC); + + // fshl(0, X, C) --> lshr X, (BW-C) + // fshl(undef, X, C) --> lshr X, (BW-C) + if (match(Op0, m_ZeroInt()) || match(Op0, m_Undef())) + return BinaryOperator::CreateLShr(Op1, + ConstantExpr::getSub(WidthC, ShAmtC)); + + // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form) + if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) { + Module *Mod = II->getModule(); + Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty); + return CallInst::Create(Bswap, { Op0 }); + } + } + + // Left or right might be masked. + if (SimplifyDemandedInstructionBits(*II)) + return &CI; + + // The shift amount (operand 2) of a funnel shift is modulo the bitwidth, + // so only the low bits of the shift amount are demanded if the bitwidth is + // a power-of-2. + if (!isPowerOf2_32(BitWidth)) + break; + APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth)); + KnownBits Op2Known(BitWidth); + if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known)) + return &CI; + break; + } + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: { + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; + if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) + return I; + + // Given 2 constant operands whose sum does not overflow: + // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1 + // saddo (X +nsw C0), C1 -> saddo X, C0 + C1 + Value *X; + const APInt *C0, *C1; + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + bool IsSigned = IID == Intrinsic::sadd_with_overflow; + bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0))) + : match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0))); + if (HasNWAdd && match(Arg1, m_APInt(C1))) { + bool Overflow; + APInt NewC = + IsSigned ? C1->sadd_ov(*C0, Overflow) : C1->uadd_ov(*C0, Overflow); + if (!Overflow) + return replaceInstUsesWith( + *II, Builder.CreateBinaryIntrinsic( + IID, X, ConstantInt::get(Arg1->getType(), NewC))); + } + break; + } + + case Intrinsic::umul_with_overflow: + case Intrinsic::smul_with_overflow: + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; + LLVM_FALLTHROUGH; + + case Intrinsic::usub_with_overflow: + if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) + return I; + break; + + case Intrinsic::ssub_with_overflow: { + if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) + return I; + + Constant *C; + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + // Given a constant C that is not the minimum signed value + // for an integer of a given bit width: + // + // ssubo X, C -> saddo X, -C + if (match(Arg1, m_Constant(C)) && C->isNotMinSignedValue()) { + Value *NegVal = ConstantExpr::getNeg(C); + // Build a saddo call that is equivalent to the discovered + // ssubo call. 
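+ // (For instance, ssub.with.overflow(%x, i32 7) becomes
+ // sadd.with.overflow(%x, i32 -7).)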
+ return replaceInstUsesWith( + *II, Builder.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, + Arg0, NegVal)); + } + + break; + } + + case Intrinsic::uadd_sat: + case Intrinsic::sadd_sat: + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; + LLVM_FALLTHROUGH; + case Intrinsic::usub_sat: + case Intrinsic::ssub_sat: { + SaturatingInst *SI = cast<SaturatingInst>(II); + Type *Ty = SI->getType(); + Value *Arg0 = SI->getLHS(); + Value *Arg1 = SI->getRHS(); + + // Make use of known overflow information. + OverflowResult OR = computeOverflow(SI->getBinaryOp(), SI->isSigned(), + Arg0, Arg1, SI); + switch (OR) { + case OverflowResult::MayOverflow: + break; + case OverflowResult::NeverOverflows: + if (SI->isSigned()) + return BinaryOperator::CreateNSW(SI->getBinaryOp(), Arg0, Arg1); + else + return BinaryOperator::CreateNUW(SI->getBinaryOp(), Arg0, Arg1); + case OverflowResult::AlwaysOverflowsLow: { + unsigned BitWidth = Ty->getScalarSizeInBits(); + APInt Min = APSInt::getMinValue(BitWidth, !SI->isSigned()); + return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Min)); + } + case OverflowResult::AlwaysOverflowsHigh: { + unsigned BitWidth = Ty->getScalarSizeInBits(); + APInt Max = APSInt::getMaxValue(BitWidth, !SI->isSigned()); + return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Max)); + } + } + + // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN + Constant *C; + if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) && + C->isNotMinSignedValue()) { + Value *NegVal = ConstantExpr::getNeg(C); + return replaceInstUsesWith( + *II, Builder.CreateBinaryIntrinsic( + Intrinsic::sadd_sat, Arg0, NegVal)); + } + + // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2)) + // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2)) + // if Val and Val2 have the same sign + if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) { + Value *X; + const APInt *Val, *Val2; + APInt NewVal; + bool IsUnsigned = + IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat; + if (Other->getIntrinsicID() == IID && + match(Arg1, m_APInt(Val)) && + match(Other->getArgOperand(0), m_Value(X)) && + match(Other->getArgOperand(1), m_APInt(Val2))) { + if (IsUnsigned) + NewVal = Val->uadd_sat(*Val2); + else if (Val->isNonNegative() == Val2->isNonNegative()) { + bool Overflow; + NewVal = Val->sadd_ov(*Val2, Overflow); + if (Overflow) { + // Both adds together may add more than SignedMaxValue + // without saturating the final result. + break; + } + } else { + // Cannot fold saturated addition with different signs. 
+ break; + } + + return replaceInstUsesWith( + *II, Builder.CreateBinaryIntrinsic( + IID, X, ConstantInt::get(II->getType(), NewVal))); + } + } + break; + } + + case Intrinsic::minnum: + case Intrinsic::maxnum: + case Intrinsic::minimum: + case Intrinsic::maximum: { + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + Value *X, *Y; + if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) && + (Arg0->hasOneUse() || Arg1->hasOneUse())) { + // If both operands are negated, invert the call and negate the result: + // min(-X, -Y) --> -(max(X, Y)) + // max(-X, -Y) --> -(min(X, Y)) + Intrinsic::ID NewIID; + switch (IID) { + case Intrinsic::maxnum: + NewIID = Intrinsic::minnum; + break; + case Intrinsic::minnum: + NewIID = Intrinsic::maxnum; + break; + case Intrinsic::maximum: + NewIID = Intrinsic::minimum; + break; + case Intrinsic::minimum: + NewIID = Intrinsic::maximum; + break; + default: + llvm_unreachable("unexpected intrinsic ID"); + } + Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II); + Instruction *FNeg = BinaryOperator::CreateFNeg(NewCall); + FNeg->copyIRFlags(II); + return FNeg; + } + + // m(m(X, C2), C1) -> m(X, C) + const APFloat *C1, *C2; + if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) { + if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) && + ((match(M->getArgOperand(0), m_Value(X)) && + match(M->getArgOperand(1), m_APFloat(C2))) || + (match(M->getArgOperand(1), m_Value(X)) && + match(M->getArgOperand(0), m_APFloat(C2))))) { + APFloat Res(0.0); + switch (IID) { + case Intrinsic::maxnum: + Res = maxnum(*C1, *C2); + break; + case Intrinsic::minnum: + Res = minnum(*C1, *C2); + break; + case Intrinsic::maximum: + Res = maximum(*C1, *C2); + break; + case Intrinsic::minimum: + Res = minimum(*C1, *C2); + break; + default: + llvm_unreachable("unexpected intrinsic ID"); + } + Instruction *NewCall = Builder.CreateBinaryIntrinsic( + IID, X, ConstantFP::get(Arg0->getType(), Res)); + NewCall->copyIRFlags(II); + return replaceInstUsesWith(*II, NewCall); + } + } + + break; + } + case Intrinsic::fmuladd: { + // Canonicalize fast fmuladd to the separate fmul + fadd. + if (II->isFast()) { + BuilderTy::FastMathFlagGuard Guard(Builder); + Builder.setFastMathFlags(II->getFastMathFlags()); + Value *Mul = Builder.CreateFMul(II->getArgOperand(0), + II->getArgOperand(1)); + Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2)); + Add->takeName(II); + return replaceInstUsesWith(*II, Add); + } + + // Try to simplify the underlying FMul. + if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1), + II->getFastMathFlags(), + SQ.getWithInstruction(II))) { + auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); + FAdd->copyFastMathFlags(II); + return FAdd; + } + + LLVM_FALLTHROUGH; + } + case Intrinsic::fma: { + if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) + return I; + + // fma fneg(x), fneg(y), z -> fma x, y, z + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + Value *X, *Y; + if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) { + II->setArgOperand(0, X); + II->setArgOperand(1, Y); + return II; + } + + // fma fabs(x), fabs(x), z -> fma x, x, z + if (match(Src0, m_FAbs(m_Value(X))) && + match(Src1, m_FAbs(m_Specific(X)))) { + II->setArgOperand(0, X); + II->setArgOperand(1, X); + return II; + } + + // Try to simplify the underlying FMul. 
We can only apply simplifications + // that do not require rounding. + if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1), + II->getFastMathFlags(), + SQ.getWithInstruction(II))) { + auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); + FAdd->copyFastMathFlags(II); + return FAdd; + } + + break; + } + case Intrinsic::fabs: { + Value *Cond; + Constant *LHS, *RHS; + if (match(II->getArgOperand(0), + m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) { + CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS}); + CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS}); + return SelectInst::Create(Cond, Call0, Call1); + } + + LLVM_FALLTHROUGH; + } + case Intrinsic::ceil: + case Intrinsic::floor: + case Intrinsic::round: + case Intrinsic::nearbyint: + case Intrinsic::rint: + case Intrinsic::trunc: { + Value *ExtSrc; + if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) { + // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x) + Value *NarrowII = Builder.CreateUnaryIntrinsic(IID, ExtSrc, II); + return new FPExtInst(NarrowII, II->getType()); + } + break; + } + case Intrinsic::cos: + case Intrinsic::amdgcn_cos: { + Value *X; + Value *Src = II->getArgOperand(0); + if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) { + // cos(-x) -> cos(x) + // cos(fabs(x)) -> cos(x) + II->setArgOperand(0, X); + return II; + } + break; + } + case Intrinsic::sin: { + Value *X; + if (match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) { + // sin(-x) --> -sin(x) + Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II); + Instruction *FNeg = BinaryOperator::CreateFNeg(NewSin); + FNeg->copyFastMathFlags(II); + return FNeg; + } + break; + } + case Intrinsic::ppc_altivec_lvx: + case Intrinsic::ppc_altivec_lvxl: + // Turn PPC lvx -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, + &DT) >= 16) { + Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), + PointerType::getUnqual(II->getType())); + return new LoadInst(II->getType(), Ptr); + } + break; + case Intrinsic::ppc_vsx_lxvw4x: + case Intrinsic::ppc_vsx_lxvd2x: { + // Turn PPC VSX loads into normal loads. + Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), + PointerType::getUnqual(II->getType())); + return new LoadInst(II->getType(), Ptr, Twine(""), false, Align::None()); + } + case Intrinsic::ppc_altivec_stvx: + case Intrinsic::ppc_altivec_stvxl: + // Turn stvx -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, + &DT) >= 16) { + Type *OpPtrTy = + PointerType::getUnqual(II->getArgOperand(0)->getType()); + Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); + return new StoreInst(II->getArgOperand(0), Ptr); + } + break; + case Intrinsic::ppc_vsx_stxvw4x: + case Intrinsic::ppc_vsx_stxvd2x: { + // Turn PPC VSX stores into normal stores. + Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); + Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); + return new StoreInst(II->getArgOperand(0), Ptr, false, Align::None()); + } + case Intrinsic::ppc_qpx_qvlfs: + // Turn PPC QPX qvlfs -> load if the pointer is known aligned. 
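+ // qvlfs produces a double vector from a float memory operand, so the
+ // aligned case below loads an <N x float> and extends it with fpext to the
+ // intrinsic's return type.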
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, + &DT) >= 16) { + Type *VTy = VectorType::get(Builder.getFloatTy(), + II->getType()->getVectorNumElements()); + Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), + PointerType::getUnqual(VTy)); + Value *Load = Builder.CreateLoad(VTy, Ptr); + return new FPExtInst(Load, II->getType()); + } + break; + case Intrinsic::ppc_qpx_qvlfd: + // Turn PPC QPX qvlfd -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC, + &DT) >= 32) { + Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), + PointerType::getUnqual(II->getType())); + return new LoadInst(II->getType(), Ptr); + } + break; + case Intrinsic::ppc_qpx_qvstfs: + // Turn PPC QPX qvstfs -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, + &DT) >= 16) { + Type *VTy = VectorType::get(Builder.getFloatTy(), + II->getArgOperand(0)->getType()->getVectorNumElements()); + Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy); + Type *OpPtrTy = PointerType::getUnqual(VTy); + Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); + return new StoreInst(TOp, Ptr); + } + break; + case Intrinsic::ppc_qpx_qvstfd: + // Turn PPC QPX qvstfd -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC, + &DT) >= 32) { + Type *OpPtrTy = + PointerType::getUnqual(II->getArgOperand(0)->getType()); + Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); + return new StoreInst(II->getArgOperand(0), Ptr); + } + break; + + case Intrinsic::x86_bmi_bextr_32: + case Intrinsic::x86_bmi_bextr_64: + case Intrinsic::x86_tbm_bextri_u32: + case Intrinsic::x86_tbm_bextri_u64: + // If the RHS is a constant we can try some simplifications. + if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + uint64_t Shift = C->getZExtValue(); + uint64_t Length = (Shift >> 8) & 0xff; + Shift &= 0xff; + unsigned BitWidth = II->getType()->getIntegerBitWidth(); + // If the length is 0 or the shift is out of range, replace with zero. + if (Length == 0 || Shift >= BitWidth) + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); + // If the LHS is also a constant, we can completely constant fold this. + if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { + uint64_t Result = InC->getZExtValue() >> Shift; + if (Length > BitWidth) + Length = BitWidth; + Result &= maskTrailingOnes<uint64_t>(Length); + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); + } + // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we + // are only masking bits that a shift already cleared? + } + break; + + case Intrinsic::x86_bmi_bzhi_32: + case Intrinsic::x86_bmi_bzhi_64: + // If the RHS is a constant we can try some simplifications. + if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + uint64_t Index = C->getZExtValue() & 0xff; + unsigned BitWidth = II->getType()->getIntegerBitWidth(); + if (Index >= BitWidth) + return replaceInstUsesWith(CI, II->getArgOperand(0)); + if (Index == 0) + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); + // If the LHS is also a constant, we can completely constant fold this. 
+ if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { + uint64_t Result = InC->getZExtValue(); + Result &= maskTrailingOnes<uint64_t>(Index); + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); + } + // TODO should we convert this to an AND if the RHS is constant? + } + break; + + case Intrinsic::x86_vcvtph2ps_128: + case Intrinsic::x86_vcvtph2ps_256: { + auto Arg = II->getArgOperand(0); + auto ArgType = cast<VectorType>(Arg->getType()); + auto RetType = cast<VectorType>(II->getType()); + unsigned ArgWidth = ArgType->getNumElements(); + unsigned RetWidth = RetType->getNumElements(); + assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths"); + assert(ArgType->isIntOrIntVectorTy() && + ArgType->getScalarSizeInBits() == 16 && + "CVTPH2PS input type should be 16-bit integer vector"); + assert(RetType->getScalarType()->isFloatTy() && + "CVTPH2PS output type should be 32-bit float vector"); + + // Constant folding: Convert to generic half to single conversion. + if (isa<ConstantAggregateZero>(Arg)) + return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType)); + + if (isa<ConstantDataVector>(Arg)) { + auto VectorHalfAsShorts = Arg; + if (RetWidth < ArgWidth) { + SmallVector<uint32_t, 8> SubVecMask; + for (unsigned i = 0; i != RetWidth; ++i) + SubVecMask.push_back((int)i); + VectorHalfAsShorts = Builder.CreateShuffleVector( + Arg, UndefValue::get(ArgType), SubVecMask); + } + + auto VectorHalfType = + VectorType::get(Type::getHalfTy(II->getContext()), RetWidth); + auto VectorHalfs = + Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType); + auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType); + return replaceInstUsesWith(*II, VectorFloats); + } + + // We only use the lowest lanes of the argument. + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) { + II->setArgOperand(0, V); + return II; + } + break; + } + + case Intrinsic::x86_sse_cvtss2si: + case Intrinsic::x86_sse_cvtss2si64: + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvtsd2si: + case Intrinsic::x86_sse2_cvtsd2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: + case Intrinsic::x86_avx512_vcvtss2si32: + case Intrinsic::x86_avx512_vcvtss2si64: + case Intrinsic::x86_avx512_vcvtss2usi32: + case Intrinsic::x86_avx512_vcvtss2usi64: + case Intrinsic::x86_avx512_vcvtsd2si32: + case Intrinsic::x86_avx512_vcvtsd2si64: + case Intrinsic::x86_avx512_vcvtsd2usi32: + case Intrinsic::x86_avx512_vcvtsd2usi64: + case Intrinsic::x86_avx512_cvttss2si: + case Intrinsic::x86_avx512_cvttss2si64: + case Intrinsic::x86_avx512_cvttss2usi: + case Intrinsic::x86_avx512_cvttss2usi64: + case Intrinsic::x86_avx512_cvttsd2si: + case Intrinsic::x86_avx512_cvttsd2si64: + case Intrinsic::x86_avx512_cvttsd2usi: + case Intrinsic::x86_avx512_cvttsd2usi64: { + // These intrinsics only demand the 0th element of their input vectors. If + // we can simplify the input based on that, do so now. 
+ Value *Arg = II->getArgOperand(0); + unsigned VWidth = Arg->getType()->getVectorNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { + II->setArgOperand(0, V); + return II; + } + break; + } + + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_avx2_pmovmskb: + if (Value *V = simplifyX86movmsk(*II, Builder)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse_comieq_ss: + case Intrinsic::x86_sse_comige_ss: + case Intrinsic::x86_sse_comigt_ss: + case Intrinsic::x86_sse_comile_ss: + case Intrinsic::x86_sse_comilt_ss: + case Intrinsic::x86_sse_comineq_ss: + case Intrinsic::x86_sse_ucomieq_ss: + case Intrinsic::x86_sse_ucomige_ss: + case Intrinsic::x86_sse_ucomigt_ss: + case Intrinsic::x86_sse_ucomile_ss: + case Intrinsic::x86_sse_ucomilt_ss: + case Intrinsic::x86_sse_ucomineq_ss: + case Intrinsic::x86_sse2_comieq_sd: + case Intrinsic::x86_sse2_comige_sd: + case Intrinsic::x86_sse2_comigt_sd: + case Intrinsic::x86_sse2_comile_sd: + case Intrinsic::x86_sse2_comilt_sd: + case Intrinsic::x86_sse2_comineq_sd: + case Intrinsic::x86_sse2_ucomieq_sd: + case Intrinsic::x86_sse2_ucomige_sd: + case Intrinsic::x86_sse2_ucomigt_sd: + case Intrinsic::x86_sse2_ucomile_sd: + case Intrinsic::x86_sse2_ucomilt_sd: + case Intrinsic::x86_sse2_ucomineq_sd: + case Intrinsic::x86_avx512_vcomi_ss: + case Intrinsic::x86_avx512_vcomi_sd: + case Intrinsic::x86_avx512_mask_cmp_ss: + case Intrinsic::x86_avx512_mask_cmp_sd: { + // These intrinsics only demand the 0th element of their input vectors. If + // we can simplify the input based on that, do so now. + bool MadeChange = false; + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + unsigned VWidth = Arg0->getType()->getVectorNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { + II->setArgOperand(0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { + II->setArgOperand(1, V); + MadeChange = true; + } + if (MadeChange) + return II; + break; + } + case Intrinsic::x86_avx512_cmp_pd_128: + case Intrinsic::x86_avx512_cmp_pd_256: + case Intrinsic::x86_avx512_cmp_pd_512: + case Intrinsic::x86_avx512_cmp_ps_128: + case Intrinsic::x86_avx512_cmp_ps_256: + case Intrinsic::x86_avx512_cmp_ps_512: { + // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a) + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + bool Arg0IsZero = match(Arg0, m_PosZeroFP()); + if (Arg0IsZero) + std::swap(Arg0, Arg1); + Value *A, *B; + // This fold requires only the NINF(not +/- inf) since inf minus + // inf is nan. + // NSZ(No Signed Zeros) is not needed because zeros of any sign are + // equal for both compares. + // NNAN is not needed because nans compare the same for both compares. + // The compare intrinsic uses the above assumptions and therefore + // doesn't require additional flags. 
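+    // Illustrative example in the same pseudo-notation as above:
+    //   cmp(sub.ninf(a,b), 0) -> cmp(a,b), keeping the predicate operand.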
+ if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) && + match(Arg1, m_PosZeroFP()) && isa<Instruction>(Arg0) && + cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) { + if (Arg0IsZero) + std::swap(A, B); + II->setArgOperand(0, A); + II->setArgOperand(1, B); + return II; + } + break; + } + + case Intrinsic::x86_avx512_add_ps_512: + case Intrinsic::x86_avx512_div_ps_512: + case Intrinsic::x86_avx512_mul_ps_512: + case Intrinsic::x86_avx512_sub_ps_512: + case Intrinsic::x86_avx512_add_pd_512: + case Intrinsic::x86_avx512_div_pd_512: + case Intrinsic::x86_avx512_mul_pd_512: + case Intrinsic::x86_avx512_sub_pd_512: + // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular + // IR operations. + if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(2))) { + if (R->getValue() == 4) { + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + + Value *V; + switch (IID) { + default: llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_avx512_add_ps_512: + case Intrinsic::x86_avx512_add_pd_512: + V = Builder.CreateFAdd(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_sub_ps_512: + case Intrinsic::x86_avx512_sub_pd_512: + V = Builder.CreateFSub(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_mul_ps_512: + case Intrinsic::x86_avx512_mul_pd_512: + V = Builder.CreateFMul(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_div_ps_512: + case Intrinsic::x86_avx512_div_pd_512: + V = Builder.CreateFDiv(Arg0, Arg1); + break; + } + + return replaceInstUsesWith(*II, V); + } + } + break; + + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular + // IR operations. + if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) { + if (R->getValue() == 4) { + // Extract the element as scalars. + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0); + Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0); + + Value *V; + switch (IID) { + default: llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + V = Builder.CreateFAdd(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + V = Builder.CreateFSub(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + V = Builder.CreateFMul(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + V = Builder.CreateFDiv(LHS, RHS); + break; + } + + // Handle the masking aspect of the intrinsic. + Value *Mask = II->getArgOperand(3); + auto *C = dyn_cast<ConstantInt>(Mask); + // We don't need a select if we know the mask bit is a 1. + if (!C || !C->getValue()[0]) { + // Cast the mask to an i1 vector and then extract the lowest element. 
+ auto *MaskTy = VectorType::get(Builder.getInt1Ty(), + cast<IntegerType>(Mask->getType())->getBitWidth()); + Mask = Builder.CreateBitCast(Mask, MaskTy); + Mask = Builder.CreateExtractElement(Mask, (uint64_t)0); + // Extract the lowest element from the passthru operand. + Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2), + (uint64_t)0); + V = Builder.CreateSelect(Mask, V, Passthru); + } + + // Insert the result back into the original argument 0. + V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0); + + return replaceInstUsesWith(*II, V); + } + } + break; + + // Constant fold ashr( <A x Bi>, Ci ). + // Constant fold lshr( <A x Bi>, Ci ). + // Constant fold shl( <A x Bi>, Ci ). + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psrai_w_512: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrli_w_512: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_pslli_w_512: + if (Value *V = simplifyX86immShift(*II, Builder)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: { + if (Value *V = simplifyX86immShift(*II, Builder)) + return replaceInstUsesWith(*II, V); + + // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector + // operand to compute the shift amount. 
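+    // For instance, with a <4 x i32> count operand only elements 0 and 1 (the
+    // low 64 bits) matter, so the upper elements are treated as undemanded.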
+ Value *Arg1 = II->getArgOperand(1); + assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && + "Unexpected packed shift size"); + unsigned VWidth = Arg1->getType()->getVectorNumElements(); + + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { + II->setArgOperand(1, V); + return II; + } + break; + } + + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx512_psllv_w_128: + case Intrinsic::x86_avx512_psllv_w_256: + case Intrinsic::x86_avx512_psllv_w_512: + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_512: + case Intrinsic::x86_avx512_psrav_w_128: + case Intrinsic::x86_avx512_psrav_w_256: + case Intrinsic::x86_avx512_psrav_w_512: + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx512_psrlv_w_128: + case Intrinsic::x86_avx512_psrlv_w_256: + case Intrinsic::x86_avx512_psrlv_w_512: + if (Value *V = simplifyX86varShift(*II, Builder)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packsswb_512: + if (Value *V = simplifyX86pack(*II, Builder, true)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx512_packusdw_512: + case Intrinsic::x86_avx512_packuswb_512: + if (Value *V = simplifyX86pack(*II, Builder, false)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_pclmulqdq: + case Intrinsic::x86_pclmulqdq_256: + case Intrinsic::x86_pclmulqdq_512: { + if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) { + unsigned Imm = C->getZExtValue(); + + bool MadeChange = false; + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + unsigned VWidth = Arg0->getType()->getVectorNumElements(); + + APInt UndefElts1(VWidth, 0); + APInt DemandedElts1 = APInt::getSplat(VWidth, + APInt(2, (Imm & 0x01) ? 2 : 1)); + if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1, + UndefElts1)) { + II->setArgOperand(0, V); + MadeChange = true; + } + + APInt UndefElts2(VWidth, 0); + APInt DemandedElts2 = APInt::getSplat(VWidth, + APInt(2, (Imm & 0x10) ? 2 : 1)); + if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2, + UndefElts2)) { + II->setArgOperand(1, V); + MadeChange = true; + } + + // If either input elements are undef, the result is zero. 
+ if (DemandedElts1.isSubsetOf(UndefElts1) || + DemandedElts2.isSubsetOf(UndefElts2)) + return replaceInstUsesWith(*II, + ConstantAggregateZero::get(II->getType())); + + if (MadeChange) + return II; + } + break; + } + + case Intrinsic::x86_sse41_insertps: + if (Value *V = simplifyX86insertps(*II, Builder)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse4a_extrq: { + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth0 = Op0->getType()->getVectorNumElements(); + unsigned VWidth1 = Op1->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 16 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CILength = + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CIIndex = + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or EXTRQI call. + if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) + return replaceInstUsesWith(*II, V); + + // EXTRQ only uses the lowest 64-bits of the first 128-bit vector + // operands and the lowest 16-bits of the second. + bool MadeChange = false; + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + II->setArgOperand(0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { + II->setArgOperand(1, V); + MadeChange = true; + } + if (MadeChange) + return II; + break; + } + + case Intrinsic::x86_sse4a_extrqi: { + // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining + // bits of the lower 64-bits. The upper 64-bits are undefined. + Value *Op0 = II->getArgOperand(0); + unsigned VWidth = Op0->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); + + // Attempt to simplify to a constant or shuffle vector. + if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) + return replaceInstUsesWith(*II, V); + + // EXTRQI only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + II->setArgOperand(0, V); + return II; + } + break; + } + + case Intrinsic::x86_sse4a_insertq: { + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth = Op0->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + Op1->getType()->getVectorNumElements() == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI11 = + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 
+ if (CI11) { + const APInt &V11 = CI11->getValue(); + APInt Len = V11.zextOrTrunc(6); + APInt Idx = V11.lshr(8).zextOrTrunc(6); + if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) + return replaceInstUsesWith(*II, V); + } + + // INSERTQ only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + II->setArgOperand(0, V); + return II; + } + break; + } + + case Intrinsic::x86_sse4a_insertqi: { + // INSERTQI: Extract lowest Length bits from lower half of second source and + // insert over first source starting at Index bit. The upper 64-bits are + // undefined. + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth0 = Op0->getType()->getVectorNumElements(); + unsigned VWidth1 = Op1->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 2 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); + + // Attempt to simplify to a constant or shuffle vector. + if (CILength && CIIndex) { + APInt Len = CILength->getValue().zextOrTrunc(6); + APInt Idx = CIIndex->getValue().zextOrTrunc(6); + if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) + return replaceInstUsesWith(*II, V); + } + + // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector + // operands. + bool MadeChange = false; + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + II->setArgOperand(0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { + II->setArgOperand(1, V); + MadeChange = true; + } + if (MadeChange) + return II; + break; + } + + case Intrinsic::x86_sse41_pblendvb: + case Intrinsic::x86_sse41_blendvps: + case Intrinsic::x86_sse41_blendvpd: + case Intrinsic::x86_avx_blendv_ps_256: + case Intrinsic::x86_avx_blendv_pd_256: + case Intrinsic::x86_avx2_pblendvb: { + // fold (blend A, A, Mask) -> A + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + Value *Mask = II->getArgOperand(2); + if (Op0 == Op1) + return replaceInstUsesWith(CI, Op0); + + // Zero Mask - select 1st argument. + if (isa<ConstantAggregateZero>(Mask)) + return replaceInstUsesWith(CI, Op0); + + // Constant Mask - select 1st/2nd argument lane based on top bit of mask. + if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { + Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); + return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); + } + + // Convert to a vector select if we can bypass casts and find a boolean + // vector condition value. + Value *BoolVec; + Mask = peekThroughBitcast(Mask); + if (match(Mask, m_SExt(m_Value(BoolVec))) && + BoolVec->getType()->isVectorTy() && + BoolVec->getType()->getScalarSizeInBits() == 1) { + assert(Mask->getType()->getPrimitiveSizeInBits() == + II->getType()->getPrimitiveSizeInBits() && + "Not expecting mask and operands with different sizes"); + + unsigned NumMaskElts = Mask->getType()->getVectorNumElements(); + unsigned NumOperandElts = II->getType()->getVectorNumElements(); + if (NumMaskElts == NumOperandElts) + return SelectInst::Create(BoolVec, Op1, Op0); + + // If the mask has less elements than the operands, each mask bit maps to + // multiple elements of the operands. 
Bitcast back and forth. + if (NumMaskElts < NumOperandElts) { + Value *CastOp0 = Builder.CreateBitCast(Op0, Mask->getType()); + Value *CastOp1 = Builder.CreateBitCast(Op1, Mask->getType()); + Value *Sel = Builder.CreateSelect(BoolVec, CastOp1, CastOp0); + return new BitCastInst(Sel, II->getType()); + } + } + + break; + } + + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + if (Value *V = simplifyX86pshufb(*II, Builder)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_avx_vpermilvar_ps: + case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx512_vpermilvar_ps_512: + case Intrinsic::x86_avx_vpermilvar_pd: + case Intrinsic::x86_avx_vpermilvar_pd_256: + case Intrinsic::x86_avx512_vpermilvar_pd_512: + if (Value *V = simplifyX86vpermilvar(*II, Builder)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + case Intrinsic::x86_avx512_permvar_df_256: + case Intrinsic::x86_avx512_permvar_df_512: + case Intrinsic::x86_avx512_permvar_di_256: + case Intrinsic::x86_avx512_permvar_di_512: + case Intrinsic::x86_avx512_permvar_hi_128: + case Intrinsic::x86_avx512_permvar_hi_256: + case Intrinsic::x86_avx512_permvar_hi_512: + case Intrinsic::x86_avx512_permvar_qi_128: + case Intrinsic::x86_avx512_permvar_qi_256: + case Intrinsic::x86_avx512_permvar_qi_512: + case Intrinsic::x86_avx512_permvar_sf_512: + case Intrinsic::x86_avx512_permvar_si_512: + if (Value *V = simplifyX86vpermv(*II, Builder)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_avx_maskload_ps: + case Intrinsic::x86_avx_maskload_pd: + case Intrinsic::x86_avx_maskload_ps_256: + case Intrinsic::x86_avx_maskload_pd_256: + case Intrinsic::x86_avx2_maskload_d: + case Intrinsic::x86_avx2_maskload_q: + case Intrinsic::x86_avx2_maskload_d_256: + case Intrinsic::x86_avx2_maskload_q_256: + if (Instruction *I = simplifyX86MaskedLoad(*II, *this)) + return I; + break; + + case Intrinsic::x86_sse2_maskmov_dqu: + case Intrinsic::x86_avx_maskstore_ps: + case Intrinsic::x86_avx_maskstore_pd: + case Intrinsic::x86_avx_maskstore_ps_256: + case Intrinsic::x86_avx_maskstore_pd_256: + case Intrinsic::x86_avx2_maskstore_d: + case Intrinsic::x86_avx2_maskstore_q: + case Intrinsic::x86_avx2_maskstore_d_256: + case Intrinsic::x86_avx2_maskstore_q_256: + if (simplifyX86MaskedStore(*II, *this)) + return nullptr; + break; + + case Intrinsic::x86_addcarry_32: + case Intrinsic::x86_addcarry_64: + if (Value *V = simplifyX86addcarry(*II, Builder)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::ppc_altivec_vperm: + // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. + // Note that ppc_altivec_vperm has a big-endian bias, so when creating + // a vectorshuffle for little endian, we must undo the transformation + // performed on vec_perm in altivec.h. That is, we must complement + // the permutation mask with respect to 31 and reverse the order of + // V1 and V2. + if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) { + assert(Mask->getType()->getVectorNumElements() == 16 && + "Bad type for intrinsic!"); + + // Check that all of the elements are integer constants or undefs. 
+ bool AllEltsOk = true; + for (unsigned i = 0; i != 16; ++i) { + Constant *Elt = Mask->getAggregateElement(i); + if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) { + AllEltsOk = false; + break; + } + } + + if (AllEltsOk) { + // Cast the input vectors to byte vectors. + Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0), + Mask->getType()); + Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1), + Mask->getType()); + Value *Result = UndefValue::get(Op0->getType()); + + // Only extract each element once. + Value *ExtractedElts[32]; + memset(ExtractedElts, 0, sizeof(ExtractedElts)); + + for (unsigned i = 0; i != 16; ++i) { + if (isa<UndefValue>(Mask->getAggregateElement(i))) + continue; + unsigned Idx = + cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue(); + Idx &= 31; // Match the hardware behavior. + if (DL.isLittleEndian()) + Idx = 31 - Idx; + + if (!ExtractedElts[Idx]) { + Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; + Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; + ExtractedElts[Idx] = + Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse, + Builder.getInt32(Idx&15)); + } + + // Insert this value into the result vector. + Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx], + Builder.getInt32(i)); + } + return CastInst::Create(Instruction::BitCast, Result, CI.getType()); + } + } + break; + + case Intrinsic::arm_neon_vld1: { + unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), + DL, II, &AC, &DT); + if (Value *V = simplifyNeonVld1(*II, MemAlign, Builder)) + return replaceInstUsesWith(*II, V); + break; + } + + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + unsigned MemAlign = + getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); + unsigned AlignArg = II->getNumArgOperands() - 1; + ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg)); + if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) { + II->setArgOperand(AlignArg, + ConstantInt::get(Type::getInt32Ty(II->getContext()), + MemAlign, false)); + return II; + } + break; + } + + case Intrinsic::arm_neon_vtbl1: + case Intrinsic::aarch64_neon_tbl1: + if (Value *V = simplifyNeonTbl1(*II, Builder)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::arm_neon_vmulls: + case Intrinsic::arm_neon_vmullu: + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: { + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + + // Handle mul by zero first: + if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) { + return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType())); + } + + // Check for constant LHS & RHS - in this case we just simplify. 
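+    // e.g. for the unsigned form, <2 x i32> operands <1, 2> and <3, 4> are
+    // zero-extended to the <2 x i64> result type and fold to <3, 8>
+    // (illustrative constants).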
+ bool Zext = (IID == Intrinsic::arm_neon_vmullu || + IID == Intrinsic::aarch64_neon_umull); + VectorType *NewVT = cast<VectorType>(II->getType()); + if (Constant *CV0 = dyn_cast<Constant>(Arg0)) { + if (Constant *CV1 = dyn_cast<Constant>(Arg1)) { + CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext); + CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext); + + return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1)); + } + + // Couldn't simplify - canonicalize constant to the RHS. + std::swap(Arg0, Arg1); + } + + // Handle mul by one: + if (Constant *CV1 = dyn_cast<Constant>(Arg1)) + if (ConstantInt *Splat = + dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) + if (Splat->isOne()) + return CastInst::CreateIntegerCast(Arg0, II->getType(), + /*isSigned=*/!Zext); + + break; + } + case Intrinsic::arm_neon_aesd: + case Intrinsic::arm_neon_aese: + case Intrinsic::aarch64_crypto_aesd: + case Intrinsic::aarch64_crypto_aese: { + Value *DataArg = II->getArgOperand(0); + Value *KeyArg = II->getArgOperand(1); + + // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR + Value *Data, *Key; + if (match(KeyArg, m_ZeroInt()) && + match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) { + II->setArgOperand(0, Data); + II->setArgOperand(1, Key); + return II; + } + break; + } + case Intrinsic::amdgcn_rcp: { + Value *Src = II->getArgOperand(0); + + // TODO: Move to ConstantFolding/InstSimplify? + if (isa<UndefValue>(Src)) + return replaceInstUsesWith(CI, Src); + + if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { + const APFloat &ArgVal = C->getValueAPF(); + APFloat Val(ArgVal.getSemantics(), 1.0); + APFloat::opStatus Status = Val.divide(ArgVal, + APFloat::rmNearestTiesToEven); + // Only do this if it was exact and therefore not dependent on the + // rounding mode. + if (Status == APFloat::opOK) + return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); + } + + break; + } + case Intrinsic::amdgcn_rsq: { + Value *Src = II->getArgOperand(0); + + // TODO: Move to ConstantFolding/InstSimplify? + if (isa<UndefValue>(Src)) + return replaceInstUsesWith(CI, Src); + break; + } + case Intrinsic::amdgcn_frexp_mant: + case Intrinsic::amdgcn_frexp_exp: { + Value *Src = II->getArgOperand(0); + if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { + int Exp; + APFloat Significand = frexp(C->getValueAPF(), Exp, + APFloat::rmNearestTiesToEven); + + if (IID == Intrinsic::amdgcn_frexp_mant) { + return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), + Significand)); + } + + // Match instruction special case behavior. 
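+      // e.g. frexp_exp(inf) and frexp_exp(nan) return 0 on the hardware, so
+      // the folded exponent is forced to 0 here instead of APFloat's
+      // sentinel values.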
+ if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) + Exp = 0; + + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp)); + } + + if (isa<UndefValue>(Src)) + return replaceInstUsesWith(CI, UndefValue::get(II->getType())); + + break; + } + case Intrinsic::amdgcn_class: { + enum { + S_NAN = 1 << 0, // Signaling NaN + Q_NAN = 1 << 1, // Quiet NaN + N_INFINITY = 1 << 2, // Negative infinity + N_NORMAL = 1 << 3, // Negative normal + N_SUBNORMAL = 1 << 4, // Negative subnormal + N_ZERO = 1 << 5, // Negative zero + P_ZERO = 1 << 6, // Positive zero + P_SUBNORMAL = 1 << 7, // Positive subnormal + P_NORMAL = 1 << 8, // Positive normal + P_INFINITY = 1 << 9 // Positive infinity + }; + + const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | + N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY; + + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1); + if (!CMask) { + if (isa<UndefValue>(Src0)) + return replaceInstUsesWith(*II, UndefValue::get(II->getType())); + + if (isa<UndefValue>(Src1)) + return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); + break; + } + + uint32_t Mask = CMask->getZExtValue(); + + // If all tests are made, it doesn't matter what the value is. + if ((Mask & FullMask) == FullMask) + return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true)); + + if ((Mask & FullMask) == 0) + return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); + + if (Mask == (S_NAN | Q_NAN)) { + // Equivalent of isnan. Replace with standard fcmp. + Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0); + FCmp->takeName(II); + return replaceInstUsesWith(*II, FCmp); + } + + if (Mask == (N_ZERO | P_ZERO)) { + // Equivalent of == 0. 
+ Value *FCmp = Builder.CreateFCmpOEQ( + Src0, ConstantFP::get(Src0->getType(), 0.0)); + + FCmp->takeName(II); + return replaceInstUsesWith(*II, FCmp); + } + + // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other + if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI)) { + II->setArgOperand(1, ConstantInt::get(Src1->getType(), + Mask & ~(S_NAN | Q_NAN))); + return II; + } + + const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0); + if (!CVal) { + if (isa<UndefValue>(Src0)) + return replaceInstUsesWith(*II, UndefValue::get(II->getType())); + + // Clamp mask to used bits + if ((Mask & FullMask) != Mask) { + CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(), + { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) } + ); + + NewCall->takeName(II); + return replaceInstUsesWith(*II, NewCall); + } + + break; + } + + const APFloat &Val = CVal->getValueAPF(); + + bool Result = + ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || + ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || + ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || + ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || + ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || + ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || + ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || + ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || + ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || + ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); + + return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); + } + case Intrinsic::amdgcn_cvt_pkrtz: { + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { + if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { + const fltSemantics &HalfSem + = II->getType()->getScalarType()->getFltSemantics(); + bool LosesInfo; + APFloat Val0 = C0->getValueAPF(); + APFloat Val1 = C1->getValueAPF(); + Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); + Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); + + Constant *Folded = ConstantVector::get({ + ConstantFP::get(II->getContext(), Val0), + ConstantFP::get(II->getContext(), Val1) }); + return replaceInstUsesWith(*II, Folded); + } + } + + if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) + return replaceInstUsesWith(*II, UndefValue::get(II->getType())); + + break; + } + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + + if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) + return replaceInstUsesWith(*II, UndefValue::get(II->getType())); + + break; + } + case Intrinsic::amdgcn_ubfe: + case Intrinsic::amdgcn_sbfe: { + // Decompose simple cases into standard shifts. + Value *Src = II->getArgOperand(0); + if (isa<UndefValue>(Src)) + return replaceInstUsesWith(*II, Src); + + unsigned Width; + Type *Ty = II->getType(); + unsigned IntSize = Ty->getIntegerBitWidth(); + + ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2)); + if (CWidth) { + Width = CWidth->getZExtValue(); + if ((Width & (IntSize - 1)) == 0) + return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty)); + + if (Width >= IntSize) { + // Hardware ignores high bits, so remove those. 
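+        // e.g. for a 32-bit bfe, a width operand of 37 behaves like 5
+        // (37 & 31), so the operand is rewritten to the masked value.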
+        II->setArgOperand(2, ConstantInt::get(CWidth->getType(),
+                                              Width & (IntSize - 1)));
+        return II;
+      }
+    }
+
+    unsigned Offset;
+    ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
+    if (COffset) {
+      Offset = COffset->getZExtValue();
+      if (Offset >= IntSize) {
+        II->setArgOperand(1, ConstantInt::get(COffset->getType(),
+                                              Offset & (IntSize - 1)));
+        return II;
+      }
+    }
+
+    bool Signed = IID == Intrinsic::amdgcn_sbfe;
+
+    if (!CWidth || !COffset)
+      break;
+
+    // The case of Width == 0 is handled above, which makes this transformation
+    // safe. If Width == 0, then the ashr and lshr instructions become poison
+    // values since the shift amount would be equal to the bit size.
+    assert(Width != 0);
+
+    // TODO: This allows folding to undef when the hardware has specific
+    // behavior?
+    if (Offset + Width < IntSize) {
+      Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width);
+      Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width)
+                                 : Builder.CreateLShr(Shl, IntSize - Width);
+      RightShift->takeName(II);
+      return replaceInstUsesWith(*II, RightShift);
+    }
+
+    Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset)
+                               : Builder.CreateLShr(Src, Offset);
+
+    RightShift->takeName(II);
+    return replaceInstUsesWith(*II, RightShift);
+  }
+  case Intrinsic::amdgcn_exp:
+  case Intrinsic::amdgcn_exp_compr: {
+    ConstantInt *En = cast<ConstantInt>(II->getArgOperand(1));
+    unsigned EnBits = En->getZExtValue();
+    if (EnBits == 0xf)
+      break; // All inputs enabled.
+
+    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
+    bool Changed = false;
+    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
+      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
+          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
+        Value *Src = II->getArgOperand(I + 2);
+        if (!isa<UndefValue>(Src)) {
+          II->setArgOperand(I + 2, UndefValue::get(Src->getType()));
+          Changed = true;
+        }
+      }
+    }
+
+    if (Changed)
+      return II;
+
+    break;
+  }
+  case Intrinsic::amdgcn_fmed3: {
+    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
+    // for the shader.
+
+    Value *Src0 = II->getArgOperand(0);
+    Value *Src1 = II->getArgOperand(1);
+    Value *Src2 = II->getArgOperand(2);
+
+    // Checking for NaN before canonicalization provides better fidelity when
+    // mapping other operations onto fmed3 since the order of operands is
+    // unchanged.
+    CallInst *NewCall = nullptr;
+    if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) {
+      NewCall = Builder.CreateMinNum(Src1, Src2);
+    } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) {
+      NewCall = Builder.CreateMinNum(Src0, Src2);
+    } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
+      NewCall = Builder.CreateMaxNum(Src0, Src1);
+    }
+
+    if (NewCall) {
+      NewCall->copyFastMathFlags(II);
+      NewCall->takeName(II);
+      return replaceInstUsesWith(*II, NewCall);
+    }
+
+    bool Swap = false;
+    // Canonicalize constants to RHS operands.
+ // + // fmed3(c0, x, c1) -> fmed3(x, c0, c1) + if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { + std::swap(Src0, Src1); + Swap = true; + } + + if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { + std::swap(Src1, Src2); + Swap = true; + } + + if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { + std::swap(Src0, Src1); + Swap = true; + } + + if (Swap) { + II->setArgOperand(0, Src0); + II->setArgOperand(1, Src1); + II->setArgOperand(2, Src2); + return II; + } + + if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { + if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { + if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) { + APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(), + C2->getValueAPF()); + return replaceInstUsesWith(*II, + ConstantFP::get(Builder.getContext(), Result)); + } + } + } + + break; + } + case Intrinsic::amdgcn_icmp: + case Intrinsic::amdgcn_fcmp: { + const ConstantInt *CC = cast<ConstantInt>(II->getArgOperand(2)); + // Guard against invalid arguments. + int64_t CCVal = CC->getZExtValue(); + bool IsInteger = IID == Intrinsic::amdgcn_icmp; + if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || + CCVal > CmpInst::LAST_ICMP_PREDICATE)) || + (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || + CCVal > CmpInst::LAST_FCMP_PREDICATE))) + break; + + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + + if (auto *CSrc0 = dyn_cast<Constant>(Src0)) { + if (auto *CSrc1 = dyn_cast<Constant>(Src1)) { + Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); + if (CCmp->isNullValue()) { + return replaceInstUsesWith( + *II, ConstantExpr::getSExt(CCmp, II->getType())); + } + + // The result of V_ICMP/V_FCMP assembly instructions (which this + // intrinsic exposes) is one bit per thread, masked with the EXEC + // register (which contains the bitmask of live threads). So a + // comparison that always returns true is the same as a read of the + // EXEC register. + Function *NewF = Intrinsic::getDeclaration( + II->getModule(), Intrinsic::read_register, II->getType()); + Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")}; + MDNode *MD = MDNode::get(II->getContext(), MDArgs); + Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; + CallInst *NewCall = Builder.CreateCall(NewF, Args); + NewCall->addAttribute(AttributeList::FunctionIndex, + Attribute::Convergent); + NewCall->takeName(II); + return replaceInstUsesWith(*II, NewCall); + } + + // Canonicalize constants to RHS. 
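+      // e.g. amdgcn.icmp(7, x, slt) becomes amdgcn.icmp(x, 7, sgt): the
+      // operands are swapped and the predicate is replaced by its swapped
+      // form (constants are illustrative).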
+ CmpInst::Predicate SwapPred + = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal)); + II->setArgOperand(0, Src1); + II->setArgOperand(1, Src0); + II->setArgOperand(2, ConstantInt::get(CC->getType(), + static_cast<int>(SwapPred))); + return II; + } + + if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) + break; + + // Canonicalize compare eq with true value to compare != 0 + // llvm.amdgcn.icmp(zext (i1 x), 1, eq) + // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) + // llvm.amdgcn.icmp(sext (i1 x), -1, eq) + // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) + Value *ExtSrc; + if (CCVal == CmpInst::ICMP_EQ && + ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) || + (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) && + ExtSrc->getType()->isIntegerTy(1)) { + II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType())); + II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); + return II; + } + + CmpInst::Predicate SrcPred; + Value *SrcLHS; + Value *SrcRHS; + + // Fold compare eq/ne with 0 from a compare result as the predicate to the + // intrinsic. The typical use is a wave vote function in the library, which + // will be fed from a user code condition compared with 0. Fold in the + // redundant compare. + + // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) + // -> llvm.amdgcn.[if]cmp(a, b, pred) + // + // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) + // -> llvm.amdgcn.[if]cmp(a, b, inv pred) + if (match(Src1, m_Zero()) && + match(Src0, + m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) { + if (CCVal == CmpInst::ICMP_EQ) + SrcPred = CmpInst::getInversePredicate(SrcPred); + + Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ? + Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp; + + Type *Ty = SrcLHS->getType(); + if (auto *CmpType = dyn_cast<IntegerType>(Ty)) { + // Promote to next legal integer type. + unsigned Width = CmpType->getBitWidth(); + unsigned NewWidth = Width; + + // Don't do anything for i1 comparisons. + if (Width == 1) + break; + + if (Width <= 16) + NewWidth = 16; + else if (Width <= 32) + NewWidth = 32; + else if (Width <= 64) + NewWidth = 64; + else if (Width > 64) + break; // Can't handle this. + + if (Width != NewWidth) { + IntegerType *CmpTy = Builder.getIntNTy(NewWidth); + if (CmpInst::isSigned(SrcPred)) { + SrcLHS = Builder.CreateSExt(SrcLHS, CmpTy); + SrcRHS = Builder.CreateSExt(SrcRHS, CmpTy); + } else { + SrcLHS = Builder.CreateZExt(SrcLHS, CmpTy); + SrcRHS = Builder.CreateZExt(SrcRHS, CmpTy); + } + } + } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) + break; + + Function *NewF = + Intrinsic::getDeclaration(II->getModule(), NewIID, + { II->getType(), + SrcLHS->getType() }); + Value *Args[] = { SrcLHS, SrcRHS, + ConstantInt::get(CC->getType(), SrcPred) }; + CallInst *NewCall = Builder.CreateCall(NewF, Args); + NewCall->takeName(II); + return replaceInstUsesWith(*II, NewCall); + } + + break; + } + case Intrinsic::amdgcn_wqm_vote: { + // wqm_vote is identity when the argument is constant. 
+ if (!isa<Constant>(II->getArgOperand(0))) + break; + + return replaceInstUsesWith(*II, II->getArgOperand(0)); + } + case Intrinsic::amdgcn_kill: { + const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0)); + if (!C || !C->getZExtValue()) + break; + + // amdgcn.kill(i1 1) is a no-op + return eraseInstFromFunction(CI); + } + case Intrinsic::amdgcn_update_dpp: { + Value *Old = II->getArgOperand(0); + + auto BC = cast<ConstantInt>(II->getArgOperand(5)); + auto RM = cast<ConstantInt>(II->getArgOperand(3)); + auto BM = cast<ConstantInt>(II->getArgOperand(4)); + if (BC->isZeroValue() || + RM->getZExtValue() != 0xF || + BM->getZExtValue() != 0xF || + isa<UndefValue>(Old)) + break; + + // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value. + II->setOperand(0, UndefValue::get(Old->getType())); + return II; + } + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: { + // A constant value is trivially uniform. + if (Constant *C = dyn_cast<Constant>(II->getArgOperand(0))) + return replaceInstUsesWith(*II, C); + + // The rest of these may not be safe if the exec may not be the same between + // the def and use. + Value *Src = II->getArgOperand(0); + Instruction *SrcInst = dyn_cast<Instruction>(Src); + if (SrcInst && SrcInst->getParent() != II->getParent()) + break; + + // readfirstlane (readfirstlane x) -> readfirstlane x + // readlane (readfirstlane x), y -> readfirstlane x + if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) + return replaceInstUsesWith(*II, Src); + + if (IID == Intrinsic::amdgcn_readfirstlane) { + // readfirstlane (readlane x, y) -> readlane x, y + if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>())) + return replaceInstUsesWith(*II, Src); + } else { + // readlane (readlane x, y), y -> readlane x, y + if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>( + m_Value(), m_Specific(II->getArgOperand(1))))) + return replaceInstUsesWith(*II, Src); + } + + break; + } + case Intrinsic::stackrestore: { + // If the save is right next to the restore, remove the restore. This can + // happen when variable allocas are DCE'd. + if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { + if (SS->getIntrinsicID() == Intrinsic::stacksave) { + // Skip over debug info. + if (SS->getNextNonDebugInstruction() == II) { + return eraseInstFromFunction(CI); + } + } + } + + // Scan down this block to see if there is another stack restore in the + // same block without an intervening call/alloca. + BasicBlock::iterator BI(II); + Instruction *TI = II->getParent()->getTerminator(); + bool CannotRemove = false; + for (++BI; &*BI != TI; ++BI) { + if (isa<AllocaInst>(BI)) { + CannotRemove = true; + break; + } + if (CallInst *BCI = dyn_cast<CallInst>(BI)) { + if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) { + // If there is a stackrestore below this one, remove this one. + if (II2->getIntrinsicID() == Intrinsic::stackrestore) + return eraseInstFromFunction(CI); + + // Bail if we cross over an intrinsic with side effects, such as + // llvm.stacksave, llvm.read_register, or llvm.setjmp. + if (II2->mayHaveSideEffects()) { + CannotRemove = true; + break; + } + } else { + // If we found a non-intrinsic call, we can't remove the stack + // restore. + CannotRemove = true; + break; + } + } + } + + // If the stack restore is in a return, resume, or unwind block and if there + // are no allocas or calls between the restore and the return, nuke the + // restore. 
+ if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI))) + return eraseInstFromFunction(CI); + break; + } + case Intrinsic::lifetime_start: + // Asan needs to poison memory to detect invalid access which is possible + // even for empty lifetime range. + if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || + II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) || + II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress)) + break; + + if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start, + Intrinsic::lifetime_end, *this)) + return nullptr; + break; + case Intrinsic::assume: { + Value *IIOperand = II->getArgOperand(0); + // Remove an assume if it is followed by an identical assume. + // TODO: Do we need this? Unless there are conflicting assumptions, the + // computeKnownBits(IIOperand) below here eliminates redundant assumes. + Instruction *Next = II->getNextNonDebugInstruction(); + if (match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand)))) + return eraseInstFromFunction(CI); + + // Canonicalize assume(a && b) -> assume(a); assume(b); + // Note: New assumption intrinsics created here are registered by + // the InstCombineIRInserter object. + FunctionType *AssumeIntrinsicTy = II->getFunctionType(); + Value *AssumeIntrinsic = II->getCalledValue(); + Value *A, *B; + if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) { + Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, II->getName()); + Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName()); + return eraseInstFromFunction(*II); + } + // assume(!(a || b)) -> assume(!a); assume(!b); + if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) { + Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, + Builder.CreateNot(A), II->getName()); + Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, + Builder.CreateNot(B), II->getName()); + return eraseInstFromFunction(*II); + } + + // assume( (load addr) != null ) -> add 'nonnull' metadata to load + // (if assume is valid at the load) + CmpInst::Predicate Pred; + Instruction *LHS; + if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) && + Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load && + LHS->getType()->isPointerTy() && + isValidAssumeForContext(II, LHS, &DT)) { + MDNode *MD = MDNode::get(II->getContext(), None); + LHS->setMetadata(LLVMContext::MD_nonnull, MD); + return eraseInstFromFunction(*II); + + // TODO: apply nonnull return attributes to calls and invokes + // TODO: apply range metadata for range check patterns? + } + + // If there is a dominating assume with the same condition as this one, + // then this one is redundant, and should be removed. + KnownBits Known(1); + computeKnownBits(IIOperand, Known, 0, II); + if (Known.isAllOnes()) + return eraseInstFromFunction(*II); + + // Update the cache of affected values for this assumption (we might be + // here because we just simplified the condition). + AC.updateAffectedValues(II); + break; + } + case Intrinsic::experimental_gc_relocate: { + auto &GCR = *cast<GCRelocateInst>(II); + + // If we have two copies of the same pointer in the statepoint argument + // list, canonicalize to one. This may let us common gc.relocates. 
+ if (GCR.getBasePtr() == GCR.getDerivedPtr() && + GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) { + auto *OpIntTy = GCR.getOperand(2)->getType(); + II->setOperand(2, ConstantInt::get(OpIntTy, GCR.getBasePtrIndex())); + return II; + } + + // Translate facts known about a pointer before relocating into + // facts about the relocate value, while being careful to + // preserve relocation semantics. + Value *DerivedPtr = GCR.getDerivedPtr(); + + // Remove the relocation if unused, note that this check is required + // to prevent the cases below from looping forever. + if (II->use_empty()) + return eraseInstFromFunction(*II); + + // Undef is undef, even after relocation. + // TODO: provide a hook for this in GCStrategy. This is clearly legal for + // most practical collectors, but there was discussion in the review thread + // about whether it was legal for all possible collectors. + if (isa<UndefValue>(DerivedPtr)) + // Use undef of gc_relocate's type to replace it. + return replaceInstUsesWith(*II, UndefValue::get(II->getType())); + + if (auto *PT = dyn_cast<PointerType>(II->getType())) { + // The relocation of null will be null for most any collector. + // TODO: provide a hook for this in GCStrategy. There might be some + // weird collector this property does not hold for. + if (isa<ConstantPointerNull>(DerivedPtr)) + // Use null-pointer of gc_relocate's type to replace it. + return replaceInstUsesWith(*II, ConstantPointerNull::get(PT)); + + // isKnownNonNull -> nonnull attribute + if (!II->hasRetAttr(Attribute::NonNull) && + isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) { + II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + return II; + } + } + + // TODO: bitcast(relocate(p)) -> relocate(bitcast(p)) + // Canonicalize on the type from the uses to the defs + + // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...) + break; + } + + case Intrinsic::experimental_guard: { + // Is this guard followed by another guard? We scan forward over a small + // fixed window of instructions to handle common cases with conditions + // computed between guards. + Instruction *NextInst = II->getNextNode(); + for (unsigned i = 0; i < GuardWideningWindow; i++) { + // Note: Using context-free form to avoid compile time blow up + if (!isSafeToSpeculativelyExecute(NextInst)) + break; + NextInst = NextInst->getNextNode(); + } + Value *NextCond = nullptr; + if (match(NextInst, + m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) { + Value *CurrCond = II->getArgOperand(0); + + // Remove a guard that it is immediately preceded by an identical guard. + if (CurrCond == NextCond) + return eraseInstFromFunction(*NextInst); + + // Otherwise canonicalize guard(a); guard(b) -> guard(a & b). + Instruction* MoveI = II->getNextNode(); + while (MoveI != NextInst) { + auto *Temp = MoveI; + MoveI = MoveI->getNextNode(); + Temp->moveBefore(II); + } + II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond)); + return eraseInstFromFunction(*NextInst); + } + break; + } + } + return visitCallBase(*II); +} + +// Fence instruction simplification +Instruction *InstCombiner::visitFenceInst(FenceInst &FI) { + // Remove identical consecutive fences. 
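+  // e.g. for two back-to-back 'fence seq_cst' instructions, the first one
+  // (the fence currently being visited) is erased, since the second already
+  // provides the same ordering guarantee.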
+ Instruction *Next = FI.getNextNonDebugInstruction(); + if (auto *NFI = dyn_cast<FenceInst>(Next)) + if (FI.isIdenticalTo(NFI)) + return eraseInstFromFunction(FI); + return nullptr; +} + +// InvokeInst simplification +Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) { + return visitCallBase(II); +} + +// CallBrInst simplification +Instruction *InstCombiner::visitCallBrInst(CallBrInst &CBI) { + return visitCallBase(CBI); +} + +/// If this cast does not affect the value passed through the varargs area, we +/// can eliminate the use of the cast. +static bool isSafeToEliminateVarargsCast(const CallBase &Call, + const DataLayout &DL, + const CastInst *const CI, + const int ix) { + if (!CI->isLosslessCast()) + return false; + + // If this is a GC intrinsic, avoid munging types. We need types for + // statepoint reconstruction in SelectionDAG. + // TODO: This is probably something which should be expanded to all + // intrinsics since the entire point of intrinsics is that + // they are understandable by the optimizer. + if (isStatepoint(&Call) || isGCRelocate(&Call) || isGCResult(&Call)) + return false; + + // The size of ByVal or InAlloca arguments is derived from the type, so we + // can't change to a type with a different size. If the size were + // passed explicitly we could avoid this check. + if (!Call.isByValOrInAllocaArgument(ix)) + return true; + + Type* SrcTy = + cast<PointerType>(CI->getOperand(0)->getType())->getElementType(); + Type *DstTy = Call.isByValArgument(ix) + ? Call.getParamByValType(ix) + : cast<PointerType>(CI->getType())->getElementType(); + if (!SrcTy->isSized() || !DstTy->isSized()) + return false; + if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy)) + return false; + return true; +} + +Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) { + if (!CI->getCalledFunction()) return nullptr; + + auto InstCombineRAUW = [this](Instruction *From, Value *With) { + replaceInstUsesWith(*From, With); + }; + auto InstCombineErase = [this](Instruction *I) { + eraseInstFromFunction(*I); + }; + LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW, + InstCombineErase); + if (Value *With = Simplifier.optimizeCall(CI)) { + ++NumSimplified; + return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); + } + + return nullptr; +} + +static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) { + // Strip off at most one level of pointer casts, looking for an alloca. This + // is good enough in practice and simpler than handling any number of casts. + Value *Underlying = TrampMem->stripPointerCasts(); + if (Underlying != TrampMem && + (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem)) + return nullptr; + if (!isa<AllocaInst>(Underlying)) + return nullptr; + + IntrinsicInst *InitTrampoline = nullptr; + for (User *U : TrampMem->users()) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); + if (!II) + return nullptr; + if (II->getIntrinsicID() == Intrinsic::init_trampoline) { + if (InitTrampoline) + // More than one init_trampoline writes to this value. Give up. + return nullptr; + InitTrampoline = II; + continue; + } + if (II->getIntrinsicID() == Intrinsic::adjust_trampoline) + // Allow any number of calls to adjust.trampoline. + continue; + return nullptr; + } + + // No call to init.trampoline found. + if (!InitTrampoline) + return nullptr; + + // Check that the alloca is being used in the expected way. 
+  if (InitTrampoline->getOperand(0) != TrampMem)
+    return nullptr;
+
+  return InitTrampoline;
+}
+
+static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
+                                               Value *TrampMem) {
+  // Visit all the previous instructions in the basic block, and try to find an
+  // init.trampoline which has a direct path to the adjust.trampoline.
+  for (BasicBlock::iterator I = AdjustTramp->getIterator(),
+                            E = AdjustTramp->getParent()->begin();
+       I != E;) {
+    Instruction *Inst = &*--I;
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
+      if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
+          II->getOperand(0) == TrampMem)
+        return II;
+    if (Inst->mayWriteToMemory())
+      return nullptr;
+  }
+  return nullptr;
+}
+
+// Given a call to llvm.adjust.trampoline, find and return the corresponding
+// call to llvm.init.trampoline if the call to the trampoline can be optimized
+// to a direct call to a function. Otherwise return NULL.
+static IntrinsicInst *findInitTrampoline(Value *Callee) {
+  Callee = Callee->stripPointerCasts();
+  IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
+  if (!AdjustTramp ||
+      AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
+    return nullptr;
+
+  Value *TrampMem = AdjustTramp->getOperand(0);
+
+  if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem))
+    return IT;
+  if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem))
+    return IT;
+  return nullptr;
+}
+
+static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
+  unsigned NumArgs = Call.getNumArgOperands();
+  ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0));
+  ConstantInt *Op1C =
+      (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1));
+  // Bail out if the allocation size is zero.
+  if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue()))
+    return;
+
+  if (isMallocLikeFn(&Call, TLI) && Op0C) {
+    if (isOpNewLikeFn(&Call, TLI))
+      Call.addAttribute(AttributeList::ReturnIndex,
+                        Attribute::getWithDereferenceableBytes(
+                            Call.getContext(), Op0C->getZExtValue()));
+    else
+      Call.addAttribute(AttributeList::ReturnIndex,
+                        Attribute::getWithDereferenceableOrNullBytes(
+                            Call.getContext(), Op0C->getZExtValue()));
+  } else if (isReallocLikeFn(&Call, TLI) && Op1C) {
+    Call.addAttribute(AttributeList::ReturnIndex,
+                      Attribute::getWithDereferenceableOrNullBytes(
+                          Call.getContext(), Op1C->getZExtValue()));
+  } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) {
+    bool Overflow;
+    const APInt &N = Op0C->getValue();
+    APInt Size = N.umul_ov(Op1C->getValue(), Overflow);
+    if (!Overflow)
+      Call.addAttribute(AttributeList::ReturnIndex,
+                        Attribute::getWithDereferenceableOrNullBytes(
+                            Call.getContext(), Size.getZExtValue()));
+  } else if (isStrdupLikeFn(&Call, TLI)) {
+    uint64_t Len = GetStringLength(Call.getOperand(0));
+    if (Len) {
+      // strdup
+      if (NumArgs == 1)
+        Call.addAttribute(AttributeList::ReturnIndex,
+                          Attribute::getWithDereferenceableOrNullBytes(
+                              Call.getContext(), Len));
+      // strndup
+      else if (NumArgs == 2 && Op1C)
+        Call.addAttribute(
+            AttributeList::ReturnIndex,
+            Attribute::getWithDereferenceableOrNullBytes(
+                Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1)));
+    }
+  }
+}
+
+/// Improvements for call, callbr and invoke instructions.
+Instruction *InstCombiner::visitCallBase(CallBase &Call) {
+  if (isAllocationFn(&Call, &TLI))
+    annotateAnyAllocSite(Call, &TLI);
+
+  bool Changed = false;
+
+  // Mark any parameters that are known to be non-null with the nonnull
+  // attribute. This is helpful for inlining calls to functions with null
+  // checks on their arguments.
+  SmallVector<unsigned, 4> ArgNos;
+  unsigned ArgNo = 0;
+
+  for (Value *V : Call.args()) {
+    if (V->getType()->isPointerTy() &&
+        !Call.paramHasAttr(ArgNo, Attribute::NonNull) &&
+        isKnownNonZero(V, DL, 0, &AC, &Call, &DT))
+      ArgNos.push_back(ArgNo);
+    ArgNo++;
+  }
+
+  assert(ArgNo == Call.arg_size() && "sanity check");
+
+  if (!ArgNos.empty()) {
+    AttributeList AS = Call.getAttributes();
+    LLVMContext &Ctx = Call.getContext();
+    AS = AS.addParamAttribute(Ctx, ArgNos,
+                              Attribute::get(Ctx, Attribute::NonNull));
+    Call.setAttributes(AS);
+    Changed = true;
+  }
+
+  // If the callee is a pointer to a function, attempt to move any casts to the
+  // arguments of the call/callbr/invoke.
+  Value *Callee = Call.getCalledValue();
+  if (!isa<Function>(Callee) && transformConstExprCastCall(Call))
+    return nullptr;
+
+  if (Function *CalleeF = dyn_cast<Function>(Callee)) {
+    // Remove the convergent attr on calls when the callee is not convergent.
+    if (Call.isConvergent() && !CalleeF->isConvergent() &&
+        !CalleeF->isIntrinsic()) {
+      LLVM_DEBUG(dbgs() << "Removing convergent attr from instr " << Call
+                        << "\n");
+      Call.setNotConvergent();
+      return &Call;
+    }
+
+    // If the call and callee calling conventions don't match, this call must
+    // be unreachable, as the call is undefined.
+    if (CalleeF->getCallingConv() != Call.getCallingConv() &&
+        // Only do this for calls to a function with a body. A prototype may
+        // not actually end up matching the implementation's calling conv for a
+        // variety of reasons (e.g. it may be written in assembly).
+        !CalleeF->isDeclaration()) {
+      Instruction *OldCall = &Call;
+      CreateNonTerminatorUnreachable(OldCall);
+      // If OldCall does not return void then replaceAllUsesWith undef.
+      // This allows ValueHandlers and custom metadata to adjust itself.
+      if (!OldCall->getType()->isVoidTy())
+        replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
+      if (isa<CallInst>(OldCall))
+        return eraseInstFromFunction(*OldCall);
+
+      // We cannot remove an invoke or a callbr, because it would change the
+      // CFG, just change the callee to a null pointer.
+      cast<CallBase>(OldCall)->setCalledFunction(
+          CalleeF->getFunctionType(),
+          Constant::getNullValue(CalleeF->getType()));
+      return nullptr;
+    }
+  }
+
+  if ((isa<ConstantPointerNull>(Callee) &&
+       !NullPointerIsDefined(Call.getFunction())) ||
+      isa<UndefValue>(Callee)) {
+    // If Call does not return void then replaceAllUsesWith undef.
+    // This allows ValueHandlers and custom metadata to adjust itself.
+    if (!Call.getType()->isVoidTy())
+      replaceInstUsesWith(Call, UndefValue::get(Call.getType()));
+
+    if (Call.isTerminator()) {
+      // Can't remove an invoke or callbr because we cannot change the CFG.
+      return nullptr;
+    }
+
+    // This instruction is not reachable, just remove it.
+    CreateNonTerminatorUnreachable(&Call);
+    return eraseInstFromFunction(Call);
+  }
+
+  if (IntrinsicInst *II = findInitTrampoline(Callee))
+    return transformCallThroughTrampoline(Call, *II);
+
+  PointerType *PTy = cast<PointerType>(Callee->getType());
+  FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+  if (FTy->isVarArg()) {
+    int ix = FTy->getNumParams();
+    // See if we can optimize any arguments passed through the varargs area of
+    // the call.
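+    // For example (names are illustrative), a lossless pointer cast passed
+    // through the vararg area, such as
+    //   call void (i32, ...) @f(i32 1, i8* bitcast (i32* @g to i8*))
+    // can simply pass i32* @g, since the value's size does not change.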
+ for (auto I = Call.arg_begin() + FTy->getNumParams(), E = Call.arg_end(); + I != E; ++I, ++ix) { + CastInst *CI = dyn_cast<CastInst>(*I); + if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) { + *I = CI->getOperand(0); + + // Update the byval type to match the argument type. + if (Call.isByValArgument(ix)) { + Call.removeParamAttr(ix, Attribute::ByVal); + Call.addParamAttr( + ix, Attribute::getWithByValType( + Call.getContext(), + CI->getOperand(0)->getType()->getPointerElementType())); + } + Changed = true; + } + } + } + + if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) { + // Inline asm calls cannot throw - mark them 'nounwind'. + Call.setDoesNotThrow(); + Changed = true; + } + + // Try to optimize the call if possible, we require DataLayout for most of + // this. None of these calls are seen as possibly dead so go ahead and + // delete the instruction now. + if (CallInst *CI = dyn_cast<CallInst>(&Call)) { + Instruction *I = tryOptimizeCall(CI); + // If we changed something return the result, etc. Otherwise let + // the fallthrough check. + if (I) return eraseInstFromFunction(*I); + } + + if (isAllocLikeFn(&Call, &TLI)) + return visitAllocSite(Call); + + return Changed ? &Call : nullptr; +} + +/// If the callee is a constexpr cast of a function, attempt to move the cast to +/// the arguments of the call/callbr/invoke. +bool InstCombiner::transformConstExprCastCall(CallBase &Call) { + auto *Callee = dyn_cast<Function>(Call.getCalledValue()->stripPointerCasts()); + if (!Callee) + return false; + + // If this is a call to a thunk function, don't remove the cast. Thunks are + // used to transparently forward all incoming parameters and outgoing return + // values, so it's important to leave the cast in place. + if (Callee->hasFnAttribute("thunk")) + return false; + + // If this is a musttail call, the callee's prototype must match the caller's + // prototype with the exception of pointee types. The code below doesn't + // implement that, so we can't do this transform. + // TODO: Do the transform if it only requires adding pointer casts. + if (Call.isMustTailCall()) + return false; + + Instruction *Caller = &Call; + const AttributeList &CallerPAL = Call.getAttributes(); + + // Okay, this is a cast from a function to a different type. Unless doing so + // would cause a type conversion of one of our arguments, change this call to + // be a direct call with arguments casted to the appropriate types. + FunctionType *FT = Callee->getFunctionType(); + Type *OldRetTy = Caller->getType(); + Type *NewRetTy = FT->getReturnType(); + + // Check to see if we are changing the return type... + if (OldRetTy != NewRetTy) { + + if (NewRetTy->isStructTy()) + return false; // TODO: Handle multiple return values. + + if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) { + if (Callee->isDeclaration()) + return false; // Cannot transform this return value. + + if (!Caller->use_empty() && + // void -> non-void is handled specially + !NewRetTy->isVoidTy()) + return false; // Cannot transform this return value. + } + + if (!CallerPAL.isEmpty() && !Caller->use_empty()) { + AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); + if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) + return false; // Attribute not compatible with transformed value. 
+ } + + // If the callbase is an invoke/callbr instruction, and the return value is + // used by a PHI node in a successor, we cannot change the return type of + // the call because there is no place to put the cast instruction (without + // breaking the critical edge). Bail out in this case. + if (!Caller->use_empty()) { + if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) + for (User *U : II->users()) + if (PHINode *PN = dyn_cast<PHINode>(U)) + if (PN->getParent() == II->getNormalDest() || + PN->getParent() == II->getUnwindDest()) + return false; + // FIXME: Be conservative for callbr to avoid a quadratic search. + if (isa<CallBrInst>(Caller)) + return false; + } + } + + unsigned NumActualArgs = Call.arg_size(); + unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs); + + // Prevent us turning: + // declare void @takes_i32_inalloca(i32* inalloca) + // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0) + // + // into: + // call void @takes_i32_inalloca(i32* null) + // + // Similarly, avoid folding away bitcasts of byval calls. + if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) || + Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal)) + return false; + + auto AI = Call.arg_begin(); + for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) { + Type *ParamTy = FT->getParamType(i); + Type *ActTy = (*AI)->getType(); + + if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL)) + return false; // Cannot transform this parameter value. + + if (AttrBuilder(CallerPAL.getParamAttributes(i)) + .overlaps(AttributeFuncs::typeIncompatible(ParamTy))) + return false; // Attribute not compatible with transformed value. + + if (Call.isInAllocaArgument(i)) + return false; // Cannot transform to and from inalloca. + + // If the parameter is passed as a byval argument, then we have to have a + // sized type and the sized type has to have the same size as the old type. + if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) { + PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); + if (!ParamPTy || !ParamPTy->getElementType()->isSized()) + return false; + + Type *CurElTy = Call.getParamByValType(i); + if (DL.getTypeAllocSize(CurElTy) != + DL.getTypeAllocSize(ParamPTy->getElementType())) + return false; + } + } + + if (Callee->isDeclaration()) { + // Do not delete arguments unless we have a function body. + if (FT->getNumParams() < NumActualArgs && !FT->isVarArg()) + return false; + + // If the callee is just a declaration, don't change the varargsness of the + // call. We don't want to introduce a varargs call where one doesn't + // already exist. + PointerType *APTy = cast<PointerType>(Call.getCalledValue()->getType()); + if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg()) + return false; + + // If both the callee and the cast type are varargs, we still have to make + // sure the number of fixed parameters are the same or we have the same + // ABI issues as if we introduce a varargs call. + if (FT->isVarArg() && + cast<FunctionType>(APTy->getElementType())->isVarArg() && + FT->getNumParams() != + cast<FunctionType>(APTy->getElementType())->getNumParams()) + return false; + } + + if (FT->getNumParams() < NumActualArgs && FT->isVarArg() && + !CallerPAL.isEmpty()) { + // In this case we have more arguments than the new function type, but we + // won't be dropping them. Check that these extra arguments have attributes + // that are compatible with being a vararg call argument. 
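+    // In particular, bail out below if the sret attribute would end up on one
+    // of the extra arguments that are passed through the vararg area.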
+ unsigned SRetIdx; + if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) && + SRetIdx > FT->getNumParams()) + return false; + } + + // Okay, we decided that this is a safe thing to do: go ahead and start + // inserting cast instructions as necessary. + SmallVector<Value *, 8> Args; + SmallVector<AttributeSet, 8> ArgAttrs; + Args.reserve(NumActualArgs); + ArgAttrs.reserve(NumActualArgs); + + // Get any return attributes. + AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); + + // If the return value is not being used, the type may not be compatible + // with the existing attributes. Wipe out any problematic attributes. + RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy)); + + LLVMContext &Ctx = Call.getContext(); + AI = Call.arg_begin(); + for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) { + Type *ParamTy = FT->getParamType(i); + + Value *NewArg = *AI; + if ((*AI)->getType() != ParamTy) + NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy); + Args.push_back(NewArg); + + // Add any parameter attributes. + if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) { + AttrBuilder AB(CallerPAL.getParamAttributes(i)); + AB.addByValAttr(NewArg->getType()->getPointerElementType()); + ArgAttrs.push_back(AttributeSet::get(Ctx, AB)); + } else + ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); + } + + // If the function takes more arguments than the call was taking, add them + // now. + for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) { + Args.push_back(Constant::getNullValue(FT->getParamType(i))); + ArgAttrs.push_back(AttributeSet()); + } + + // If we are removing arguments to the function, emit an obnoxious warning. + if (FT->getNumParams() < NumActualArgs) { + // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722 + if (FT->isVarArg()) { + // Add all of the arguments in their promoted form to the arg list. + for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) { + Type *PTy = getPromotedType((*AI)->getType()); + Value *NewArg = *AI; + if (PTy != (*AI)->getType()) { + // Must promote to pass through va_arg area! + Instruction::CastOps opcode = + CastInst::getCastOpcode(*AI, false, PTy, false); + NewArg = Builder.CreateCast(opcode, *AI, PTy); + } + Args.push_back(NewArg); + + // Add any parameter attributes. + ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); + } + } + } + + AttributeSet FnAttrs = CallerPAL.getFnAttributes(); + + if (NewRetTy->isVoidTy()) + Caller->setName(""); // Void type should not have a name. + + assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) && + "missing argument attributes"); + AttributeList NewCallerPAL = AttributeList::get( + Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs); + + SmallVector<OperandBundleDef, 1> OpBundles; + Call.getOperandBundlesAsDefs(OpBundles); + + CallBase *NewCall; + if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { + NewCall = Builder.CreateInvoke(Callee, II->getNormalDest(), + II->getUnwindDest(), Args, OpBundles); + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) { + NewCall = Builder.CreateCallBr(Callee, CBI->getDefaultDest(), + CBI->getIndirectDests(), Args, OpBundles); + } else { + NewCall = Builder.CreateCall(Callee, Args, OpBundles); + cast<CallInst>(NewCall)->setTailCallKind( + cast<CallInst>(Caller)->getTailCallKind()); + } + NewCall->takeName(Caller); + NewCall->setCallingConv(Call.getCallingConv()); + NewCall->setAttributes(NewCallerPAL); + + // Preserve the weight metadata for the new call instruction. 
The metadata + // is used by SamplePGO to check callsite's hotness. + uint64_t W; + if (Caller->extractProfTotalWeight(W)) + NewCall->setProfWeight(W); + + // Insert a cast of the return type as necessary. + Instruction *NC = NewCall; + Value *NV = NC; + if (OldRetTy != NV->getType() && !Caller->use_empty()) { + if (!NV->getType()->isVoidTy()) { + NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy); + NC->setDebugLoc(Caller->getDebugLoc()); + + // If this is an invoke/callbr instruction, we should insert it after the + // first non-phi instruction in the normal successor block. + if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { + BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt(); + InsertNewInstBefore(NC, *I); + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) { + BasicBlock::iterator I = CBI->getDefaultDest()->getFirstInsertionPt(); + InsertNewInstBefore(NC, *I); + } else { + // Otherwise, it's a call, just insert cast right after the call. + InsertNewInstBefore(NC, *Caller); + } + Worklist.AddUsersToWorkList(*Caller); + } else { + NV = UndefValue::get(Caller->getType()); + } + } + + if (!Caller->use_empty()) + replaceInstUsesWith(*Caller, NV); + else if (Caller->hasValueHandle()) { + if (OldRetTy == NV->getType()) + ValueHandleBase::ValueIsRAUWd(Caller, NV); + else + // We cannot call ValueIsRAUWd with a different type, and the + // actual tracked value will disappear. + ValueHandleBase::ValueIsDeleted(Caller); + } + + eraseInstFromFunction(*Caller); + return true; +} + +/// Turn a call to a function created by init_trampoline / adjust_trampoline +/// intrinsic pair into a direct call to the underlying function. +Instruction * +InstCombiner::transformCallThroughTrampoline(CallBase &Call, + IntrinsicInst &Tramp) { + Value *Callee = Call.getCalledValue(); + Type *CalleeTy = Callee->getType(); + FunctionType *FTy = Call.getFunctionType(); + AttributeList Attrs = Call.getAttributes(); + + // If the call already has the 'nest' attribute somewhere then give up - + // otherwise 'nest' would occur twice after splicing in the chain. + if (Attrs.hasAttrSomewhere(Attribute::Nest)) + return nullptr; + + Function *NestF = cast<Function>(Tramp.getArgOperand(1)->stripPointerCasts()); + FunctionType *NestFTy = NestF->getFunctionType(); + + AttributeList NestAttrs = NestF->getAttributes(); + if (!NestAttrs.isEmpty()) { + unsigned NestArgNo = 0; + Type *NestTy = nullptr; + AttributeSet NestAttr; + + // Look for a parameter marked with the 'nest' attribute. + for (FunctionType::param_iterator I = NestFTy->param_begin(), + E = NestFTy->param_end(); + I != E; ++NestArgNo, ++I) { + AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo); + if (AS.hasAttribute(Attribute::Nest)) { + // Record the parameter type and any other attributes. + NestTy = *I; + NestAttr = AS; + break; + } + } + + if (NestTy) { + std::vector<Value*> NewArgs; + std::vector<AttributeSet> NewArgAttrs; + NewArgs.reserve(Call.arg_size() + 1); + NewArgAttrs.reserve(Call.arg_size()); + + // Insert the nest argument into the call argument list, which may + // mean appending it. Likewise for attributes. + + { + unsigned ArgNo = 0; + auto I = Call.arg_begin(), E = Call.arg_end(); + do { + if (ArgNo == NestArgNo) { + // Add the chain argument and attributes. 
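+            // The chain value itself is the third operand (<nval>) of the
+            // llvm.init.trampoline call found earlier.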
+ Value *NestVal = Tramp.getArgOperand(2); + if (NestVal->getType() != NestTy) + NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest"); + NewArgs.push_back(NestVal); + NewArgAttrs.push_back(NestAttr); + } + + if (I == E) + break; + + // Add the original argument and attributes. + NewArgs.push_back(*I); + NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo)); + + ++ArgNo; + ++I; + } while (true); + } + + // The trampoline may have been bitcast to a bogus type (FTy). + // Handle this by synthesizing a new function type, equal to FTy + // with the chain parameter inserted. + + std::vector<Type*> NewTypes; + NewTypes.reserve(FTy->getNumParams()+1); + + // Insert the chain's type into the list of parameter types, which may + // mean appending it. + { + unsigned ArgNo = 0; + FunctionType::param_iterator I = FTy->param_begin(), + E = FTy->param_end(); + + do { + if (ArgNo == NestArgNo) + // Add the chain's type. + NewTypes.push_back(NestTy); + + if (I == E) + break; + + // Add the original type. + NewTypes.push_back(*I); + + ++ArgNo; + ++I; + } while (true); + } + + // Replace the trampoline call with a direct call. Let the generic + // code sort out any function type mismatches. + FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes, + FTy->isVarArg()); + Constant *NewCallee = + NestF->getType() == PointerType::getUnqual(NewFTy) ? + NestF : ConstantExpr::getBitCast(NestF, + PointerType::getUnqual(NewFTy)); + AttributeList NewPAL = + AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(), + Attrs.getRetAttributes(), NewArgAttrs); + + SmallVector<OperandBundleDef, 1> OpBundles; + Call.getOperandBundlesAsDefs(OpBundles); + + Instruction *NewCaller; + if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) { + NewCaller = InvokeInst::Create(NewFTy, NewCallee, + II->getNormalDest(), II->getUnwindDest(), + NewArgs, OpBundles); + cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv()); + cast<InvokeInst>(NewCaller)->setAttributes(NewPAL); + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(&Call)) { + NewCaller = + CallBrInst::Create(NewFTy, NewCallee, CBI->getDefaultDest(), + CBI->getIndirectDests(), NewArgs, OpBundles); + cast<CallBrInst>(NewCaller)->setCallingConv(CBI->getCallingConv()); + cast<CallBrInst>(NewCaller)->setAttributes(NewPAL); + } else { + NewCaller = CallInst::Create(NewFTy, NewCallee, NewArgs, OpBundles); + cast<CallInst>(NewCaller)->setTailCallKind( + cast<CallInst>(Call).getTailCallKind()); + cast<CallInst>(NewCaller)->setCallingConv( + cast<CallInst>(Call).getCallingConv()); + cast<CallInst>(NewCaller)->setAttributes(NewPAL); + } + NewCaller->setDebugLoc(Call.getDebugLoc()); + + return NewCaller; + } + } + + // Replace the trampoline call with a direct call. Since there is no 'nest' + // parameter, there is no need to adjust the argument list. Let the generic + // code sort out any function type mismatches. + Constant *NewCallee = ConstantExpr::getBitCast(NestF, CalleeTy); + Call.setCalledFunction(FTy, NewCallee); + return &Call; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp new file mode 100644 index 000000000000..65aaef28d87a --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -0,0 +1,2492 @@ +//===- InstCombineCasts.cpp -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the visit functions for cast operations. +// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/KnownBits.h" +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "instcombine" + +/// Analyze 'Val', seeing if it is a simple linear expression. +/// If so, decompose it, returning some value X, such that Val is +/// X*Scale+Offset. +/// +static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale, + uint64_t &Offset) { + if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { + Offset = CI->getZExtValue(); + Scale = 0; + return ConstantInt::get(Val->getType(), 0); + } + + if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) { + // Cannot look past anything that might overflow. + OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val); + if (OBI && !OBI->hasNoUnsignedWrap() && !OBI->hasNoSignedWrap()) { + Scale = 1; + Offset = 0; + return Val; + } + + if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) { + if (I->getOpcode() == Instruction::Shl) { + // This is a value scaled by '1 << the shift amt'. + Scale = UINT64_C(1) << RHS->getZExtValue(); + Offset = 0; + return I->getOperand(0); + } + + if (I->getOpcode() == Instruction::Mul) { + // This value is scaled by 'RHS'. + Scale = RHS->getZExtValue(); + Offset = 0; + return I->getOperand(0); + } + + if (I->getOpcode() == Instruction::Add) { + // We have X+C. Check to see if we really have (X*C2)+C1, + // where C1 is divisible by C2. + unsigned SubScale; + Value *SubVal = + decomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); + Offset += RHS->getZExtValue(); + Scale = SubScale; + return SubVal; + } + } + } + + // Otherwise, we can't look past this. + Scale = 1; + Offset = 0; + return Val; +} + +/// If we find a cast of an allocation instruction, try to eliminate the cast by +/// moving the type information into the alloc. +Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, + AllocaInst &AI) { + PointerType *PTy = cast<PointerType>(CI.getType()); + + BuilderTy AllocaBuilder(Builder); + AllocaBuilder.SetInsertPoint(&AI); + + // Get the type really allocated and the type casted to. + Type *AllocElTy = AI.getAllocatedType(); + Type *CastElTy = PTy->getElementType(); + if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr; + + unsigned AllocElTyAlign = DL.getABITypeAlignment(AllocElTy); + unsigned CastElTyAlign = DL.getABITypeAlignment(CastElTy); + if (CastElTyAlign < AllocElTyAlign) return nullptr; + + // If the allocation has multiple uses, only promote it if we are strictly + // increasing the alignment of the resultant allocation. If we keep it the + // same, we open the door to infinite loops of various kinds. 
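+  // For example (illustrative), an "alloca i32" cast to float* has equal
+  // alignment both ways, so repeatedly promoting it could ping-pong between
+  // the two allocation types.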
+ if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr; + + uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy); + uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy); + if (CastElTySize == 0 || AllocElTySize == 0) return nullptr; + + // If the allocation has multiple uses, only promote it if we're not + // shrinking the amount of memory being allocated. + uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy); + uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy); + if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr; + + // See if we can satisfy the modulus by pulling a scale out of the array + // size argument. + unsigned ArraySizeScale; + uint64_t ArrayOffset; + Value *NumElements = // See if the array size is a decomposable linear expr. + decomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); + + // If we can now satisfy the modulus, by using a non-1 scale, we really can + // do the xform. + if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 || + (AllocElTySize*ArrayOffset ) % CastElTySize != 0) return nullptr; + + unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize; + Value *Amt = nullptr; + if (Scale == 1) { + Amt = NumElements; + } else { + Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale); + // Insert before the alloca, not before the cast. + Amt = AllocaBuilder.CreateMul(Amt, NumElements); + } + + if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) { + Value *Off = ConstantInt::get(AI.getArraySize()->getType(), + Offset, true); + Amt = AllocaBuilder.CreateAdd(Amt, Off); + } + + AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt); + New->setAlignment(MaybeAlign(AI.getAlignment())); + New->takeName(&AI); + New->setUsedWithInAlloca(AI.isUsedWithInAlloca()); + + // If the allocation has multiple real uses, insert a cast and change all + // things that used it to use the new cast. This will also hack on CI, but it + // will die soon. + if (!AI.hasOneUse()) { + // New is the allocation instruction, pointer typed. AI is the original + // allocation instruction, also pointer typed. Thus, cast to use is BitCast. + Value *NewCast = AllocaBuilder.CreateBitCast(New, AI.getType(), "tmpcast"); + replaceInstUsesWith(AI, NewCast); + } + return replaceInstUsesWith(CI, New); +} + +/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns +/// true for, actually insert the code to evaluate the expression. +Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, + bool isSigned) { + if (Constant *C = dyn_cast<Constant>(V)) { + C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/); + // If we got a constantexpr back, try to simplify it with DL info. + if (Constant *FoldedC = ConstantFoldConstant(C, DL, &TLI)) + C = FoldedC; + return C; + } + + // Otherwise, it must be an instruction. 
+ Instruction *I = cast<Instruction>(V); + Instruction *Res = nullptr; + unsigned Opc = I->getOpcode(); + switch (Opc) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::AShr: + case Instruction::LShr: + case Instruction::Shl: + case Instruction::UDiv: + case Instruction::URem: { + Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned); + Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned); + Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS); + break; + } + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + // If the source type of the cast is the type we're trying for then we can + // just return the source. There's no need to insert it because it is not + // new. + if (I->getOperand(0)->getType() == Ty) + return I->getOperand(0); + + // Otherwise, must be the same type of cast, so just reinsert a new one. + // This also handles the case of zext(trunc(x)) -> zext(x). + Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty, + Opc == Instruction::SExt); + break; + case Instruction::Select: { + Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned); + Value *False = EvaluateInDifferentType(I->getOperand(2), Ty, isSigned); + Res = SelectInst::Create(I->getOperand(0), True, False); + break; + } + case Instruction::PHI: { + PHINode *OPN = cast<PHINode>(I); + PHINode *NPN = PHINode::Create(Ty, OPN->getNumIncomingValues()); + for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) { + Value *V = + EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned); + NPN->addIncoming(V, OPN->getIncomingBlock(i)); + } + Res = NPN; + break; + } + default: + // TODO: Can handle more cases here. + llvm_unreachable("Unreachable!"); + } + + Res->takeName(I); + return InsertNewInstWith(Res, *I); +} + +Instruction::CastOps InstCombiner::isEliminableCastPair(const CastInst *CI1, + const CastInst *CI2) { + Type *SrcTy = CI1->getSrcTy(); + Type *MidTy = CI1->getDestTy(); + Type *DstTy = CI2->getDestTy(); + + Instruction::CastOps firstOp = CI1->getOpcode(); + Instruction::CastOps secondOp = CI2->getOpcode(); + Type *SrcIntPtrTy = + SrcTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(SrcTy) : nullptr; + Type *MidIntPtrTy = + MidTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(MidTy) : nullptr; + Type *DstIntPtrTy = + DstTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(DstTy) : nullptr; + unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy, + DstTy, SrcIntPtrTy, MidIntPtrTy, + DstIntPtrTy); + + // We don't want to form an inttoptr or ptrtoint that converts to an integer + // type that differs from the pointer size. + if ((Res == Instruction::IntToPtr && SrcTy != DstIntPtrTy) || + (Res == Instruction::PtrToInt && DstTy != SrcIntPtrTy)) + Res = 0; + + return Instruction::CastOps(Res); +} + +/// Implement the transforms common to all CastInst visitors. +Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { + Value *Src = CI.getOperand(0); + + // Try to eliminate a cast of a cast. + if (auto *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast + if (Instruction::CastOps NewOpc = isEliminableCastPair(CSrc, &CI)) { + // The first cast (CSrc) is eliminable so we need to fix up or replace + // the second cast (CI). CSrc will then have a good chance of being dead. 
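+      // For instance, "trunc (zext i8 %x to i32) to i16" becomes a single
+      // "zext i8 %x to i16" here.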
+ auto *Ty = CI.getType(); + auto *Res = CastInst::Create(NewOpc, CSrc->getOperand(0), Ty); + // Point debug users of the dying cast to the new one. + if (CSrc->hasOneUse()) + replaceAllDbgUsesWith(*CSrc, *Res, CI, DT); + return Res; + } + } + + if (auto *Sel = dyn_cast<SelectInst>(Src)) { + // We are casting a select. Try to fold the cast into the select, but only + // if the select does not have a compare instruction with matching operand + // types. Creating a select with operands that are different sizes than its + // condition may inhibit other folds and lead to worse codegen. + auto *Cmp = dyn_cast<CmpInst>(Sel->getCondition()); + if (!Cmp || Cmp->getOperand(0)->getType() != Sel->getType()) + if (Instruction *NV = FoldOpIntoSelect(CI, Sel)) { + replaceAllDbgUsesWith(*Sel, *NV, CI, DT); + return NV; + } + } + + // If we are casting a PHI, then fold the cast into the PHI. + if (auto *PN = dyn_cast<PHINode>(Src)) { + // Don't do this if it would create a PHI node with an illegal type from a + // legal type. + if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() || + shouldChangeType(CI.getType(), Src->getType())) + if (Instruction *NV = foldOpIntoPhi(CI, PN)) + return NV; + } + + return nullptr; +} + +/// Constants and extensions/truncates from the destination type are always +/// free to be evaluated in that type. This is a helper for canEvaluate*. +static bool canAlwaysEvaluateInType(Value *V, Type *Ty) { + if (isa<Constant>(V)) + return true; + Value *X; + if ((match(V, m_ZExtOrSExt(m_Value(X))) || match(V, m_Trunc(m_Value(X)))) && + X->getType() == Ty) + return true; + + return false; +} + +/// Filter out values that we can not evaluate in the destination type for free. +/// This is a helper for canEvaluate*. +static bool canNotEvaluateInType(Value *V, Type *Ty) { + assert(!isa<Constant>(V) && "Constant should already be handled."); + if (!isa<Instruction>(V)) + return true; + // We don't extend or shrink something that has multiple uses -- doing so + // would require duplicating the instruction which isn't profitable. + if (!V->hasOneUse()) + return true; + + return false; +} + +/// Return true if we can evaluate the specified expression tree as type Ty +/// instead of its larger type, and arrive with the same value. +/// This is used by code that tries to eliminate truncates. +/// +/// Ty will always be a type smaller than V. We should return true if trunc(V) +/// can be computed by computing V in the smaller type. If V is an instruction, +/// then trunc(inst(x,y)) can be computed as inst(trunc(x),trunc(y)), which only +/// makes sense if x and y can be efficiently truncated. +/// +/// This function works on both vectors and scalars. +/// +static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, + Instruction *CxtI) { + if (canAlwaysEvaluateInType(V, Ty)) + return true; + if (canNotEvaluateInType(V, Ty)) + return false; + + auto *I = cast<Instruction>(V); + Type *OrigTy = V->getType(); + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + // These operators can all arbitrarily be extended or truncated. + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); + + case Instruction::UDiv: + case Instruction::URem: { + // UDiv and URem can be truncated if all the truncated bits are zero. 
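+    // e.g. a udiv of two i32 values that are both known to fit in 8 bits can
+    // be evaluated as an i8 udiv after truncating the operands.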
+ uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits(); + uint32_t BitWidth = Ty->getScalarSizeInBits(); + assert(BitWidth < OrigBitWidth && "Unexpected bitwidths!"); + APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); + if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) && + IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) { + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); + } + break; + } + case Instruction::Shl: { + // If we are truncating the result of this SHL, and if it's a shift of a + // constant amount, we can always perform a SHL in a smaller type. + const APInt *Amt; + if (match(I->getOperand(1), m_APInt(Amt))) { + uint32_t BitWidth = Ty->getScalarSizeInBits(); + if (Amt->getLimitedValue(BitWidth) < BitWidth) + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); + } + break; + } + case Instruction::LShr: { + // If this is a truncate of a logical shr, we can truncate it to a smaller + // lshr iff we know that the bits we would otherwise be shifting in are + // already zeros. + const APInt *Amt; + if (match(I->getOperand(1), m_APInt(Amt))) { + uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits(); + uint32_t BitWidth = Ty->getScalarSizeInBits(); + if (Amt->getLimitedValue(BitWidth) < BitWidth && + IC.MaskedValueIsZero(I->getOperand(0), + APInt::getBitsSetFrom(OrigBitWidth, BitWidth), 0, CxtI)) { + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); + } + } + break; + } + case Instruction::AShr: { + // If this is a truncate of an arithmetic shr, we can truncate it to a + // smaller ashr iff we know that all the bits from the sign bit of the + // original type and the sign bit of the truncate type are similar. + // TODO: It is enough to check that the bits we would be shifting in are + // similar to sign bit of the truncate type. + const APInt *Amt; + if (match(I->getOperand(1), m_APInt(Amt))) { + uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits(); + uint32_t BitWidth = Ty->getScalarSizeInBits(); + if (Amt->getLimitedValue(BitWidth) < BitWidth && + OrigBitWidth - BitWidth < + IC.ComputeNumSignBits(I->getOperand(0), 0, CxtI)) + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); + } + break; + } + case Instruction::Trunc: + // trunc(trunc(x)) -> trunc(x) + return true; + case Instruction::ZExt: + case Instruction::SExt: + // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest + // trunc(ext(x)) -> trunc(x) if the source type is larger than the new dest + return true; + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(I); + return canEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) && + canEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI); + } + case Instruction::PHI: { + // We can change a phi if we can change all operands. Note that we never + // get into trouble with cyclic PHIs here because we only consider + // instructions with a single use. + PHINode *PN = cast<PHINode>(I); + for (Value *IncValue : PN->incoming_values()) + if (!canEvaluateTruncated(IncValue, Ty, IC, CxtI)) + return false; + return true; + } + default: + // TODO: Can handle more cases here. + break; + } + + return false; +} + +/// Given a vector that is bitcast to an integer, optionally logically +/// right-shifted, and truncated, convert it to an extractelement. 
+/// Example (big endian): +/// trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32 +/// ---> +/// extractelement <4 x i32> %X, 1 +static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) { + Value *TruncOp = Trunc.getOperand(0); + Type *DestType = Trunc.getType(); + if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType)) + return nullptr; + + Value *VecInput = nullptr; + ConstantInt *ShiftVal = nullptr; + if (!match(TruncOp, m_CombineOr(m_BitCast(m_Value(VecInput)), + m_LShr(m_BitCast(m_Value(VecInput)), + m_ConstantInt(ShiftVal)))) || + !isa<VectorType>(VecInput->getType())) + return nullptr; + + VectorType *VecType = cast<VectorType>(VecInput->getType()); + unsigned VecWidth = VecType->getPrimitiveSizeInBits(); + unsigned DestWidth = DestType->getPrimitiveSizeInBits(); + unsigned ShiftAmount = ShiftVal ? ShiftVal->getZExtValue() : 0; + + if ((VecWidth % DestWidth != 0) || (ShiftAmount % DestWidth != 0)) + return nullptr; + + // If the element type of the vector doesn't match the result type, + // bitcast it to a vector type that we can extract from. + unsigned NumVecElts = VecWidth / DestWidth; + if (VecType->getElementType() != DestType) { + VecType = VectorType::get(DestType, NumVecElts); + VecInput = IC.Builder.CreateBitCast(VecInput, VecType, "bc"); + } + + unsigned Elt = ShiftAmount / DestWidth; + if (IC.getDataLayout().isBigEndian()) + Elt = NumVecElts - 1 - Elt; + + return ExtractElementInst::Create(VecInput, IC.Builder.getInt32(Elt)); +} + +/// Rotate left/right may occur in a wider type than necessary because of type +/// promotion rules. Try to narrow the inputs and convert to funnel shift. +Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) { + assert((isa<VectorType>(Trunc.getSrcTy()) || + shouldChangeType(Trunc.getSrcTy(), Trunc.getType())) && + "Don't narrow to an illegal scalar type"); + + // Bail out on strange types. It is possible to handle some of these patterns + // even with non-power-of-2 sizes, but it is not a likely scenario. + Type *DestTy = Trunc.getType(); + unsigned NarrowWidth = DestTy->getScalarSizeInBits(); + if (!isPowerOf2_32(NarrowWidth)) + return nullptr; + + // First, find an or'd pair of opposite shifts with the same shifted operand: + // trunc (or (lshr ShVal, ShAmt0), (shl ShVal, ShAmt1)) + Value *Or0, *Or1; + if (!match(Trunc.getOperand(0), m_OneUse(m_Or(m_Value(Or0), m_Value(Or1))))) + return nullptr; + + Value *ShVal, *ShAmt0, *ShAmt1; + if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(ShVal), m_Value(ShAmt0)))) || + !match(Or1, m_OneUse(m_LogicalShift(m_Specific(ShVal), m_Value(ShAmt1))))) + return nullptr; + + auto ShiftOpcode0 = cast<BinaryOperator>(Or0)->getOpcode(); + auto ShiftOpcode1 = cast<BinaryOperator>(Or1)->getOpcode(); + if (ShiftOpcode0 == ShiftOpcode1) + return nullptr; + + // Match the shift amount operands for a rotate pattern. This always matches + // a subtraction on the R operand. 
+ auto matchShiftAmount = [](Value *L, Value *R, unsigned Width) -> Value * { + // The shift amounts may add up to the narrow bit width: + // (shl ShVal, L) | (lshr ShVal, Width - L) + if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L))))) + return L; + + // The shift amount may be masked with negation: + // (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1))) + Value *X; + unsigned Mask = Width - 1; + if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) && + match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask)))) + return X; + + // Same as above, but the shift amount may be extended after masking: + if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) && + match(R, m_ZExt(m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))) + return X; + + return nullptr; + }; + + Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, NarrowWidth); + bool SubIsOnLHS = false; + if (!ShAmt) { + ShAmt = matchShiftAmount(ShAmt1, ShAmt0, NarrowWidth); + SubIsOnLHS = true; + } + if (!ShAmt) + return nullptr; + + // The shifted value must have high zeros in the wide type. Typically, this + // will be a zext, but it could also be the result of an 'and' or 'shift'. + unsigned WideWidth = Trunc.getSrcTy()->getScalarSizeInBits(); + APInt HiBitMask = APInt::getHighBitsSet(WideWidth, WideWidth - NarrowWidth); + if (!MaskedValueIsZero(ShVal, HiBitMask, 0, &Trunc)) + return nullptr; + + // We have an unnecessarily wide rotate! + // trunc (or (lshr ShVal, ShAmt), (shl ShVal, BitWidth - ShAmt)) + // Narrow the inputs and convert to funnel shift intrinsic: + // llvm.fshl.i8(trunc(ShVal), trunc(ShVal), trunc(ShAmt)) + Value *NarrowShAmt = Builder.CreateTrunc(ShAmt, DestTy); + Value *X = Builder.CreateTrunc(ShVal, DestTy); + bool IsFshl = (!SubIsOnLHS && ShiftOpcode0 == BinaryOperator::Shl) || + (SubIsOnLHS && ShiftOpcode1 == BinaryOperator::Shl); + Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr; + Function *F = Intrinsic::getDeclaration(Trunc.getModule(), IID, DestTy); + return IntrinsicInst::Create(F, { X, X, NarrowShAmt }); +} + +/// Try to narrow the width of math or bitwise logic instructions by pulling a +/// truncate ahead of binary operators. +/// TODO: Transforms for truncated shifts should be moved into here. 
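+/// For example (illustrative), "trunc (add (zext i8 %x to i32), 300) to i8"
+/// is narrowed so that it ultimately simplifies to "add i8 %x, 44", i.e. 300
+/// truncated to i8.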
+Instruction *InstCombiner::narrowBinOp(TruncInst &Trunc) { + Type *SrcTy = Trunc.getSrcTy(); + Type *DestTy = Trunc.getType(); + if (!isa<VectorType>(SrcTy) && !shouldChangeType(SrcTy, DestTy)) + return nullptr; + + BinaryOperator *BinOp; + if (!match(Trunc.getOperand(0), m_OneUse(m_BinOp(BinOp)))) + return nullptr; + + Value *BinOp0 = BinOp->getOperand(0); + Value *BinOp1 = BinOp->getOperand(1); + switch (BinOp->getOpcode()) { + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: { + Constant *C; + if (match(BinOp0, m_Constant(C))) { + // trunc (binop C, X) --> binop (trunc C', X) + Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy); + Value *TruncX = Builder.CreateTrunc(BinOp1, DestTy); + return BinaryOperator::Create(BinOp->getOpcode(), NarrowC, TruncX); + } + if (match(BinOp1, m_Constant(C))) { + // trunc (binop X, C) --> binop (trunc X, C') + Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy); + Value *TruncX = Builder.CreateTrunc(BinOp0, DestTy); + return BinaryOperator::Create(BinOp->getOpcode(), TruncX, NarrowC); + } + Value *X; + if (match(BinOp0, m_ZExtOrSExt(m_Value(X))) && X->getType() == DestTy) { + // trunc (binop (ext X), Y) --> binop X, (trunc Y) + Value *NarrowOp1 = Builder.CreateTrunc(BinOp1, DestTy); + return BinaryOperator::Create(BinOp->getOpcode(), X, NarrowOp1); + } + if (match(BinOp1, m_ZExtOrSExt(m_Value(X))) && X->getType() == DestTy) { + // trunc (binop Y, (ext X)) --> binop (trunc Y), X + Value *NarrowOp0 = Builder.CreateTrunc(BinOp0, DestTy); + return BinaryOperator::Create(BinOp->getOpcode(), NarrowOp0, X); + } + break; + } + + default: break; + } + + if (Instruction *NarrowOr = narrowRotate(Trunc)) + return NarrowOr; + + return nullptr; +} + +/// Try to narrow the width of a splat shuffle. This could be generalized to any +/// shuffle with a constant operand, but we limit the transform to avoid +/// creating a shuffle type that targets may not be able to lower effectively. +static Instruction *shrinkSplatShuffle(TruncInst &Trunc, + InstCombiner::BuilderTy &Builder) { + auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0)); + if (Shuf && Shuf->hasOneUse() && isa<UndefValue>(Shuf->getOperand(1)) && + Shuf->getMask()->getSplatValue() && + Shuf->getType() == Shuf->getOperand(0)->getType()) { + // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Undef, SplatMask + Constant *NarrowUndef = UndefValue::get(Trunc.getType()); + Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType()); + return new ShuffleVectorInst(NarrowOp, NarrowUndef, Shuf->getMask()); + } + + return nullptr; +} + +/// Try to narrow the width of an insert element. This could be generalized for +/// any vector constant, but we limit the transform to insertion into undef to +/// avoid potential backend problems from unsupported insertion widths. This +/// could also be extended to handle the case of inserting a scalar constant +/// into a vector variable. 
+static Instruction *shrinkInsertElt(CastInst &Trunc,
+                                    InstCombiner::BuilderTy &Builder) {
+  Instruction::CastOps Opcode = Trunc.getOpcode();
+  assert((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
+         "Unexpected instruction for shrinking");
+
+  auto *InsElt = dyn_cast<InsertElementInst>(Trunc.getOperand(0));
+  if (!InsElt || !InsElt->hasOneUse())
+    return nullptr;
+
+  Type *DestTy = Trunc.getType();
+  Type *DestScalarTy = DestTy->getScalarType();
+  Value *VecOp = InsElt->getOperand(0);
+  Value *ScalarOp = InsElt->getOperand(1);
+  Value *Index = InsElt->getOperand(2);
+
+  if (isa<UndefValue>(VecOp)) {
+    // trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index
+    // fptrunc (inselt undef, X, Index) --> inselt undef, (fptrunc X), Index
+    UndefValue *NarrowUndef = UndefValue::get(DestTy);
+    Value *NarrowOp = Builder.CreateCast(Opcode, ScalarOp, DestScalarTy);
+    return InsertElementInst::Create(NarrowUndef, NarrowOp, Index);
+  }
+
+  return nullptr;
+}
+
+Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
+  if (Instruction *Result = commonCastTransforms(CI))
+    return Result;
+
+  Value *Src = CI.getOperand(0);
+  Type *DestTy = CI.getType(), *SrcTy = Src->getType();
+
+  // Attempt to truncate the entire input expression tree to the destination
+  // type. Only do this if the dest type is a simple type, don't convert the
+  // expression tree to something weird like i93 unless the source is also
+  // strange.
+  if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
+      canEvaluateTruncated(Src, DestTy, *this, &CI)) {
+
+    // If this cast is a truncate, evaluating in a different type always
+    // eliminates the cast, so it is always a win.
+    LLVM_DEBUG(
+        dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+                  " to avoid cast: "
+               << CI << '\n');
+    Value *Res = EvaluateInDifferentType(Src, DestTy, false);
+    assert(Res->getType() == DestTy);
+    return replaceInstUsesWith(CI, Res);
+  }
+
+  // Test if the trunc is the user of a select which is part of a
+  // minimum or maximum operation. If so, don't do any more simplification.
+  // Even simplifying demanded bits can break the canonical form of a
+  // min/max.
+  Value *LHS, *RHS;
+  if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)))
+    if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)
+      return nullptr;
+
+  // See if we can simplify any instructions used by the input whose sole
+  // purpose is to compute bits we don't care about.
+  if (SimplifyDemandedInstructionBits(CI))
+    return &CI;
+
+  if (DestTy->getScalarSizeInBits() == 1) {
+    Value *Zero = Constant::getNullValue(Src->getType());
+    if (DestTy->isIntegerTy()) {
+      // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
+      // TODO: We canonicalize to more instructions here because we are probably
+      // lacking equivalent analysis for trunc relative to icmp. There may also
+      // be codegen concerns. If those trunc limitations were removed, we could
+      // remove this transform.
+      Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
+
+    // For vectors, we do not canonicalize all truncs to icmp, so optimize
+    // patterns that would be covered within visitICmpInst.
+ Value *X; + const APInt *C; + if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) { + // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0 + APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C); + Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC)); + return new ICmpInst(ICmpInst::ICMP_NE, And, Zero); + } + if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)), + m_Deferred(X))))) { + // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0 + APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1; + Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC)); + return new ICmpInst(ICmpInst::ICMP_NE, And, Zero); + } + } + + // FIXME: Maybe combine the next two transforms to handle the no cast case + // more efficiently. Support vector types. Cleanup code by using m_OneUse. + + // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion. + Value *A = nullptr; ConstantInt *Cst = nullptr; + if (Src->hasOneUse() && + match(Src, m_LShr(m_ZExt(m_Value(A)), m_ConstantInt(Cst)))) { + // We have three types to worry about here, the type of A, the source of + // the truncate (MidSize), and the destination of the truncate. We know that + // ASize < MidSize and MidSize > ResultSize, but don't know the relation + // between ASize and ResultSize. + unsigned ASize = A->getType()->getPrimitiveSizeInBits(); + + // If the shift amount is larger than the size of A, then the result is + // known to be zero because all the input bits got shifted out. + if (Cst->getZExtValue() >= ASize) + return replaceInstUsesWith(CI, Constant::getNullValue(DestTy)); + + // Since we're doing an lshr and a zero extend, and know that the shift + // amount is smaller than ASize, it is always safe to do the shift in A's + // type, then zero extend or truncate to the result. + Value *Shift = Builder.CreateLShr(A, Cst->getZExtValue()); + Shift->takeName(Src); + return CastInst::CreateIntegerCast(Shift, DestTy, false); + } + + // FIXME: We should canonicalize to zext/trunc and remove this transform. + // Transform trunc(lshr (sext A), Cst) to ashr A, Cst to eliminate type + // conversion. + // It works because bits coming from sign extension have the same value as + // the sign bit of the original value; performing ashr instead of lshr + // generates bits of the same value as the sign bit. + if (Src->hasOneUse() && + match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) { + Value *SExt = cast<Instruction>(Src)->getOperand(0); + const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits(); + const unsigned ASize = A->getType()->getPrimitiveSizeInBits(); + const unsigned CISize = CI.getType()->getPrimitiveSizeInBits(); + const unsigned MaxAmt = SExtSize - std::max(CISize, ASize); + unsigned ShiftAmt = Cst->getZExtValue(); + + // This optimization can be only performed when zero bits generated by + // the original lshr aren't pulled into the value after truncation, so we + // can only shift by values no larger than the number of extension bits. + // FIXME: Instead of bailing when the shift is too large, use and to clear + // the extra bits. 
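+    // e.g. with A of type i8, a sext to i32 and a trunc to i16, MaxAmt is
+    // 32 - max(16, 8) = 16.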
+ if (ShiftAmt <= MaxAmt) { + if (CISize == ASize) + return BinaryOperator::CreateAShr(A, ConstantInt::get(CI.getType(), + std::min(ShiftAmt, ASize - 1))); + if (SExt->hasOneUse()) { + Value *Shift = Builder.CreateAShr(A, std::min(ShiftAmt, ASize - 1)); + Shift->takeName(Src); + return CastInst::CreateIntegerCast(Shift, CI.getType(), true); + } + } + } + + if (Instruction *I = narrowBinOp(CI)) + return I; + + if (Instruction *I = shrinkSplatShuffle(CI, Builder)) + return I; + + if (Instruction *I = shrinkInsertElt(CI, Builder)) + return I; + + if (Src->hasOneUse() && isa<IntegerType>(SrcTy) && + shouldChangeType(SrcTy, DestTy)) { + // Transform "trunc (shl X, cst)" -> "shl (trunc X), cst" so long as the + // dest type is native and cst < dest size. + if (match(Src, m_Shl(m_Value(A), m_ConstantInt(Cst))) && + !match(A, m_Shr(m_Value(), m_Constant()))) { + // Skip shifts of shift by constants. It undoes a combine in + // FoldShiftByConstant and is the extend in reg pattern. + const unsigned DestSize = DestTy->getScalarSizeInBits(); + if (Cst->getValue().ult(DestSize)) { + Value *NewTrunc = Builder.CreateTrunc(A, DestTy, A->getName() + ".tr"); + + return BinaryOperator::Create( + Instruction::Shl, NewTrunc, + ConstantInt::get(DestTy, Cst->getValue().trunc(DestSize))); + } + } + } + + if (Instruction *I = foldVecTruncToExtElt(CI, *this)) + return I; + + return nullptr; +} + +Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, + bool DoTransform) { + // If we are just checking for a icmp eq of a single bit and zext'ing it + // to an integer, then shift the bit to the appropriate place and then + // cast to integer to avoid the comparison. + const APInt *Op1CV; + if (match(ICI->getOperand(1), m_APInt(Op1CV))) { + + // zext (x <s 0) to i32 --> x>>u31 true if signbit set. + // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear. + if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isNullValue()) || + (ICI->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnesValue())) { + if (!DoTransform) return ICI; + + Value *In = ICI->getOperand(0); + Value *Sh = ConstantInt::get(In->getType(), + In->getType()->getScalarSizeInBits() - 1); + In = Builder.CreateLShr(In, Sh, In->getName() + ".lobit"); + if (In->getType() != CI.getType()) + In = Builder.CreateIntCast(In, CI.getType(), false /*ZExt*/); + + if (ICI->getPredicate() == ICmpInst::ICMP_SGT) { + Constant *One = ConstantInt::get(In->getType(), 1); + In = Builder.CreateXor(In, One, In->getName() + ".not"); + } + + return replaceInstUsesWith(CI, In); + } + + // zext (X == 0) to i32 --> X^1 iff X has only the low bit set. + // zext (X == 0) to i32 --> (X>>1)^1 iff X has only the 2nd bit set. + // zext (X == 1) to i32 --> X iff X has only the low bit set. + // zext (X == 2) to i32 --> X>>1 iff X has only the 2nd bit set. + // zext (X != 0) to i32 --> X iff X has only the low bit set. + // zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set. + // zext (X != 1) to i32 --> X^1 iff X has only the low bit set. + // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set. + if ((Op1CV->isNullValue() || Op1CV->isPowerOf2()) && + // This only works for EQ and NE + ICI->isEquality()) { + // If Op1C some other power of two, convert: + KnownBits Known = computeKnownBits(ICI->getOperand(0), 0, &CI); + + APInt KnownZeroMask(~Known.Zero); + if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? 
+ if (!DoTransform) return ICI; + + bool isNE = ICI->getPredicate() == ICmpInst::ICMP_NE; + if (!Op1CV->isNullValue() && (*Op1CV != KnownZeroMask)) { + // (X&4) == 2 --> false + // (X&4) != 2 --> true + Constant *Res = ConstantInt::get(CI.getType(), isNE); + return replaceInstUsesWith(CI, Res); + } + + uint32_t ShAmt = KnownZeroMask.logBase2(); + Value *In = ICI->getOperand(0); + if (ShAmt) { + // Perform a logical shr by shiftamt. + // Insert the shift to put the result in the low bit. + In = Builder.CreateLShr(In, ConstantInt::get(In->getType(), ShAmt), + In->getName() + ".lobit"); + } + + if (!Op1CV->isNullValue() == isNE) { // Toggle the low bit. + Constant *One = ConstantInt::get(In->getType(), 1); + In = Builder.CreateXor(In, One); + } + + if (CI.getType() == In->getType()) + return replaceInstUsesWith(CI, In); + + Value *IntCast = Builder.CreateIntCast(In, CI.getType(), false); + return replaceInstUsesWith(CI, IntCast); + } + } + } + + // icmp ne A, B is equal to xor A, B when A and B only really have one bit. + // It is also profitable to transform icmp eq into not(xor(A, B)) because that + // may lead to additional simplifications. + if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) { + if (IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) { + Value *LHS = ICI->getOperand(0); + Value *RHS = ICI->getOperand(1); + + KnownBits KnownLHS = computeKnownBits(LHS, 0, &CI); + KnownBits KnownRHS = computeKnownBits(RHS, 0, &CI); + + if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) { + APInt KnownBits = KnownLHS.Zero | KnownLHS.One; + APInt UnknownBit = ~KnownBits; + if (UnknownBit.countPopulation() == 1) { + if (!DoTransform) return ICI; + + Value *Result = Builder.CreateXor(LHS, RHS); + + // Mask off any bits that are set and won't be shifted away. + if (KnownLHS.One.uge(UnknownBit)) + Result = Builder.CreateAnd(Result, + ConstantInt::get(ITy, UnknownBit)); + + // Shift the bit we're testing down to the lsb. + Result = Builder.CreateLShr( + Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros())); + + if (ICI->getPredicate() == ICmpInst::ICMP_EQ) + Result = Builder.CreateXor(Result, ConstantInt::get(ITy, 1)); + Result->takeName(ICI); + return replaceInstUsesWith(CI, Result); + } + } + } + } + + return nullptr; +} + +/// Determine if the specified value can be computed in the specified wider type +/// and produce the same low bits. If not, return false. +/// +/// If this function returns true, it can also return a non-zero number of bits +/// (in BitsToClear) which indicates that the value it computes is correct for +/// the zero extend, but that the additional BitsToClear bits need to be zero'd +/// out. For example, to promote something like: +/// +/// %B = trunc i64 %A to i32 +/// %C = lshr i32 %B, 8 +/// %E = zext i32 %C to i64 +/// +/// CanEvaluateZExtd for the 'lshr' will return true, and BitsToClear will be +/// set to 8 to indicate that the promoted value needs to have bits 24-31 +/// cleared in addition to bits 32-63. Since an 'and' will be generated to +/// clear the top bits anyway, doing this has no extra cost. +/// +/// This function works on both vectors and scalars. 
+static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, + InstCombiner &IC, Instruction *CxtI) { + BitsToClear = 0; + if (canAlwaysEvaluateInType(V, Ty)) + return true; + if (canNotEvaluateInType(V, Ty)) + return false; + + auto *I = cast<Instruction>(V); + unsigned Tmp; + switch (I->getOpcode()) { + case Instruction::ZExt: // zext(zext(x)) -> zext(x). + case Instruction::SExt: // zext(sext(x)) -> sext(x). + case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x) + return true; + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) || + !canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI)) + return false; + // These can all be promoted if neither operand has 'bits to clear'. + if (BitsToClear == 0 && Tmp == 0) + return true; + + // If the operation is an AND/OR/XOR and the bits to clear are zero in the + // other side, BitsToClear is ok. + if (Tmp == 0 && I->isBitwiseLogicOp()) { + // We use MaskedValueIsZero here for generality, but the case we care + // about the most is constant RHS. + unsigned VSize = V->getType()->getScalarSizeInBits(); + if (IC.MaskedValueIsZero(I->getOperand(1), + APInt::getHighBitsSet(VSize, BitsToClear), + 0, CxtI)) { + // If this is an And instruction and all of the BitsToClear are + // known to be zero we can reset BitsToClear. + if (I->getOpcode() == Instruction::And) + BitsToClear = 0; + return true; + } + } + + // Otherwise, we don't know how to analyze this BitsToClear case yet. + return false; + + case Instruction::Shl: { + // We can promote shl(x, cst) if we can promote x. Since shl overwrites the + // upper bits we can reduce BitsToClear by the shift amount. + const APInt *Amt; + if (match(I->getOperand(1), m_APInt(Amt))) { + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) + return false; + uint64_t ShiftAmt = Amt->getZExtValue(); + BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0; + return true; + } + return false; + } + case Instruction::LShr: { + // We can promote lshr(x, cst) if we can promote x. This requires the + // ultimate 'and' to clear out the high zero bits we're clearing out though. + const APInt *Amt; + if (match(I->getOperand(1), m_APInt(Amt))) { + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) + return false; + BitsToClear += Amt->getZExtValue(); + if (BitsToClear > V->getType()->getScalarSizeInBits()) + BitsToClear = V->getType()->getScalarSizeInBits(); + return true; + } + // Cannot promote variable LSHR. + return false; + } + case Instruction::Select: + if (!canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) || + !canEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) || + // TODO: If important, we could handle the case when the BitsToClear are + // known zero in the disagreeing side. + Tmp != BitsToClear) + return false; + return true; + + case Instruction::PHI: { + // We can change a phi if we can change all operands. Note that we never + // get into trouble with cyclic PHIs here because we only consider + // instructions with a single use. 
+ PHINode *PN = cast<PHINode>(I); + if (!canEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI)) + return false; + for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) + if (!canEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) || + // TODO: If important, we could handle the case when the BitsToClear + // are known zero in the disagreeing input. + Tmp != BitsToClear) + return false; + return true; + } + default: + // TODO: Can handle more cases here. + return false; + } +} + +Instruction *InstCombiner::visitZExt(ZExtInst &CI) { + // If this zero extend is only used by a truncate, let the truncate be + // eliminated before we try to optimize this zext. + if (CI.hasOneUse() && isa<TruncInst>(CI.user_back())) + return nullptr; + + // If one of the common conversion will work, do it. + if (Instruction *Result = commonCastTransforms(CI)) + return Result; + + Value *Src = CI.getOperand(0); + Type *SrcTy = Src->getType(), *DestTy = CI.getType(); + + // Try to extend the entire expression tree to the wide destination type. + unsigned BitsToClear; + if (shouldChangeType(SrcTy, DestTy) && + canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) { + assert(BitsToClear <= SrcTy->getScalarSizeInBits() && + "Can't clear more bits than in SrcTy"); + + // Okay, we can transform this! Insert the new expression now. + LLVM_DEBUG( + dbgs() << "ICE: EvaluateInDifferentType converting expression type" + " to avoid zero extend: " + << CI << '\n'); + Value *Res = EvaluateInDifferentType(Src, DestTy, false); + assert(Res->getType() == DestTy); + + // Preserve debug values referring to Src if the zext is its last use. + if (auto *SrcOp = dyn_cast<Instruction>(Src)) + if (SrcOp->hasOneUse()) + replaceAllDbgUsesWith(*SrcOp, *Res, CI, DT); + + uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear; + uint32_t DestBitSize = DestTy->getScalarSizeInBits(); + + // If the high bits are already filled with zeros, just replace this + // cast with the result. + if (MaskedValueIsZero(Res, + APInt::getHighBitsSet(DestBitSize, + DestBitSize-SrcBitsKept), + 0, &CI)) + return replaceInstUsesWith(CI, Res); + + // We need to emit an AND to clear the high bits. + Constant *C = ConstantInt::get(Res->getType(), + APInt::getLowBitsSet(DestBitSize, SrcBitsKept)); + return BinaryOperator::CreateAnd(Res, C); + } + + // If this is a TRUNC followed by a ZEXT then we are dealing with integral + // types and if the sizes are just right we can convert this into a logical + // 'and' which will be much cheaper than the pair of casts. + if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) { // A->B->C cast + // TODO: Subsume this into EvaluateInDifferentType. + + // Get the sizes of the types involved. We know that the intermediate type + // will be smaller than A or C, but don't know the relation between A and C. 
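+    // For illustration (widths chosen arbitrarily): "zext (trunc i64 %a to
+    // i16) to i32" becomes "and (trunc i64 %a to i32), 65535", i.e. at most
+    // one cast plus a mask of MidSize low bits.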
+ Value *A = CSrc->getOperand(0); + unsigned SrcSize = A->getType()->getScalarSizeInBits(); + unsigned MidSize = CSrc->getType()->getScalarSizeInBits(); + unsigned DstSize = CI.getType()->getScalarSizeInBits(); + // If we're actually extending zero bits, then if + // SrcSize < DstSize: zext(a & mask) + // SrcSize == DstSize: a & mask + // SrcSize > DstSize: trunc(a) & mask + if (SrcSize < DstSize) { + APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize)); + Constant *AndConst = ConstantInt::get(A->getType(), AndValue); + Value *And = Builder.CreateAnd(A, AndConst, CSrc->getName() + ".mask"); + return new ZExtInst(And, CI.getType()); + } + + if (SrcSize == DstSize) { + APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize)); + return BinaryOperator::CreateAnd(A, ConstantInt::get(A->getType(), + AndValue)); + } + if (SrcSize > DstSize) { + Value *Trunc = Builder.CreateTrunc(A, CI.getType()); + APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize)); + return BinaryOperator::CreateAnd(Trunc, + ConstantInt::get(Trunc->getType(), + AndValue)); + } + } + + if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src)) + return transformZExtICmp(ICI, CI); + + BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src); + if (SrcI && SrcI->getOpcode() == Instruction::Or) { + // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) if at least one + // of the (zext icmp) can be eliminated. If so, immediately perform the + // according elimination. + ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0)); + ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1)); + if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() && + (transformZExtICmp(LHS, CI, false) || + transformZExtICmp(RHS, CI, false))) { + // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) + Value *LCast = Builder.CreateZExt(LHS, CI.getType(), LHS->getName()); + Value *RCast = Builder.CreateZExt(RHS, CI.getType(), RHS->getName()); + BinaryOperator *Or = BinaryOperator::Create(Instruction::Or, LCast, RCast); + + // Perform the elimination. + if (auto *LZExt = dyn_cast<ZExtInst>(LCast)) + transformZExtICmp(LHS, *LZExt); + if (auto *RZExt = dyn_cast<ZExtInst>(RCast)) + transformZExtICmp(RHS, *RZExt); + + return Or; + } + } + + // zext(trunc(X) & C) -> (X & zext(C)). + Constant *C; + Value *X; + if (SrcI && + match(SrcI, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Constant(C)))) && + X->getType() == CI.getType()) + return BinaryOperator::CreateAnd(X, ConstantExpr::getZExt(C, CI.getType())); + + // zext((trunc(X) & C) ^ C) -> ((X & zext(C)) ^ zext(C)). + Value *And; + if (SrcI && match(SrcI, m_OneUse(m_Xor(m_Value(And), m_Constant(C)))) && + match(And, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Specific(C)))) && + X->getType() == CI.getType()) { + Constant *ZC = ConstantExpr::getZExt(C, CI.getType()); + return BinaryOperator::CreateXor(Builder.CreateAnd(X, ZC), ZC); + } + + return nullptr; +} + +/// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp. +Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { + Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1); + ICmpInst::Predicate Pred = ICI->getPredicate(); + + // Don't bother if Op1 isn't of vector or integer type. + if (!Op1->getType()->isIntOrIntVectorTy()) + return nullptr; + + if ((Pred == ICmpInst::ICMP_SLT && match(Op1, m_ZeroInt())) || + (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))) { + // (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if negative + // (x >s -1) ? 
-1 : 0 -> not (ashr x, 31) -> all ones if positive
+    Value *Sh = ConstantInt::get(Op0->getType(),
+                                 Op0->getType()->getScalarSizeInBits() - 1);
+    Value *In = Builder.CreateAShr(Op0, Sh, Op0->getName() + ".lobit");
+    if (In->getType() != CI.getType())
+      In = Builder.CreateIntCast(In, CI.getType(), true /*SExt*/);
+
+    if (Pred == ICmpInst::ICMP_SGT)
+      In = Builder.CreateNot(In, In->getName() + ".not");
+    return replaceInstUsesWith(CI, In);
+  }
+
+  if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+    // If we know that only one bit of the LHS of the icmp can be set and we
+    // have an equality comparison with zero or a power of 2, we can transform
+    // the icmp and sext into bitwise/integer operations.
+    if (ICI->hasOneUse() &&
+        ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){
+      KnownBits Known = computeKnownBits(Op0, 0, &CI);
+
+      APInt KnownZeroMask(~Known.Zero);
+      if (KnownZeroMask.isPowerOf2()) {
+        Value *In = ICI->getOperand(0);
+
+        // If the icmp tests for a known zero bit we can constant fold it.
+        if (!Op1C->isZero() && Op1C->getValue() != KnownZeroMask) {
+          Value *V = Pred == ICmpInst::ICMP_NE ?
+                       ConstantInt::getAllOnesValue(CI.getType()) :
+                       ConstantInt::getNullValue(CI.getType());
+          return replaceInstUsesWith(CI, V);
+        }
+
+        if (!Op1C->isZero() == (Pred == ICmpInst::ICMP_NE)) {
+          // sext ((x & 2^n) == 0) -> (x >> n) - 1
+          // sext ((x & 2^n) != 2^n) -> (x >> n) - 1
+          unsigned ShiftAmt = KnownZeroMask.countTrailingZeros();
+          // Perform a right shift to place the desired bit in the LSB.
+          if (ShiftAmt)
+            In = Builder.CreateLShr(In,
+                                    ConstantInt::get(In->getType(), ShiftAmt));
+
+          // At this point "In" is either 1 or 0. Subtract 1 to turn
+          // {1, 0} -> {0, -1}.
+          In = Builder.CreateAdd(In,
+                                 ConstantInt::getAllOnesValue(In->getType()),
+                                 "sext");
+        } else {
+          // sext ((x & 2^n) != 0) -> (x << bitwidth-n) a>> bitwidth-1
+          // sext ((x & 2^n) == 2^n) -> (x << bitwidth-n) a>> bitwidth-1
+          unsigned ShiftAmt = KnownZeroMask.countLeadingZeros();
+          // Perform a left shift to place the desired bit in the MSB.
+          if (ShiftAmt)
+            In = Builder.CreateShl(In,
+                                   ConstantInt::get(In->getType(), ShiftAmt));
+
+          // Distribute the bit over the whole bit width.
+          In = Builder.CreateAShr(In, ConstantInt::get(In->getType(),
+                                  KnownZeroMask.getBitWidth() - 1), "sext");
+        }
+
+        if (CI.getType() == In->getType())
+          return replaceInstUsesWith(CI, In);
+        return CastInst::CreateIntegerCast(In, CI.getType(), true/*SExt*/);
+      }
+    }
+  }
+
+  return nullptr;
+}
+
+/// Return true if we can take the specified value and return it as type Ty
+/// without inserting any new casts and without changing the value of the
+/// common low bits. This is used by code that tries to promote integer
+/// operations to a wider type, which will allow us to eliminate the extension.
+///
+/// This function works on both vectors and scalars.
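+///
+/// For example (one illustrative case), "add i8 (trunc i32 %x to i8), 42" can
+/// be evaluated in i32 as "add i32 %x, 42" without inserting any casts; the
+/// low 8 bits agree, so the trunc and the final sext can be removed (with
+/// shifts to re-establish the sign bits when they are not already known).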
+/// +static bool canEvaluateSExtd(Value *V, Type *Ty) { + assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() && + "Can't sign extend type to a smaller type"); + if (canAlwaysEvaluateInType(V, Ty)) + return true; + if (canNotEvaluateInType(V, Ty)) + return false; + + auto *I = cast<Instruction>(V); + switch (I->getOpcode()) { + case Instruction::SExt: // sext(sext(x)) -> sext(x) + case Instruction::ZExt: // sext(zext(x)) -> zext(x) + case Instruction::Trunc: // sext(trunc(x)) -> trunc(x) or sext(x) + return true; + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + // These operators can all arbitrarily be extended if their inputs can. + return canEvaluateSExtd(I->getOperand(0), Ty) && + canEvaluateSExtd(I->getOperand(1), Ty); + + //case Instruction::Shl: TODO + //case Instruction::LShr: TODO + + case Instruction::Select: + return canEvaluateSExtd(I->getOperand(1), Ty) && + canEvaluateSExtd(I->getOperand(2), Ty); + + case Instruction::PHI: { + // We can change a phi if we can change all operands. Note that we never + // get into trouble with cyclic PHIs here because we only consider + // instructions with a single use. + PHINode *PN = cast<PHINode>(I); + for (Value *IncValue : PN->incoming_values()) + if (!canEvaluateSExtd(IncValue, Ty)) return false; + return true; + } + default: + // TODO: Can handle more cases here. + break; + } + + return false; +} + +Instruction *InstCombiner::visitSExt(SExtInst &CI) { + // If this sign extend is only used by a truncate, let the truncate be + // eliminated before we try to optimize this sext. + if (CI.hasOneUse() && isa<TruncInst>(CI.user_back())) + return nullptr; + + if (Instruction *I = commonCastTransforms(CI)) + return I; + + Value *Src = CI.getOperand(0); + Type *SrcTy = Src->getType(), *DestTy = CI.getType(); + + // If we know that the value being extended is positive, we can use a zext + // instead. + KnownBits Known = computeKnownBits(Src, 0, &CI); + if (Known.isNonNegative()) + return CastInst::Create(Instruction::ZExt, Src, DestTy); + + // Try to extend the entire expression tree to the wide destination type. + if (shouldChangeType(SrcTy, DestTy) && canEvaluateSExtd(Src, DestTy)) { + // Okay, we can transform this! Insert the new expression now. + LLVM_DEBUG( + dbgs() << "ICE: EvaluateInDifferentType converting expression type" + " to avoid sign extend: " + << CI << '\n'); + Value *Res = EvaluateInDifferentType(Src, DestTy, true); + assert(Res->getType() == DestTy); + + uint32_t SrcBitSize = SrcTy->getScalarSizeInBits(); + uint32_t DestBitSize = DestTy->getScalarSizeInBits(); + + // If the high bits are already filled with sign bit, just replace this + // cast with the result. + if (ComputeNumSignBits(Res, 0, &CI) > DestBitSize - SrcBitSize) + return replaceInstUsesWith(CI, Res); + + // We need to emit a shl + ashr to do the sign extend. + Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize); + return BinaryOperator::CreateAShr(Builder.CreateShl(Res, ShAmt, "sext"), + ShAmt); + } + + // If the input is a trunc from the destination type, then turn sext(trunc(x)) + // into shifts. 
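+  // E.g. (illustrative widths): "sext (trunc i32 %x to i8) to i32" becomes
+  // "ashr (shl i32 %x, 24), 24", redoing the sign extension with shifts.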
+  Value *X;
+  if (match(Src, m_OneUse(m_Trunc(m_Value(X)))) && X->getType() == DestTy) {
+    // sext(trunc(X)) --> ashr(shl(X, C), C)
+    unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+    unsigned DestBitSize = DestTy->getScalarSizeInBits();
+    Constant *ShAmt = ConstantInt::get(DestTy, DestBitSize - SrcBitSize);
+    return BinaryOperator::CreateAShr(Builder.CreateShl(X, ShAmt), ShAmt);
+  }
+
+  if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
+    return transformSExtICmp(ICI, CI);
+
+  // If the input is a shl/ashr pair of the same constant, then this is a sign
+  // extension from a smaller value. If we could trust arbitrary bitwidth
+  // integers, we could turn this into a truncate to the smaller bit and then
+  // use a sext for the whole extension. Since we don't, look deeper and check
+  // for a truncate. If the source and dest are the same type, eliminate the
+  // trunc and extend and just do shifts. For example, turn:
+  //   %a = trunc i32 %i to i8
+  //   %b = shl i8 %a, 6
+  //   %c = ashr i8 %b, 6
+  //   %d = sext i8 %c to i32
+  // into:
+  //   %a = shl i32 %i, 30
+  //   %d = ashr i32 %a, 30
+  Value *A = nullptr;
+  // TODO: Eventually this could be subsumed by EvaluateInDifferentType.
+  ConstantInt *BA = nullptr, *CA = nullptr;
+  if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_ConstantInt(BA)),
+                        m_ConstantInt(CA))) &&
+      BA == CA && A->getType() == CI.getType()) {
+    unsigned MidSize = Src->getType()->getScalarSizeInBits();
+    unsigned SrcDstSize = CI.getType()->getScalarSizeInBits();
+    unsigned ShAmt = CA->getZExtValue()+SrcDstSize-MidSize;
+    Constant *ShAmtV = ConstantInt::get(CI.getType(), ShAmt);
+    A = Builder.CreateShl(A, ShAmtV, CI.getName());
+    return BinaryOperator::CreateAShr(A, ShAmtV);
+  }
+
+  return nullptr;
+}
+
+
+/// Return true if the specified floating-point constant fits in the specified
+/// FP type without changing its value.
+static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
+  bool losesInfo;
+  APFloat F = CFP->getValueAPF();
+  (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
+  return !losesInfo;
+}
+
+static Type *shrinkFPConstant(ConstantFP *CFP) {
+  if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext()))
+    return nullptr;  // No constant folding of this.
+  // See if the value can be truncated to half and then reextended.
+  if (fitsInFPType(CFP, APFloat::IEEEhalf()))
+    return Type::getHalfTy(CFP->getContext());
+  // See if the value can be truncated to float and then reextended.
+  if (fitsInFPType(CFP, APFloat::IEEEsingle()))
+    return Type::getFloatTy(CFP->getContext());
+  if (CFP->getType()->isDoubleTy())
+    return nullptr;  // Won't shrink.
+  if (fitsInFPType(CFP, APFloat::IEEEdouble()))
+    return Type::getDoubleTy(CFP->getContext());
+  // Don't try to shrink to various long double types.
+  return nullptr;
+}
+
+// Determine if this is a vector of ConstantFPs and if so, return the minimal
+// type we can safely truncate all elements to.
+// TODO: Make these support undef elements.
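+//
+// For example (constants chosen for illustration), a <2 x double> with
+// elements 1.0 and 0.5 shrinks to <2 x half>, while a vector containing
+// 2^-30 only shrinks to <2 x float> because 2^-30 underflows in half.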
+static Type *shrinkFPConstantVector(Value *V) { + auto *CV = dyn_cast<Constant>(V); + if (!CV || !CV->getType()->isVectorTy()) + return nullptr; + + Type *MinType = nullptr; + + unsigned NumElts = CV->getType()->getVectorNumElements(); + for (unsigned i = 0; i != NumElts; ++i) { + auto *CFP = dyn_cast_or_null<ConstantFP>(CV->getAggregateElement(i)); + if (!CFP) + return nullptr; + + Type *T = shrinkFPConstant(CFP); + if (!T) + return nullptr; + + // If we haven't found a type yet or this type has a larger mantissa than + // our previous type, this is our new minimal type. + if (!MinType || T->getFPMantissaWidth() > MinType->getFPMantissaWidth()) + MinType = T; + } + + // Make a vector type from the minimal type. + return VectorType::get(MinType, NumElts); +} + +/// Find the minimum FP type we can safely truncate to. +static Type *getMinimumFPType(Value *V) { + if (auto *FPExt = dyn_cast<FPExtInst>(V)) + return FPExt->getOperand(0)->getType(); + + // If this value is a constant, return the constant in the smallest FP type + // that can accurately represent it. This allows us to turn + // (float)((double)X+2.0) into x+2.0f. + if (auto *CFP = dyn_cast<ConstantFP>(V)) + if (Type *T = shrinkFPConstant(CFP)) + return T; + + // Try to shrink a vector of FP constants. + if (Type *T = shrinkFPConstantVector(V)) + return T; + + return V->getType(); +} + +Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { + if (Instruction *I = commonCastTransforms(FPT)) + return I; + + // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to + // simplify this expression to avoid one or more of the trunc/extend + // operations if we can do so without changing the numerical results. + // + // The exact manner in which the widths of the operands interact to limit + // what we can and cannot do safely varies from operation to operation, and + // is explained below in the various case statements. + Type *Ty = FPT.getType(); + auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0)); + if (BO && BO->hasOneUse()) { + Type *LHSMinType = getMinimumFPType(BO->getOperand(0)); + Type *RHSMinType = getMinimumFPType(BO->getOperand(1)); + unsigned OpWidth = BO->getType()->getFPMantissaWidth(); + unsigned LHSWidth = LHSMinType->getFPMantissaWidth(); + unsigned RHSWidth = RHSMinType->getFPMantissaWidth(); + unsigned SrcWidth = std::max(LHSWidth, RHSWidth); + unsigned DstWidth = Ty->getFPMantissaWidth(); + switch (BO->getOpcode()) { + default: break; + case Instruction::FAdd: + case Instruction::FSub: + // For addition and subtraction, the infinitely precise result can + // essentially be arbitrarily wide; proving that double rounding + // will not occur because the result of OpI is exact (as we will for + // FMul, for example) is hopeless. However, we *can* nonetheless + // frequently know that double rounding cannot occur (or that it is + // innocuous) by taking advantage of the specific structure of + // infinitely-precise results that admit double rounding. + // + // Specifically, if OpWidth >= 2*DstWdith+1 and DstWidth is sufficient + // to represent both sources, we can guarantee that the double + // rounding is innocuous (See p50 of Figueroa's 2000 PhD thesis, + // "A Rigorous Framework for Fully Supporting the IEEE Standard ..." + // for proof of this fact). + // + // Note: Figueroa does not consider the case where DstFormat != + // SrcFormat. It's possible (likely even!) 
that this analysis + // could be tightened for those cases, but they are rare (the main + // case of interest here is (float)((double)float + float)). + if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) { + Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty); + Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty); + Instruction *RI = BinaryOperator::Create(BO->getOpcode(), LHS, RHS); + RI->copyFastMathFlags(BO); + return RI; + } + break; + case Instruction::FMul: + // For multiplication, the infinitely precise result has at most + // LHSWidth + RHSWidth significant bits; if OpWidth is sufficient + // that such a value can be exactly represented, then no double + // rounding can possibly occur; we can safely perform the operation + // in the destination format if it can represent both sources. + if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) { + Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty); + Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty); + return BinaryOperator::CreateFMulFMF(LHS, RHS, BO); + } + break; + case Instruction::FDiv: + // For division, we use again use the bound from Figueroa's + // dissertation. I am entirely certain that this bound can be + // tightened in the unbalanced operand case by an analysis based on + // the diophantine rational approximation bound, but the well-known + // condition used here is a good conservative first pass. + // TODO: Tighten bound via rigorous analysis of the unbalanced case. + if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) { + Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty); + Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty); + return BinaryOperator::CreateFDivFMF(LHS, RHS, BO); + } + break; + case Instruction::FRem: { + // Remainder is straightforward. Remainder is always exact, so the + // type of OpI doesn't enter into things at all. We simply evaluate + // in whichever source type is larger, then convert to the + // destination type. + if (SrcWidth == OpWidth) + break; + Value *LHS, *RHS; + if (LHSWidth == SrcWidth) { + LHS = Builder.CreateFPTrunc(BO->getOperand(0), LHSMinType); + RHS = Builder.CreateFPTrunc(BO->getOperand(1), LHSMinType); + } else { + LHS = Builder.CreateFPTrunc(BO->getOperand(0), RHSMinType); + RHS = Builder.CreateFPTrunc(BO->getOperand(1), RHSMinType); + } + + Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, BO); + return CastInst::CreateFPCast(ExactResult, Ty); + } + } + } + + // (fptrunc (fneg x)) -> (fneg (fptrunc x)) + Value *X; + Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0)); + if (Op && Op->hasOneUse()) { + if (match(Op, m_FNeg(m_Value(X)))) { + Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty); + + // FIXME: Once we're sure that unary FNeg optimizations are on par with + // binary FNeg, this should always return a unary operator. + if (isa<BinaryOperator>(Op)) + return BinaryOperator::CreateFNegFMF(InnerTrunc, Op); + return UnaryOperator::CreateFNegFMF(InnerTrunc, Op); + } + } + + if (auto *II = dyn_cast<IntrinsicInst>(FPT.getOperand(0))) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::ceil: + case Intrinsic::fabs: + case Intrinsic::floor: + case Intrinsic::nearbyint: + case Intrinsic::rint: + case Intrinsic::round: + case Intrinsic::trunc: { + Value *Src = II->getArgOperand(0); + if (!Src->hasOneUse()) + break; + + // Except for fabs, this transformation requires the input of the unary FP + // operation to be itself an fpext from the type to which we're + // truncating. 
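+      // For illustration: "fptrunc (ceil (fpext float %x to double)) to float"
+      // can become a float ceil of %x, because the fpext guarantees the
+      // operand is exactly a float value. fabs is safe even without the fpext
+      // since it only clears the sign bit.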
+ if (II->getIntrinsicID() != Intrinsic::fabs) { + FPExtInst *FPExtSrc = dyn_cast<FPExtInst>(Src); + if (!FPExtSrc || FPExtSrc->getSrcTy() != Ty) + break; + } + + // Do unary FP operation on smaller type. + // (fptrunc (fabs x)) -> (fabs (fptrunc x)) + Value *InnerTrunc = Builder.CreateFPTrunc(Src, Ty); + Function *Overload = Intrinsic::getDeclaration(FPT.getModule(), + II->getIntrinsicID(), Ty); + SmallVector<OperandBundleDef, 1> OpBundles; + II->getOperandBundlesAsDefs(OpBundles); + CallInst *NewCI = + CallInst::Create(Overload, {InnerTrunc}, OpBundles, II->getName()); + NewCI->copyFastMathFlags(II); + return NewCI; + } + } + } + + if (Instruction *I = shrinkInsertElt(FPT, Builder)) + return I; + + return nullptr; +} + +Instruction *InstCombiner::visitFPExt(CastInst &CI) { + return commonCastTransforms(CI); +} + +// fpto{s/u}i({u/s}itofp(X)) --> X or zext(X) or sext(X) or trunc(X) +// This is safe if the intermediate type has enough bits in its mantissa to +// accurately represent all values of X. For example, this won't work with +// i64 -> float -> i64. +Instruction *InstCombiner::FoldItoFPtoI(Instruction &FI) { + if (!isa<UIToFPInst>(FI.getOperand(0)) && !isa<SIToFPInst>(FI.getOperand(0))) + return nullptr; + Instruction *OpI = cast<Instruction>(FI.getOperand(0)); + + Value *SrcI = OpI->getOperand(0); + Type *FITy = FI.getType(); + Type *OpITy = OpI->getType(); + Type *SrcTy = SrcI->getType(); + bool IsInputSigned = isa<SIToFPInst>(OpI); + bool IsOutputSigned = isa<FPToSIInst>(FI); + + // We can safely assume the conversion won't overflow the output range, + // because (for example) (uint8_t)18293.f is undefined behavior. + + // Since we can assume the conversion won't overflow, our decision as to + // whether the input will fit in the float should depend on the minimum + // of the input range and output range. + + // This means this is also safe for a signed input and unsigned output, since + // a negative input would lead to undefined behavior. 
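+
+  // Worked example (illustrative only): for "fptosi (sitofp i16 %x to float)
+  // to i32", InputSize = 15, OutputSize = 31, ActualSize = 15 <= 24 (the float
+  // mantissa width), so the cast pair is replaced by "sext i16 %x to i32".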
+ int InputSize = (int)SrcTy->getScalarSizeInBits() - IsInputSigned; + int OutputSize = (int)FITy->getScalarSizeInBits() - IsOutputSigned; + int ActualSize = std::min(InputSize, OutputSize); + + if (ActualSize <= OpITy->getFPMantissaWidth()) { + if (FITy->getScalarSizeInBits() > SrcTy->getScalarSizeInBits()) { + if (IsInputSigned && IsOutputSigned) + return new SExtInst(SrcI, FITy); + return new ZExtInst(SrcI, FITy); + } + if (FITy->getScalarSizeInBits() < SrcTy->getScalarSizeInBits()) + return new TruncInst(SrcI, FITy); + if (SrcTy == FITy) + return replaceInstUsesWith(FI, SrcI); + return new BitCastInst(SrcI, FITy); + } + return nullptr; +} + +Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) { + Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0)); + if (!OpI) + return commonCastTransforms(FI); + + if (Instruction *I = FoldItoFPtoI(FI)) + return I; + + return commonCastTransforms(FI); +} + +Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) { + Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0)); + if (!OpI) + return commonCastTransforms(FI); + + if (Instruction *I = FoldItoFPtoI(FI)) + return I; + + return commonCastTransforms(FI); +} + +Instruction *InstCombiner::visitUIToFP(CastInst &CI) { + return commonCastTransforms(CI); +} + +Instruction *InstCombiner::visitSIToFP(CastInst &CI) { + return commonCastTransforms(CI); +} + +Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { + // If the source integer type is not the intptr_t type for this target, do a + // trunc or zext to the intptr_t type, then inttoptr of it. This allows the + // cast to be exposed to other transforms. + unsigned AS = CI.getAddressSpace(); + if (CI.getOperand(0)->getType()->getScalarSizeInBits() != + DL.getPointerSizeInBits(AS)) { + Type *Ty = DL.getIntPtrType(CI.getContext(), AS); + if (CI.getType()->isVectorTy()) // Handle vectors of pointers. + Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements()); + + Value *P = Builder.CreateZExtOrTrunc(CI.getOperand(0), Ty); + return new IntToPtrInst(P, CI.getType()); + } + + if (Instruction *I = commonCastTransforms(CI)) + return I; + + return nullptr; +} + +/// Implement the transforms for cast of pointer (bitcast/ptrtoint) +Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { + Value *Src = CI.getOperand(0); + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) { + // If casting the result of a getelementptr instruction with no offset, turn + // this into a cast of the original pointer! + if (GEP->hasAllZeroIndices() && + // If CI is an addrspacecast and GEP changes the poiner type, merging + // GEP into CI would undo canonicalizing addrspacecast with different + // pointer types, causing infinite loops. + (!isa<AddrSpaceCastInst>(CI) || + GEP->getType() == GEP->getPointerOperandType())) { + // Changing the cast operand is usually not a good idea but it is safe + // here because the pointer operand is being replaced with another + // pointer operand so the opcode doesn't need to change. + Worklist.Add(GEP); + CI.setOperand(0, GEP->getOperand(0)); + return &CI; + } + } + + return commonCastTransforms(CI); +} + +Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { + // If the destination integer type is not the intptr_t type for this target, + // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast + // to be exposed to other transforms. 
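+  // For example (assuming 64-bit pointers in this address space), "ptrtoint
+  // i8* %p to i32" becomes "trunc (ptrtoint i8* %p to i64) to i32".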
+ + Type *Ty = CI.getType(); + unsigned AS = CI.getPointerAddressSpace(); + + if (Ty->getScalarSizeInBits() == DL.getIndexSizeInBits(AS)) + return commonPointerCastTransforms(CI); + + Type *PtrTy = DL.getIntPtrType(CI.getContext(), AS); + if (Ty->isVectorTy()) // Handle vectors of pointers. + PtrTy = VectorType::get(PtrTy, Ty->getVectorNumElements()); + + Value *P = Builder.CreatePtrToInt(CI.getOperand(0), PtrTy); + return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false); +} + +/// This input value (which is known to have vector type) is being zero extended +/// or truncated to the specified vector type. +/// Try to replace it with a shuffle (and vector/vector bitcast) if possible. +/// +/// The source and destination vector types may have different element types. +static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, + InstCombiner &IC) { + // We can only do this optimization if the output is a multiple of the input + // element size, or the input is a multiple of the output element size. + // Convert the input type to have the same element type as the output. + VectorType *SrcTy = cast<VectorType>(InVal->getType()); + + if (SrcTy->getElementType() != DestTy->getElementType()) { + // The input types don't need to be identical, but for now they must be the + // same size. There is no specific reason we couldn't handle things like + // <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten + // there yet. + if (SrcTy->getElementType()->getPrimitiveSizeInBits() != + DestTy->getElementType()->getPrimitiveSizeInBits()) + return nullptr; + + SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements()); + InVal = IC.Builder.CreateBitCast(InVal, SrcTy); + } + + // Now that the element types match, get the shuffle mask and RHS of the + // shuffle to use, which depends on whether we're increasing or decreasing the + // size of the input. + SmallVector<uint32_t, 16> ShuffleMask; + Value *V2; + + if (SrcTy->getNumElements() > DestTy->getNumElements()) { + // If we're shrinking the number of elements, just shuffle in the low + // elements from the input and use undef as the second shuffle input. + V2 = UndefValue::get(SrcTy); + for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i) + ShuffleMask.push_back(i); + + } else { + // If we're increasing the number of elements, shuffle in all of the + // elements from InVal and fill the rest of the result elements with zeros + // from a constant zero. + V2 = Constant::getNullValue(SrcTy); + unsigned SrcElts = SrcTy->getNumElements(); + for (unsigned i = 0, e = SrcElts; i != e; ++i) + ShuffleMask.push_back(i); + + // The excess elements reference the first element of the zero input. + for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i) + ShuffleMask.push_back(SrcElts); + } + + return new ShuffleVectorInst(InVal, V2, + ConstantDataVector::get(V2->getContext(), + ShuffleMask)); +} + +static bool isMultipleOfTypeSize(unsigned Value, Type *Ty) { + return Value % Ty->getPrimitiveSizeInBits() == 0; +} + +static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { + return Value / Ty->getPrimitiveSizeInBits(); +} + +/// V is a value which is inserted into a vector of VecEltTy. +/// Look through the value to see if we can decompose it into +/// insertions into the vector. See the example in the comment for +/// OptimizeIntegerToVectorInsertions for the pattern this handles. +/// The type of V is always a non-zero multiple of VecEltTy's size. 
+/// Shift is the number of bits between the lsb of V and the lsb of +/// the vector. +/// +/// This returns false if the pattern can't be matched or true if it can, +/// filling in Elements with the elements found here. +static bool collectInsertionElements(Value *V, unsigned Shift, + SmallVectorImpl<Value *> &Elements, + Type *VecEltTy, bool isBigEndian) { + assert(isMultipleOfTypeSize(Shift, VecEltTy) && + "Shift should be a multiple of the element type size"); + + // Undef values never contribute useful bits to the result. + if (isa<UndefValue>(V)) return true; + + // If we got down to a value of the right type, we win, try inserting into the + // right element. + if (V->getType() == VecEltTy) { + // Inserting null doesn't actually insert any elements. + if (Constant *C = dyn_cast<Constant>(V)) + if (C->isNullValue()) + return true; + + unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy); + if (isBigEndian) + ElementIndex = Elements.size() - ElementIndex - 1; + + // Fail if multiple elements are inserted into this slot. + if (Elements[ElementIndex]) + return false; + + Elements[ElementIndex] = V; + return true; + } + + if (Constant *C = dyn_cast<Constant>(V)) { + // Figure out the # elements this provides, and bitcast it or slice it up + // as required. + unsigned NumElts = getTypeSizeIndex(C->getType()->getPrimitiveSizeInBits(), + VecEltTy); + // If the constant is the size of a vector element, we just need to bitcast + // it to the right type so it gets properly inserted. + if (NumElts == 1) + return collectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), + Shift, Elements, VecEltTy, isBigEndian); + + // Okay, this is a constant that covers multiple elements. Slice it up into + // pieces and insert each element-sized piece into the vector. + if (!isa<IntegerType>(C->getType())) + C = ConstantExpr::getBitCast(C, IntegerType::get(V->getContext(), + C->getType()->getPrimitiveSizeInBits())); + unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits(); + Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize); + + for (unsigned i = 0; i != NumElts; ++i) { + unsigned ShiftI = Shift+i*ElementSize; + Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), + ShiftI)); + Piece = ConstantExpr::getTrunc(Piece, ElementIntTy); + if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy, + isBigEndian)) + return false; + } + return true; + } + + if (!V->hasOneUse()) return false; + + Instruction *I = dyn_cast<Instruction>(V); + if (!I) return false; + switch (I->getOpcode()) { + default: return false; // Unhandled case. + case Instruction::BitCast: + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + isBigEndian); + case Instruction::ZExt: + if (!isMultipleOfTypeSize( + I->getOperand(0)->getType()->getPrimitiveSizeInBits(), + VecEltTy)) + return false; + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + isBigEndian); + case Instruction::Or: + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + isBigEndian) && + collectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy, + isBigEndian); + case Instruction::Shl: { + // Must be shifting by a constant that is a multiple of the element size. 
+ ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1)); + if (!CI) return false; + Shift += CI->getZExtValue(); + if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false; + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + isBigEndian); + } + + } +} + + +/// If the input is an 'or' instruction, we may be doing shifts and ors to +/// assemble the elements of the vector manually. +/// Try to rip the code out and replace it with insertelements. This is to +/// optimize code like this: +/// +/// %tmp37 = bitcast float %inc to i32 +/// %tmp38 = zext i32 %tmp37 to i64 +/// %tmp31 = bitcast float %inc5 to i32 +/// %tmp32 = zext i32 %tmp31 to i64 +/// %tmp33 = shl i64 %tmp32, 32 +/// %ins35 = or i64 %tmp33, %tmp38 +/// %tmp43 = bitcast i64 %ins35 to <2 x float> +/// +/// Into two insertelements that do "buildvector{%inc, %inc5}". +static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI, + InstCombiner &IC) { + VectorType *DestVecTy = cast<VectorType>(CI.getType()); + Value *IntInput = CI.getOperand(0); + + SmallVector<Value*, 8> Elements(DestVecTy->getNumElements()); + if (!collectInsertionElements(IntInput, 0, Elements, + DestVecTy->getElementType(), + IC.getDataLayout().isBigEndian())) + return nullptr; + + // If we succeeded, we know that all of the element are specified by Elements + // or are zero if Elements has a null entry. Recast this as a set of + // insertions. + Value *Result = Constant::getNullValue(CI.getType()); + for (unsigned i = 0, e = Elements.size(); i != e; ++i) { + if (!Elements[i]) continue; // Unset element. + + Result = IC.Builder.CreateInsertElement(Result, Elements[i], + IC.Builder.getInt32(i)); + } + + return Result; +} + +/// Canonicalize scalar bitcasts of extracted elements into a bitcast of the +/// vector followed by extract element. The backend tends to handle bitcasts of +/// vectors better than bitcasts of scalars because vector registers are +/// usually not type-specific like scalar integer or scalar floating-point. +static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast, + InstCombiner &IC) { + // TODO: Create and use a pattern matcher for ExtractElementInst. + auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0)); + if (!ExtElt || !ExtElt->hasOneUse()) + return nullptr; + + // The bitcast must be to a vectorizable type, otherwise we can't make a new + // type to extract from. + Type *DestType = BitCast.getType(); + if (!VectorType::isValidElementType(DestType)) + return nullptr; + + unsigned NumElts = ExtElt->getVectorOperandType()->getNumElements(); + auto *NewVecType = VectorType::get(DestType, NumElts); + auto *NewBC = IC.Builder.CreateBitCast(ExtElt->getVectorOperand(), + NewVecType, "bc"); + return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand()); +} + +/// Change the type of a bitwise logic operation if we can eliminate a bitcast. +static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast, + InstCombiner::BuilderTy &Builder) { + Type *DestTy = BitCast.getType(); + BinaryOperator *BO; + if (!DestTy->isIntOrIntVectorTy() || + !match(BitCast.getOperand(0), m_OneUse(m_BinOp(BO))) || + !BO->isBitwiseLogicOp()) + return nullptr; + + // FIXME: This transform is restricted to vector types to avoid backend + // problems caused by creating potentially illegal operations. If a fix-up is + // added to handle that situation, we can remove this check. 
+ if (!DestTy->isVectorTy() || !BO->getType()->isVectorTy()) + return nullptr; + + Value *X; + if (match(BO->getOperand(0), m_OneUse(m_BitCast(m_Value(X)))) && + X->getType() == DestTy && !isa<Constant>(X)) { + // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y)) + Value *CastedOp1 = Builder.CreateBitCast(BO->getOperand(1), DestTy); + return BinaryOperator::Create(BO->getOpcode(), X, CastedOp1); + } + + if (match(BO->getOperand(1), m_OneUse(m_BitCast(m_Value(X)))) && + X->getType() == DestTy && !isa<Constant>(X)) { + // bitcast(logic(Y, bitcast(X))) --> logic'(bitcast(Y), X) + Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy); + return BinaryOperator::Create(BO->getOpcode(), CastedOp0, X); + } + + // Canonicalize vector bitcasts to come before vector bitwise logic with a + // constant. This eases recognition of special constants for later ops. + // Example: + // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b + Constant *C; + if (match(BO->getOperand(1), m_Constant(C))) { + // bitcast (logic X, C) --> logic (bitcast X, C') + Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy); + Value *CastedC = ConstantExpr::getBitCast(C, DestTy); + return BinaryOperator::Create(BO->getOpcode(), CastedOp0, CastedC); + } + + return nullptr; +} + +/// Change the type of a select if we can eliminate a bitcast. +static Instruction *foldBitCastSelect(BitCastInst &BitCast, + InstCombiner::BuilderTy &Builder) { + Value *Cond, *TVal, *FVal; + if (!match(BitCast.getOperand(0), + m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal))))) + return nullptr; + + // A vector select must maintain the same number of elements in its operands. + Type *CondTy = Cond->getType(); + Type *DestTy = BitCast.getType(); + if (CondTy->isVectorTy()) { + if (!DestTy->isVectorTy()) + return nullptr; + if (DestTy->getVectorNumElements() != CondTy->getVectorNumElements()) + return nullptr; + } + + // FIXME: This transform is restricted from changing the select between + // scalars and vectors to avoid backend problems caused by creating + // potentially illegal operations. If a fix-up is added to handle that + // situation, we can remove this check. + if (DestTy->isVectorTy() != TVal->getType()->isVectorTy()) + return nullptr; + + auto *Sel = cast<Instruction>(BitCast.getOperand(0)); + Value *X; + if (match(TVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy && + !isa<Constant>(X)) { + // bitcast(select(Cond, bitcast(X), Y)) --> select'(Cond, X, bitcast(Y)) + Value *CastedVal = Builder.CreateBitCast(FVal, DestTy); + return SelectInst::Create(Cond, X, CastedVal, "", nullptr, Sel); + } + + if (match(FVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy && + !isa<Constant>(X)) { + // bitcast(select(Cond, Y, bitcast(X))) --> select'(Cond, bitcast(Y), X) + Value *CastedVal = Builder.CreateBitCast(TVal, DestTy); + return SelectInst::Create(Cond, CastedVal, X, "", nullptr, Sel); + } + + return nullptr; +} + +/// Check if all users of CI are StoreInsts. +static bool hasStoreUsersOnly(CastInst &CI) { + for (User *U : CI.users()) { + if (!isa<StoreInst>(U)) + return false; + } + return true; +} + +/// This function handles following case +/// +/// A -> B cast +/// PHI +/// B -> A cast +/// +/// All the related PHI nodes can be replaced by new PHI nodes with type A. +/// The uses of \p CI can be changed to the new PHI node corresponding to \p PN. 
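+///
+/// Schematic example (types chosen for illustration):
+///   %b   = bitcast <2 x float> %x to i64        ; A -> B
+///   %phi = phi i64 [ %b, %bb0 ], [ %b2, %bb1 ]
+///   %c   = bitcast i64 %phi to <2 x float>      ; B -> A
+/// If every incoming value is such an A -> B cast (or a constant, or a simple
+/// single-use load), the PHI can be rebuilt with type <2 x float> and both
+/// bitcasts dropped.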
+Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
+  // BitCast used by Store can be handled in InstCombineLoadStoreAlloca.cpp.
+  if (hasStoreUsersOnly(CI))
+    return nullptr;
+
+  Value *Src = CI.getOperand(0);
+  Type *SrcTy = Src->getType();   // Type B
+  Type *DestTy = CI.getType();    // Type A
+
+  SmallVector<PHINode *, 4> PhiWorklist;
+  SmallSetVector<PHINode *, 4> OldPhiNodes;
+
+  // Find all of the A->B casts and PHI nodes.
+  // We need to inspect all related PHI nodes, but PHIs can be cyclic, so
+  // OldPhiNodes is used to track all known PHI nodes; before adding a new
+  // PHI to PhiWorklist, it is checked against and added to OldPhiNodes first.
+  PhiWorklist.push_back(PN);
+  OldPhiNodes.insert(PN);
+  while (!PhiWorklist.empty()) {
+    auto *OldPN = PhiWorklist.pop_back_val();
+    for (Value *IncValue : OldPN->incoming_values()) {
+      if (isa<Constant>(IncValue))
+        continue;
+
+      if (auto *LI = dyn_cast<LoadInst>(IncValue)) {
+        // If there is a sequence of loads where each loaded value is used as
+        // the address of a later load, a bitcast is necessary to change the
+        // value type, so don't optimize that case. For simplicity we give up
+        // whenever the load address comes from another load.
+        Value *Addr = LI->getOperand(0);
+        if (Addr == &CI || isa<LoadInst>(Addr))
+          return nullptr;
+        if (LI->hasOneUse() && LI->isSimple())
+          continue;
+        // If a LoadInst has more than one use, changing the type of loaded
+        // value may create another bitcast.
+        return nullptr;
+      }
+
+      if (auto *PNode = dyn_cast<PHINode>(IncValue)) {
+        if (OldPhiNodes.insert(PNode))
+          PhiWorklist.push_back(PNode);
+        continue;
+      }
+
+      auto *BCI = dyn_cast<BitCastInst>(IncValue);
+      // We can't handle other instructions.
+      if (!BCI)
+        return nullptr;
+
+      // Verify it's an A->B cast.
+      Type *TyA = BCI->getOperand(0)->getType();
+      Type *TyB = BCI->getType();
+      if (TyA != DestTy || TyB != SrcTy)
+        return nullptr;
+    }
+  }
+
+  // For each old PHI node, create a corresponding new PHI node with type A.
+  SmallDenseMap<PHINode *, PHINode *> NewPNodes;
+  for (auto *OldPN : OldPhiNodes) {
+    Builder.SetInsertPoint(OldPN);
+    PHINode *NewPN = Builder.CreatePHI(DestTy, OldPN->getNumOperands());
+    NewPNodes[OldPN] = NewPN;
+  }
+
+  // Fill in the operands of new PHI nodes.
+  for (auto *OldPN : OldPhiNodes) {
+    PHINode *NewPN = NewPNodes[OldPN];
+    for (unsigned j = 0, e = OldPN->getNumOperands(); j != e; ++j) {
+      Value *V = OldPN->getOperand(j);
+      Value *NewV = nullptr;
+      if (auto *C = dyn_cast<Constant>(V)) {
+        NewV = ConstantExpr::getBitCast(C, DestTy);
+      } else if (auto *LI = dyn_cast<LoadInst>(V)) {
+        Builder.SetInsertPoint(LI->getNextNode());
+        NewV = Builder.CreateBitCast(LI, DestTy);
+        Worklist.Add(LI);
+      } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
+        NewV = BCI->getOperand(0);
+      } else if (auto *PrevPN = dyn_cast<PHINode>(V)) {
+        NewV = NewPNodes[PrevPN];
+      }
+      assert(NewV);
+      NewPN->addIncoming(NewV, OldPN->getIncomingBlock(j));
+    }
+  }
+
+  // Traverse all accumulated PHI nodes and process their users,
+  // which are Stores and BitCasts. Without this processing
+  // NewPHI nodes could be replicated and could lead to extra
+  // moves generated after DeSSA.
+  // If there is a store with type B, change it to type A.
+
+
+  // Replace users of BitCast B->A with NewPHI. These will help
+  // later to get rid of a closure formed by OldPHI nodes.
+ Instruction *RetVal = nullptr; + for (auto *OldPN : OldPhiNodes) { + PHINode *NewPN = NewPNodes[OldPN]; + for (User *V : OldPN->users()) { + if (auto *SI = dyn_cast<StoreInst>(V)) { + if (SI->isSimple() && SI->getOperand(0) == OldPN) { + Builder.SetInsertPoint(SI); + auto *NewBC = + cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy)); + SI->setOperand(0, NewBC); + Worklist.Add(SI); + assert(hasStoreUsersOnly(*NewBC)); + } + } + else if (auto *BCI = dyn_cast<BitCastInst>(V)) { + // Verify it's a B->A cast. + Type *TyB = BCI->getOperand(0)->getType(); + Type *TyA = BCI->getType(); + if (TyA == DestTy && TyB == SrcTy) { + Instruction *I = replaceInstUsesWith(*BCI, NewPN); + if (BCI == &CI) + RetVal = I; + } + } + } + } + + return RetVal; +} + +Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { + // If the operands are integer typed then apply the integer transforms, + // otherwise just apply the common ones. + Value *Src = CI.getOperand(0); + Type *SrcTy = Src->getType(); + Type *DestTy = CI.getType(); + + // Get rid of casts from one type to the same type. These are useless and can + // be replaced by the operand. + if (DestTy == Src->getType()) + return replaceInstUsesWith(CI, Src); + + if (PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) { + PointerType *SrcPTy = cast<PointerType>(SrcTy); + Type *DstElTy = DstPTy->getElementType(); + Type *SrcElTy = SrcPTy->getElementType(); + + // Casting pointers between the same type, but with different address spaces + // is an addrspace cast rather than a bitcast. + if ((DstElTy == SrcElTy) && + (DstPTy->getAddressSpace() != SrcPTy->getAddressSpace())) + return new AddrSpaceCastInst(Src, DestTy); + + // If we are casting a alloca to a pointer to a type of the same + // size, rewrite the allocation instruction to allocate the "right" type. + // There is no need to modify malloc calls because it is their bitcast that + // needs to be cleaned up. + if (AllocaInst *AI = dyn_cast<AllocaInst>(Src)) + if (Instruction *V = PromoteCastOfAllocation(CI, *AI)) + return V; + + // When the type pointed to is not sized the cast cannot be + // turned into a gep. + Type *PointeeType = + cast<PointerType>(Src->getType()->getScalarType())->getElementType(); + if (!PointeeType->isSized()) + return nullptr; + + // If the source and destination are pointers, and this cast is equivalent + // to a getelementptr X, 0, 0, 0... turn it into the appropriate gep. + // This can enhance SROA and other transforms that want type-safe pointers. + unsigned NumZeros = 0; + while (SrcElTy != DstElTy && + isa<CompositeType>(SrcElTy) && !SrcElTy->isPointerTy() && + SrcElTy->getNumContainedTypes() /* not "{}" */) { + SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(0U); + ++NumZeros; + } + + // If we found a path from the src to dest, create the getelementptr now. + if (SrcElTy == DstElTy) { + SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0)); + GetElementPtrInst *GEP = + GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs); + + // If the source pointer is dereferenceable, then assume it points to an + // allocated object and apply "inbounds" to the GEP. + bool CanBeNull; + if (Src->getPointerDereferenceableBytes(DL, CanBeNull)) { + // In a non-default address space (not 0), a null pointer can not be + // assumed inbounds, so ignore that case (dereferenceable_or_null). 
+ // The reason is that 'null' is not treated differently in these address + // spaces, and we consequently ignore the 'gep inbounds' special case + // for 'null' which allows 'inbounds' on 'null' if the indices are + // zeros. + if (SrcPTy->getAddressSpace() == 0 || !CanBeNull) + GEP->setIsInBounds(); + } + return GEP; + } + } + + if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) { + if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) { + Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType()); + return InsertElementInst::Create(UndefValue::get(DestTy), Elem, + Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); + // FIXME: Canonicalize bitcast(insertelement) -> insertelement(bitcast) + } + + if (isa<IntegerType>(SrcTy)) { + // If this is a cast from an integer to vector, check to see if the input + // is a trunc or zext of a bitcast from vector. If so, we can replace all + // the casts with a shuffle and (potentially) a bitcast. + if (isa<TruncInst>(Src) || isa<ZExtInst>(Src)) { + CastInst *SrcCast = cast<CastInst>(Src); + if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0))) + if (isa<VectorType>(BCIn->getOperand(0)->getType())) + if (Instruction *I = optimizeVectorResize(BCIn->getOperand(0), + cast<VectorType>(DestTy), *this)) + return I; + } + + // If the input is an 'or' instruction, we may be doing shifts and ors to + // assemble the elements of the vector manually. Try to rip the code out + // and replace it with insertelements. + if (Value *V = optimizeIntegerToVectorInsertions(CI, *this)) + return replaceInstUsesWith(CI, V); + } + } + + if (VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) { + if (SrcVTy->getNumElements() == 1) { + // If our destination is not a vector, then make this a straight + // scalar-scalar cast. + if (!DestTy->isVectorTy()) { + Value *Elem = + Builder.CreateExtractElement(Src, + Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); + return CastInst::Create(Instruction::BitCast, Elem, DestTy); + } + + // Otherwise, see if our source is an insert. If so, then use the scalar + // component directly: + // bitcast (inselt <1 x elt> V, X, 0) to <n x m> --> bitcast X to <n x m> + if (auto *InsElt = dyn_cast<InsertElementInst>(Src)) + return new BitCastInst(InsElt->getOperand(1), DestTy); + } + } + + if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Src)) { + // Okay, we have (bitcast (shuffle ..)). Check to see if this is + // a bitcast to a vector with the same # elts. + Value *ShufOp0 = Shuf->getOperand(0); + Value *ShufOp1 = Shuf->getOperand(1); + unsigned NumShufElts = Shuf->getType()->getVectorNumElements(); + unsigned NumSrcVecElts = ShufOp0->getType()->getVectorNumElements(); + if (Shuf->hasOneUse() && DestTy->isVectorTy() && + DestTy->getVectorNumElements() == NumShufElts && + NumShufElts == NumSrcVecElts) { + BitCastInst *Tmp; + // If either of the operands is a cast from CI.getType(), then + // evaluating the shuffle in the casted destination's type will allow + // us to eliminate at least one cast. + if (((Tmp = dyn_cast<BitCastInst>(ShufOp0)) && + Tmp->getOperand(0)->getType() == DestTy) || + ((Tmp = dyn_cast<BitCastInst>(ShufOp1)) && + Tmp->getOperand(0)->getType() == DestTy)) { + Value *LHS = Builder.CreateBitCast(ShufOp0, DestTy); + Value *RHS = Builder.CreateBitCast(ShufOp1, DestTy); + // Return a new shuffle vector. Use the same element ID's, as we + // know the vector types match #elts. 
+ return new ShuffleVectorInst(LHS, RHS, Shuf->getOperand(2)); + } + } + + // A bitcasted-to-scalar and byte-reversing shuffle is better recognized as + // a byte-swap: + // bitcast <N x i8> (shuf X, undef, <N, N-1,...0>) --> bswap (bitcast X) + // TODO: We should match the related pattern for bitreverse. + if (DestTy->isIntegerTy() && + DL.isLegalInteger(DestTy->getScalarSizeInBits()) && + SrcTy->getScalarSizeInBits() == 8 && NumShufElts % 2 == 0 && + Shuf->hasOneUse() && Shuf->isReverse()) { + assert(ShufOp0->getType() == SrcTy && "Unexpected shuffle mask"); + assert(isa<UndefValue>(ShufOp1) && "Unexpected shuffle op"); + Function *Bswap = + Intrinsic::getDeclaration(CI.getModule(), Intrinsic::bswap, DestTy); + Value *ScalarX = Builder.CreateBitCast(ShufOp0, DestTy); + return IntrinsicInst::Create(Bswap, { ScalarX }); + } + } + + // Handle the A->B->A cast, and there is an intervening PHI node. + if (PHINode *PN = dyn_cast<PHINode>(Src)) + if (Instruction *I = optimizeBitCastFromPhi(CI, PN)) + return I; + + if (Instruction *I = canonicalizeBitCastExtElt(CI, *this)) + return I; + + if (Instruction *I = foldBitCastBitwiseLogic(CI, Builder)) + return I; + + if (Instruction *I = foldBitCastSelect(CI, Builder)) + return I; + + if (SrcTy->isPointerTy()) + return commonPointerCastTransforms(CI); + return commonCastTransforms(CI); +} + +Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) { + // If the destination pointer element type is not the same as the source's + // first do a bitcast to the destination type, and then the addrspacecast. + // This allows the cast to be exposed to other transforms. + Value *Src = CI.getOperand(0); + PointerType *SrcTy = cast<PointerType>(Src->getType()->getScalarType()); + PointerType *DestTy = cast<PointerType>(CI.getType()->getScalarType()); + + Type *DestElemTy = DestTy->getElementType(); + if (SrcTy->getElementType() != DestElemTy) { + Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace()); + if (VectorType *VT = dyn_cast<VectorType>(CI.getType())) { + // Handle vectors of pointers. + MidTy = VectorType::get(MidTy, VT->getNumElements()); + } + + Value *NewBitCast = Builder.CreateBitCast(Src, MidTy); + return new AddrSpaceCastInst(NewBitCast, CI.getType()); + } + + return commonPointerCastTransforms(CI); +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp new file mode 100644 index 000000000000..a9f64feb600c --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -0,0 +1,6113 @@ +//===- InstCombineCompares.cpp --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the visitICmp and visitFCmp functions. 
+// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/ConstantRange.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "instcombine" + +// How many times is a select replaced by one of its operands? +STATISTIC(NumSel, "Number of select opts"); + + +/// Compute Result = In1+In2, returning true if the result overflowed for this +/// type. +static bool addWithOverflow(APInt &Result, const APInt &In1, + const APInt &In2, bool IsSigned = false) { + bool Overflow; + if (IsSigned) + Result = In1.sadd_ov(In2, Overflow); + else + Result = In1.uadd_ov(In2, Overflow); + + return Overflow; +} + +/// Compute Result = In1-In2, returning true if the result overflowed for this +/// type. +static bool subWithOverflow(APInt &Result, const APInt &In1, + const APInt &In2, bool IsSigned = false) { + bool Overflow; + if (IsSigned) + Result = In1.ssub_ov(In2, Overflow); + else + Result = In1.usub_ov(In2, Overflow); + + return Overflow; +} + +/// Given an icmp instruction, return true if any use of this comparison is a +/// branch on sign bit comparison. +static bool hasBranchUse(ICmpInst &I) { + for (auto *U : I.users()) + if (isa<BranchInst>(U)) + return true; + return false; +} + +/// Returns true if the exploded icmp can be expressed as a signed comparison +/// to zero and updates the predicate accordingly. +/// The signedness of the comparison is preserved. +/// TODO: Refactor with decomposeBitTestICmp()? +static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) { + if (!ICmpInst::isSigned(Pred)) + return false; + + if (C.isNullValue()) + return ICmpInst::isRelational(Pred); + + if (C.isOneValue()) { + if (Pred == ICmpInst::ICMP_SLT) { + Pred = ICmpInst::ICMP_SLE; + return true; + } + } else if (C.isAllOnesValue()) { + if (Pred == ICmpInst::ICMP_SGT) { + Pred = ICmpInst::ICMP_SGE; + return true; + } + } + + return false; +} + +/// Given a signed integer type and a set of known zero and one bits, compute +/// the maximum and minimum values that could have the specified known zero and +/// known one bits, returning them in Min/Max. +/// TODO: Move to method on KnownBits struct? +static void computeSignedMinMaxValuesFromKnownBits(const KnownBits &Known, + APInt &Min, APInt &Max) { + assert(Known.getBitWidth() == Min.getBitWidth() && + Known.getBitWidth() == Max.getBitWidth() && + "KnownZero, KnownOne and Min, Max must have equal bitwidth."); + APInt UnknownBits = ~(Known.Zero|Known.One); + + // The minimum value is when all unknown bits are zeros, EXCEPT for the sign + // bit if it is unknown. + Min = Known.One; + Max = Known.One|UnknownBits; + + if (UnknownBits.isNegative()) { // Sign bit is unknown + Min.setSignBit(); + Max.clearSignBit(); + } +} + +/// Given an unsigned integer type and a set of known zero and one bits, compute +/// the maximum and minimum values that could have the specified known zero and +/// known one bits, returning them in Min/Max. +/// TODO: Move to method on KnownBits struct? 
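+/// For example (hypothetical i8 values): with Known.Zero == 0xF0 and
+/// Known.One == 0x01, the unknown bits are 0x0E, so Min == 0x01 and
+/// Max == 0x0F.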
+static void computeUnsignedMinMaxValuesFromKnownBits(const KnownBits &Known, + APInt &Min, APInt &Max) { + assert(Known.getBitWidth() == Min.getBitWidth() && + Known.getBitWidth() == Max.getBitWidth() && + "Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth."); + APInt UnknownBits = ~(Known.Zero|Known.One); + + // The minimum value is when the unknown bits are all zeros. + Min = Known.One; + // The maximum value is when the unknown bits are all ones. + Max = Known.One|UnknownBits; +} + +/// This is called when we see this pattern: +/// cmp pred (load (gep GV, ...)), cmpcst +/// where GV is a global variable with a constant initializer. Try to simplify +/// this into some simple computation that does not need the load. For example +/// we can optimize "icmp eq (load (gep "foo", 0, i)), 0" into "icmp eq i, 3". +/// +/// If AndCst is non-null, then the loaded value is masked with that constant +/// before doing the comparison. This handles cases like "A[i]&4 == 0". +Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, + GlobalVariable *GV, + CmpInst &ICI, + ConstantInt *AndCst) { + Constant *Init = GV->getInitializer(); + if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init)) + return nullptr; + + uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); + // Don't blow up on huge arrays. + if (ArrayElementCount > MaxArraySizeForCombine) + return nullptr; + + // There are many forms of this optimization we can handle, for now, just do + // the simple index into a single-dimensional array. + // + // Require: GEP GV, 0, i {{, constant indices}} + if (GEP->getNumOperands() < 3 || + !isa<ConstantInt>(GEP->getOperand(1)) || + !cast<ConstantInt>(GEP->getOperand(1))->isZero() || + isa<Constant>(GEP->getOperand(2))) + return nullptr; + + // Check that indices after the variable are constants and in-range for the + // type they index. Collect the indices. This is typically for arrays of + // structs. + SmallVector<unsigned, 4> LaterIndices; + + Type *EltTy = Init->getType()->getArrayElementType(); + for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) { + ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i)); + if (!Idx) return nullptr; // Variable index. + + uint64_t IdxVal = Idx->getZExtValue(); + if ((unsigned)IdxVal != IdxVal) return nullptr; // Too large array index. + + if (StructType *STy = dyn_cast<StructType>(EltTy)) + EltTy = STy->getElementType(IdxVal); + else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) { + if (IdxVal >= ATy->getNumElements()) return nullptr; + EltTy = ATy->getElementType(); + } else { + return nullptr; // Unknown type. + } + + LaterIndices.push_back(IdxVal); + } + + enum { Overdefined = -3, Undefined = -2 }; + + // Variables for our state machines. + + // FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form + // "i == 47 | i == 87", where 47 is the first index the condition is true for, + // and 87 is the second (and last) index. FirstTrueElement is -2 when + // undefined, otherwise set to the first true element. SecondTrueElement is + // -2 when undefined, -3 when overdefined and >= 0 when that index is true. + int FirstTrueElement = Undefined, SecondTrueElement = Undefined; + + // FirstFalseElement/SecondFalseElement - Used to emit a comparison of the + // form "i != 47 & i != 87". Same state transitions as for true elements. 
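+  //
+  // As a small worked example (made-up initializer): for a constant array
+  // {10, 20, 10, 30} and the compare "A[i] == 10", the scan below ends with
+  // FirstTrueElement == 0 and SecondTrueElement == 2, so the load and compare
+  // are replaced by "i == 0 | i == 2".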
+ int FirstFalseElement = Undefined, SecondFalseElement = Undefined; + + /// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these + /// define a state machine that triggers for ranges of values that the index + /// is true or false for. This triggers on things like "abbbbc"[i] == 'b'. + /// This is -2 when undefined, -3 when overdefined, and otherwise the last + /// index in the range (inclusive). We use -2 for undefined here because we + /// use relative comparisons and don't want 0-1 to match -1. + int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined; + + // MagicBitvector - This is a magic bitvector where we set a bit if the + // comparison is true for element 'i'. If there are 64 elements or less in + // the array, this will fully represent all the comparison results. + uint64_t MagicBitvector = 0; + + // Scan the array and see if one of our patterns matches. + Constant *CompareRHS = cast<Constant>(ICI.getOperand(1)); + for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) { + Constant *Elt = Init->getAggregateElement(i); + if (!Elt) return nullptr; + + // If this is indexing an array of structures, get the structure element. + if (!LaterIndices.empty()) + Elt = ConstantExpr::getExtractValue(Elt, LaterIndices); + + // If the element is masked, handle it. + if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst); + + // Find out if the comparison would be true or false for the i'th element. + Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt, + CompareRHS, DL, &TLI); + // If the result is undef for this element, ignore it. + if (isa<UndefValue>(C)) { + // Extend range state machines to cover this element in case there is an + // undef in the middle of the range. + if (TrueRangeEnd == (int)i-1) + TrueRangeEnd = i; + if (FalseRangeEnd == (int)i-1) + FalseRangeEnd = i; + continue; + } + + // If we can't compute the result for any of the elements, we have to give + // up evaluating the entire conditional. + if (!isa<ConstantInt>(C)) return nullptr; + + // Otherwise, we know if the comparison is true or false for this element, + // update our state machines. + bool IsTrueForElt = !cast<ConstantInt>(C)->isZero(); + + // State machine for single/double/range index comparison. + if (IsTrueForElt) { + // Update the TrueElement state machine. + if (FirstTrueElement == Undefined) + FirstTrueElement = TrueRangeEnd = i; // First true element. + else { + // Update double-compare state machine. + if (SecondTrueElement == Undefined) + SecondTrueElement = i; + else + SecondTrueElement = Overdefined; + + // Update range state machine. + if (TrueRangeEnd == (int)i-1) + TrueRangeEnd = i; + else + TrueRangeEnd = Overdefined; + } + } else { + // Update the FalseElement state machine. + if (FirstFalseElement == Undefined) + FirstFalseElement = FalseRangeEnd = i; // First false element. + else { + // Update double-compare state machine. + if (SecondFalseElement == Undefined) + SecondFalseElement = i; + else + SecondFalseElement = Overdefined; + + // Update range state machine. + if (FalseRangeEnd == (int)i-1) + FalseRangeEnd = i; + else + FalseRangeEnd = Overdefined; + } + } + + // If this element is in range, update our magic bitvector. + if (i < 64 && IsTrueForElt) + MagicBitvector |= 1ULL << i; + + // If all of our states become overdefined, bail out early. Since the + // predicate is expensive, only check it every 8 elements. This is only + // really useful for really huge arrays. 
+ if ((i & 8) == 0 && i >= 64 && SecondTrueElement == Overdefined && + SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined && + FalseRangeEnd == Overdefined) + return nullptr; + } + + // Now that we've scanned the entire array, emit our new comparison(s). We + // order the state machines in complexity of the generated code. + Value *Idx = GEP->getOperand(2); + + // If the index is larger than the pointer size of the target, truncate the + // index down like the GEP would do implicitly. We don't have to do this for + // an inbounds GEP because the index can't be out of range. + if (!GEP->isInBounds()) { + Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); + unsigned PtrSize = IntPtrTy->getIntegerBitWidth(); + if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize) + Idx = Builder.CreateTrunc(Idx, IntPtrTy); + } + + // If the comparison is only true for one or two elements, emit direct + // comparisons. + if (SecondTrueElement != Overdefined) { + // None true -> false. + if (FirstTrueElement == Undefined) + return replaceInstUsesWith(ICI, Builder.getFalse()); + + Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement); + + // True for one element -> 'i == 47'. + if (SecondTrueElement == Undefined) + return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx); + + // True for two elements -> 'i == 47 | i == 72'. + Value *C1 = Builder.CreateICmpEQ(Idx, FirstTrueIdx); + Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement); + Value *C2 = Builder.CreateICmpEQ(Idx, SecondTrueIdx); + return BinaryOperator::CreateOr(C1, C2); + } + + // If the comparison is only false for one or two elements, emit direct + // comparisons. + if (SecondFalseElement != Overdefined) { + // None false -> true. + if (FirstFalseElement == Undefined) + return replaceInstUsesWith(ICI, Builder.getTrue()); + + Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement); + + // False for one element -> 'i != 47'. + if (SecondFalseElement == Undefined) + return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx); + + // False for two elements -> 'i != 47 & i != 72'. + Value *C1 = Builder.CreateICmpNE(Idx, FirstFalseIdx); + Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement); + Value *C2 = Builder.CreateICmpNE(Idx, SecondFalseIdx); + return BinaryOperator::CreateAnd(C1, C2); + } + + // If the comparison can be replaced with a range comparison for the elements + // where it is true, emit the range check. + if (TrueRangeEnd != Overdefined) { + assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare"); + + // Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1). + if (FirstTrueElement) { + Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement); + Idx = Builder.CreateAdd(Idx, Offs); + } + + Value *End = ConstantInt::get(Idx->getType(), + TrueRangeEnd-FirstTrueElement+1); + return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End); + } + + // False range check. + if (FalseRangeEnd != Overdefined) { + assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare"); + // Generate (i-FirstFalse) >u (FalseRangeEnd-FirstFalse). 
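+    // For instance (hypothetical case): if the compare is false exactly for
+    // the contiguous indices 2..4 and true elsewhere, this emits "(i-2) >u 2".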
+ if (FirstFalseElement) { + Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement); + Idx = Builder.CreateAdd(Idx, Offs); + } + + Value *End = ConstantInt::get(Idx->getType(), + FalseRangeEnd-FirstFalseElement); + return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End); + } + + // If a magic bitvector captures the entire comparison state + // of this load, replace it with computation that does: + // ((magic_cst >> i) & 1) != 0 + { + Type *Ty = nullptr; + + // Look for an appropriate type: + // - The type of Idx if the magic fits + // - The smallest fitting legal type + if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth()) + Ty = Idx->getType(); + else + Ty = DL.getSmallestLegalIntType(Init->getContext(), ArrayElementCount); + + if (Ty) { + Value *V = Builder.CreateIntCast(Idx, Ty, false); + V = Builder.CreateLShr(ConstantInt::get(Ty, MagicBitvector), V); + V = Builder.CreateAnd(ConstantInt::get(Ty, 1), V); + return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0)); + } + } + + return nullptr; +} + +/// Return a value that can be used to compare the *offset* implied by a GEP to +/// zero. For example, if we have &A[i], we want to return 'i' for +/// "icmp ne i, 0". Note that, in general, indices can be complex, and scales +/// are involved. The above expression would also be legal to codegen as +/// "icmp ne (i*4), 0" (assuming A is a pointer to i32). +/// This latter form is less amenable to optimization though, and we are allowed +/// to generate the first by knowing that pointer arithmetic doesn't overflow. +/// +/// If we can't emit an optimized form for this expression, this returns null. +/// +static Value *evaluateGEPOffsetExpression(User *GEP, InstCombiner &IC, + const DataLayout &DL) { + gep_type_iterator GTI = gep_type_begin(GEP); + + // Check to see if this gep only has a single variable index. If so, and if + // any constant indices are a multiple of its scale, then we can compute this + // in terms of the scale of the variable index. For example, if the GEP + // implies an offset of "12 + i*4", then we can codegen this as "3 + i", + // because the expression will cross zero at the same point. + unsigned i, e = GEP->getNumOperands(); + int64_t Offset = 0; + for (i = 1; i != e; ++i, ++GTI) { + if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) { + // Compute the aggregate offset of constant indices. + if (CI->isZero()) continue; + + // Handle a struct index, which adds its field offset to the pointer. + if (StructType *STy = GTI.getStructTypeOrNull()) { + Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue()); + } else { + uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); + Offset += Size*CI->getSExtValue(); + } + } else { + // Found our variable index. + break; + } + } + + // If there are no variable indices, we must have a constant offset, just + // evaluate it the general way. + if (i == e) return nullptr; + + Value *VariableIdx = GEP->getOperand(i); + // Determine the scale factor of the variable element. For example, this is + // 4 if the variable index is into an array of i32. + uint64_t VariableScale = DL.getTypeAllocSize(GTI.getIndexedType()); + + // Verify that there are no other variable indices. If so, emit the hard way. + for (++i, ++GTI; i != e; ++i, ++GTI) { + ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i)); + if (!CI) return nullptr; + + // Compute the aggregate offset of constant indices. 
+ if (CI->isZero()) continue; + + // Handle a struct index, which adds its field offset to the pointer. + if (StructType *STy = GTI.getStructTypeOrNull()) { + Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue()); + } else { + uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); + Offset += Size*CI->getSExtValue(); + } + } + + // Okay, we know we have a single variable index, which must be a + // pointer/array/vector index. If there is no offset, life is simple, return + // the index. + Type *IntPtrTy = DL.getIntPtrType(GEP->getOperand(0)->getType()); + unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth(); + if (Offset == 0) { + // Cast to intptrty in case a truncation occurs. If an extension is needed, + // we don't need to bother extending: the extension won't affect where the + // computation crosses zero. + if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) { + VariableIdx = IC.Builder.CreateTrunc(VariableIdx, IntPtrTy); + } + return VariableIdx; + } + + // Otherwise, there is an index. The computation we will do will be modulo + // the pointer size. + Offset = SignExtend64(Offset, IntPtrWidth); + VariableScale = SignExtend64(VariableScale, IntPtrWidth); + + // To do this transformation, any constant index must be a multiple of the + // variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i", + // but we can't evaluate "10 + 3*i" in terms of i. Check that the offset is a + // multiple of the variable scale. + int64_t NewOffs = Offset / (int64_t)VariableScale; + if (Offset != NewOffs*(int64_t)VariableScale) + return nullptr; + + // Okay, we can do this evaluation. Start by converting the index to intptr. + if (VariableIdx->getType() != IntPtrTy) + VariableIdx = IC.Builder.CreateIntCast(VariableIdx, IntPtrTy, + true /*Signed*/); + Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs); + return IC.Builder.CreateAdd(VariableIdx, OffsetVal, "offset"); +} + +/// Returns true if we can rewrite Start as a GEP with pointer Base +/// and some integer offset. The nodes that need to be re-written +/// for this transformation will be added to Explored. +static bool canRewriteGEPAsOffset(Value *Start, Value *Base, + const DataLayout &DL, + SetVector<Value *> &Explored) { + SmallVector<Value *, 16> WorkList(1, Start); + Explored.insert(Base); + + // The following traversal gives us an order which can be used + // when doing the final transformation. Since in the final + // transformation we create the PHI replacement instructions first, + // we don't have to get them in any particular order. + // + // However, for other instructions we will have to traverse the + // operands of an instruction first, which means that we have to + // do a post-order traversal. + while (!WorkList.empty()) { + SetVector<PHINode *> PHIs; + + while (!WorkList.empty()) { + if (Explored.size() >= 100) + return false; + + Value *V = WorkList.back(); + + if (Explored.count(V) != 0) { + WorkList.pop_back(); + continue; + } + + if (!isa<IntToPtrInst>(V) && !isa<PtrToIntInst>(V) && + !isa<GetElementPtrInst>(V) && !isa<PHINode>(V)) + // We've found some value that we can't explore which is different from + // the base. Therefore we can't do this transformation. 
+ return false; + + if (isa<IntToPtrInst>(V) || isa<PtrToIntInst>(V)) { + auto *CI = dyn_cast<CastInst>(V); + if (!CI->isNoopCast(DL)) + return false; + + if (Explored.count(CI->getOperand(0)) == 0) + WorkList.push_back(CI->getOperand(0)); + } + + if (auto *GEP = dyn_cast<GEPOperator>(V)) { + // We're limiting the GEP to having one index. This will preserve + // the original pointer type. We could handle more cases in the + // future. + if (GEP->getNumIndices() != 1 || !GEP->isInBounds() || + GEP->getType() != Start->getType()) + return false; + + if (Explored.count(GEP->getOperand(0)) == 0) + WorkList.push_back(GEP->getOperand(0)); + } + + if (WorkList.back() == V) { + WorkList.pop_back(); + // We've finished visiting this node, mark it as such. + Explored.insert(V); + } + + if (auto *PN = dyn_cast<PHINode>(V)) { + // We cannot transform PHIs on unsplittable basic blocks. + if (isa<CatchSwitchInst>(PN->getParent()->getTerminator())) + return false; + Explored.insert(PN); + PHIs.insert(PN); + } + } + + // Explore the PHI nodes further. + for (auto *PN : PHIs) + for (Value *Op : PN->incoming_values()) + if (Explored.count(Op) == 0) + WorkList.push_back(Op); + } + + // Make sure that we can do this. Since we can't insert GEPs in a basic + // block before a PHI node, we can't easily do this transformation if + // we have PHI node users of transformed instructions. + for (Value *Val : Explored) { + for (Value *Use : Val->uses()) { + + auto *PHI = dyn_cast<PHINode>(Use); + auto *Inst = dyn_cast<Instruction>(Val); + + if (Inst == Base || Inst == PHI || !Inst || !PHI || + Explored.count(PHI) == 0) + continue; + + if (PHI->getParent() == Inst->getParent()) + return false; + } + } + return true; +} + +// Sets the appropriate insert point on Builder where we can add +// a replacement Instruction for V (if that is possible). +static void setInsertionPoint(IRBuilder<> &Builder, Value *V, + bool Before = true) { + if (auto *PHI = dyn_cast<PHINode>(V)) { + Builder.SetInsertPoint(&*PHI->getParent()->getFirstInsertionPt()); + return; + } + if (auto *I = dyn_cast<Instruction>(V)) { + if (!Before) + I = &*std::next(I->getIterator()); + Builder.SetInsertPoint(I); + return; + } + if (auto *A = dyn_cast<Argument>(V)) { + // Set the insertion point in the entry block. + BasicBlock &Entry = A->getParent()->getEntryBlock(); + Builder.SetInsertPoint(&*Entry.getFirstInsertionPt()); + return; + } + // Otherwise, this is a constant and we don't need to set a new + // insertion point. + assert(isa<Constant>(V) && "Setting insertion point for unknown value!"); +} + +/// Returns a re-written value of Start as an indexed GEP using Base as a +/// pointer. +static Value *rewriteGEPAsOffset(Value *Start, Value *Base, + const DataLayout &DL, + SetVector<Value *> &Explored) { + // Perform all the substitutions. This is a bit tricky because we can + // have cycles in our use-def chains. + // 1. Create the PHI nodes without any incoming values. + // 2. Create all the other values. + // 3. Add the edges for the PHI nodes. + // 4. Emit GEPs to get the original pointers. + // 5. Remove the original instructions. + Type *IndexType = IntegerType::get( + Base->getContext(), DL.getIndexTypeSizeInBits(Start->getType())); + + DenseMap<Value *, Value *> NewInsts; + NewInsts[Base] = ConstantInt::getNullValue(IndexType); + + // Create the new PHI nodes, without adding any incoming values. + for (Value *Val : Explored) { + if (Val == Base) + continue; + // Create empty phi nodes. 
This avoids cyclic dependencies when creating + // the remaining instructions. + if (auto *PHI = dyn_cast<PHINode>(Val)) + NewInsts[PHI] = PHINode::Create(IndexType, PHI->getNumIncomingValues(), + PHI->getName() + ".idx", PHI); + } + IRBuilder<> Builder(Base->getContext()); + + // Create all the other instructions. + for (Value *Val : Explored) { + + if (NewInsts.find(Val) != NewInsts.end()) + continue; + + if (auto *CI = dyn_cast<CastInst>(Val)) { + // Don't get rid of the intermediate variable here; the store can grow + // the map which will invalidate the reference to the input value. + Value *V = NewInsts[CI->getOperand(0)]; + NewInsts[CI] = V; + continue; + } + if (auto *GEP = dyn_cast<GEPOperator>(Val)) { + Value *Index = NewInsts[GEP->getOperand(1)] ? NewInsts[GEP->getOperand(1)] + : GEP->getOperand(1); + setInsertionPoint(Builder, GEP); + // Indices might need to be sign extended. GEPs will magically do + // this, but we need to do it ourselves here. + if (Index->getType()->getScalarSizeInBits() != + NewInsts[GEP->getOperand(0)]->getType()->getScalarSizeInBits()) { + Index = Builder.CreateSExtOrTrunc( + Index, NewInsts[GEP->getOperand(0)]->getType(), + GEP->getOperand(0)->getName() + ".sext"); + } + + auto *Op = NewInsts[GEP->getOperand(0)]; + if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero()) + NewInsts[GEP] = Index; + else + NewInsts[GEP] = Builder.CreateNSWAdd( + Op, Index, GEP->getOperand(0)->getName() + ".add"); + continue; + } + if (isa<PHINode>(Val)) + continue; + + llvm_unreachable("Unexpected instruction type"); + } + + // Add the incoming values to the PHI nodes. + for (Value *Val : Explored) { + if (Val == Base) + continue; + // All the instructions have been created, we can now add edges to the + // phi nodes. + if (auto *PHI = dyn_cast<PHINode>(Val)) { + PHINode *NewPhi = static_cast<PHINode *>(NewInsts[PHI]); + for (unsigned I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) { + Value *NewIncoming = PHI->getIncomingValue(I); + + if (NewInsts.find(NewIncoming) != NewInsts.end()) + NewIncoming = NewInsts[NewIncoming]; + + NewPhi->addIncoming(NewIncoming, PHI->getIncomingBlock(I)); + } + } + } + + for (Value *Val : Explored) { + if (Val == Base) + continue; + + // Depending on the type, for external users we have to emit + // a GEP or a GEP + ptrtoint. + setInsertionPoint(Builder, Val, false); + + // If required, create an inttoptr instruction for Base. + Value *NewBase = Base; + if (!Base->getType()->isPointerTy()) + NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(), + Start->getName() + "to.ptr"); + + Value *GEP = Builder.CreateInBoundsGEP( + Start->getType()->getPointerElementType(), NewBase, + makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr"); + + if (!Val->getType()->isPointerTy()) { + Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(), + Val->getName() + ".conv"); + GEP = Cast; + } + Val->replaceAllUsesWith(GEP); + } + + return NewInsts[Start]; +} + +/// Looks through GEPs, IntToPtrInsts and PtrToIntInsts in order to express +/// the input Value as a constant indexed GEP. Returns a pair containing +/// the GEPs Pointer and Index. 
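+/// For example (illustrative IR): for
+///   %p = getelementptr inbounds i32, i32* %base, i64 4
+/// this returns the pair {%base, i64 4}.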
+static std::pair<Value *, Value *>
+getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
+  Type *IndexType = IntegerType::get(V->getContext(),
+                                     DL.getIndexTypeSizeInBits(V->getType()));
+
+  Constant *Index = ConstantInt::getNullValue(IndexType);
+  while (true) {
+    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      // We accept only inbounds GEPs here to exclude the possibility of
+      // overflow.
+      if (!GEP->isInBounds())
+        break;
+      if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 &&
+          GEP->getType() == V->getType()) {
+        V = GEP->getOperand(0);
+        Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1));
+        Index = ConstantExpr::getAdd(
+            Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType));
+        continue;
+      }
+      break;
+    }
+    if (auto *CI = dyn_cast<IntToPtrInst>(V)) {
+      if (!CI->isNoopCast(DL))
+        break;
+      V = CI->getOperand(0);
+      continue;
+    }
+    if (auto *CI = dyn_cast<PtrToIntInst>(V)) {
+      if (!CI->isNoopCast(DL))
+        break;
+      V = CI->getOperand(0);
+      continue;
+    }
+    break;
+  }
+  return {V, Index};
+}
+
+/// Converts (CMP GEPLHS, RHS) if this change would make RHS a constant.
+/// We can look through PHIs, GEPs and casts in order to determine a common base
+/// between GEPLHS and RHS.
+static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
+                                              ICmpInst::Predicate Cond,
+                                              const DataLayout &DL) {
+  // FIXME: Support vector of pointers.
+  if (GEPLHS->getType()->isVectorTy())
+    return nullptr;
+
+  if (!GEPLHS->hasAllConstantIndices())
+    return nullptr;
+
+  // Make sure the pointers have the same type.
+  if (GEPLHS->getType() != RHS->getType())
+    return nullptr;
+
+  Value *PtrBase, *Index;
+  std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL);
+
+  // The set of nodes that will take part in this transformation.
+  SetVector<Value *> Nodes;
+
+  if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes))
+    return nullptr;
+
+  // We know we can re-write this as
+  //   ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2))
+  // Since we've only looked through inbounds GEPs we know that we
+  // can't have overflow on either side. We can therefore re-write
+  // this as:
+  //   OFFSET1 cmp OFFSET2
+  Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes);
+
+  // RewriteGEPAsOffset has replaced RHS and all of its uses with a re-written
+  // GEP having PtrBase as the pointer base, and has returned in NewRHS the
+  // offset. Since Index is the offset of LHS to the base pointer, we will now
+  // compare the offsets instead of comparing the pointers.
+  return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Index, NewRHS);
+}
+
+/// Fold comparisons between a GEP instruction and something else. At this point
+/// we know that the GEP is on the LHS of the comparison.
+Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
+                                       ICmpInst::Predicate Cond,
+                                       Instruction &I) {
+  // Don't transform signed compares of GEPs into index compares. Even if the
+  // GEP is inbounds, the final add of the base pointer can have signed overflow
+  // and would change the result of the icmp.
+  // e.g. "&foo[0] <s &foo[1]" can't be folded to "true" because "foo" could be
+  // the maximum signed value for the pointer type.
+  if (ICmpInst::isSigned(Cond))
+    return nullptr;
+
+  // Look through bitcasts and addrspacecasts. We do not however want to remove
+  // 0 GEPs.
+  if (!isa<GetElementPtrInst>(RHS))
+    RHS = RHS->stripPointerCasts();
+
+  Value *PtrBase = GEPLHS->getOperand(0);
+  // FIXME: Support vector pointer GEPs.
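+  // For example (hypothetical IR, 64-bit pointers): with an unsigned predicate,
+  //   icmp ult (getelementptr inbounds i8, i8* %p, i64 %n), %p
+  // is folded below to "icmp slt i64 %n, 0".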
+ if (PtrBase == RHS && GEPLHS->isInBounds() && + !GEPLHS->getType()->isVectorTy()) { + // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0). + // This transformation (ignoring the base and scales) is valid because we + // know pointers can't overflow since the gep is inbounds. See if we can + // output an optimized form. + Value *Offset = evaluateGEPOffsetExpression(GEPLHS, *this, DL); + + // If not, synthesize the offset the hard way. + if (!Offset) + Offset = EmitGEPOffset(GEPLHS); + return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset, + Constant::getNullValue(Offset->getType())); + } + + if (GEPLHS->isInBounds() && ICmpInst::isEquality(Cond) && + isa<Constant>(RHS) && cast<Constant>(RHS)->isNullValue() && + !NullPointerIsDefined(I.getFunction(), + RHS->getType()->getPointerAddressSpace())) { + // For most address spaces, an allocation can't be placed at null, but null + // itself is treated as a 0 size allocation in the in bounds rules. Thus, + // the only valid inbounds address derived from null, is null itself. + // Thus, we have four cases to consider: + // 1) Base == nullptr, Offset == 0 -> inbounds, null + // 2) Base == nullptr, Offset != 0 -> poison as the result is out of bounds + // 3) Base != nullptr, Offset == (-base) -> poison (crossing allocations) + // 4) Base != nullptr, Offset != (-base) -> nonnull (and possibly poison) + // + // (Note if we're indexing a type of size 0, that simply collapses into one + // of the buckets above.) + // + // In general, we're allowed to make values less poison (i.e. remove + // sources of full UB), so in this case, we just select between the two + // non-poison cases (1 and 4 above). + // + // For vectors, we apply the same reasoning on a per-lane basis. + auto *Base = GEPLHS->getPointerOperand(); + if (GEPLHS->getType()->isVectorTy() && Base->getType()->isPointerTy()) { + int NumElts = GEPLHS->getType()->getVectorNumElements(); + Base = Builder.CreateVectorSplat(NumElts, Base); + } + return new ICmpInst(Cond, Base, + ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast<Constant>(RHS), Base->getType())); + } else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) { + // If the base pointers are different, but the indices are the same, just + // compare the base pointer. + if (PtrBase != GEPRHS->getOperand(0)) { + bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands(); + IndicesTheSame &= GEPLHS->getOperand(0)->getType() == + GEPRHS->getOperand(0)->getType(); + if (IndicesTheSame) + for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i) + if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) { + IndicesTheSame = false; + break; + } + + // If all indices are the same, just compare the base pointers. + Type *BaseType = GEPLHS->getOperand(0)->getType(); + if (IndicesTheSame && CmpInst::makeCmpResultType(BaseType) == I.getType()) + return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0)); + + // If we're comparing GEPs with two base pointers that only differ in type + // and both GEPs have only constant indices or just one use, then fold + // the compare with the adjusted indices. + // FIXME: Support vector of pointers. 
+ if (GEPLHS->isInBounds() && GEPRHS->isInBounds() && + (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) && + (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) && + PtrBase->stripPointerCasts() == + GEPRHS->getOperand(0)->stripPointerCasts() && + !GEPLHS->getType()->isVectorTy()) { + Value *LOffset = EmitGEPOffset(GEPLHS); + Value *ROffset = EmitGEPOffset(GEPRHS); + + // If we looked through an addrspacecast between different sized address + // spaces, the LHS and RHS pointers are different sized + // integers. Truncate to the smaller one. + Type *LHSIndexTy = LOffset->getType(); + Type *RHSIndexTy = ROffset->getType(); + if (LHSIndexTy != RHSIndexTy) { + if (LHSIndexTy->getPrimitiveSizeInBits() < + RHSIndexTy->getPrimitiveSizeInBits()) { + ROffset = Builder.CreateTrunc(ROffset, LHSIndexTy); + } else + LOffset = Builder.CreateTrunc(LOffset, RHSIndexTy); + } + + Value *Cmp = Builder.CreateICmp(ICmpInst::getSignedPredicate(Cond), + LOffset, ROffset); + return replaceInstUsesWith(I, Cmp); + } + + // Otherwise, the base pointers are different and the indices are + // different. Try convert this to an indexed compare by looking through + // PHIs/casts. + return transformToIndexedCompare(GEPLHS, RHS, Cond, DL); + } + + // If one of the GEPs has all zero indices, recurse. + // FIXME: Handle vector of pointers. + if (!GEPLHS->getType()->isVectorTy() && GEPLHS->hasAllZeroIndices()) + return foldGEPICmp(GEPRHS, GEPLHS->getOperand(0), + ICmpInst::getSwappedPredicate(Cond), I); + + // If the other GEP has all zero indices, recurse. + // FIXME: Handle vector of pointers. + if (!GEPRHS->getType()->isVectorTy() && GEPRHS->hasAllZeroIndices()) + return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I); + + bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds(); + if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) { + // If the GEPs only differ by one index, compare it. + unsigned NumDifferences = 0; // Keep track of # differences. + unsigned DiffOperand = 0; // The operand that differs. + for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i) + if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) { + Type *LHSType = GEPLHS->getOperand(i)->getType(); + Type *RHSType = GEPRHS->getOperand(i)->getType(); + // FIXME: Better support for vector of pointers. + if (LHSType->getPrimitiveSizeInBits() != + RHSType->getPrimitiveSizeInBits() || + (GEPLHS->getType()->isVectorTy() && + (!LHSType->isVectorTy() || !RHSType->isVectorTy()))) { + // Irreconcilable differences. + NumDifferences = 2; + break; + } + + if (NumDifferences++) break; + DiffOperand = i; + } + + if (NumDifferences == 0) // SAME GEP? + return replaceInstUsesWith(I, // No comparison is needed here. + ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond))); + + else if (NumDifferences == 1 && GEPsInBounds) { + Value *LHSV = GEPLHS->getOperand(DiffOperand); + Value *RHSV = GEPRHS->getOperand(DiffOperand); + // Make sure we do a signed comparison here. + return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV); + } + } + + // Only lower this if the icmp is the only user of the GEP or if we expect + // the result to fold to a constant! 
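+    //
+    // For example (hypothetical IR): for two one-use inbounds GEPs off the same
+    // base %p whose indices differ in more than one position, e.g.
+    //   gep inbounds [10 x i32], [10 x i32]* %p, i64 0, i64 %i
+    //   gep inbounds [10 x i32], [10 x i32]* %p, i64 %j, i64 0
+    // an unsigned or equality compare of the GEPs becomes a signed compare of
+    // the two emitted offsets (%i * 4 versus %j * 40).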
+ if (GEPsInBounds && (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) && + (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) { + // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2) + Value *L = EmitGEPOffset(GEPLHS); + Value *R = EmitGEPOffset(GEPRHS); + return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R); + } + } + + // Try convert this to an indexed compare by looking through PHIs/casts as a + // last resort. + return transformToIndexedCompare(GEPLHS, RHS, Cond, DL); +} + +Instruction *InstCombiner::foldAllocaCmp(ICmpInst &ICI, + const AllocaInst *Alloca, + const Value *Other) { + assert(ICI.isEquality() && "Cannot fold non-equality comparison."); + + // It would be tempting to fold away comparisons between allocas and any + // pointer not based on that alloca (e.g. an argument). However, even + // though such pointers cannot alias, they can still compare equal. + // + // But LLVM doesn't specify where allocas get their memory, so if the alloca + // doesn't escape we can argue that it's impossible to guess its value, and we + // can therefore act as if any such guesses are wrong. + // + // The code below checks that the alloca doesn't escape, and that it's only + // used in a comparison once (the current instruction). The + // single-comparison-use condition ensures that we're trivially folding all + // comparisons against the alloca consistently, and avoids the risk of + // erroneously folding a comparison of the pointer with itself. + + unsigned MaxIter = 32; // Break cycles and bound to constant-time. + + SmallVector<const Use *, 32> Worklist; + for (const Use &U : Alloca->uses()) { + if (Worklist.size() >= MaxIter) + return nullptr; + Worklist.push_back(&U); + } + + unsigned NumCmps = 0; + while (!Worklist.empty()) { + assert(Worklist.size() <= MaxIter); + const Use *U = Worklist.pop_back_val(); + const Value *V = U->getUser(); + --MaxIter; + + if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) || + isa<SelectInst>(V)) { + // Track the uses. + } else if (isa<LoadInst>(V)) { + // Loading from the pointer doesn't escape it. + continue; + } else if (const auto *SI = dyn_cast<StoreInst>(V)) { + // Storing *to* the pointer is fine, but storing the pointer escapes it. + if (SI->getValueOperand() == U->get()) + return nullptr; + continue; + } else if (isa<ICmpInst>(V)) { + if (NumCmps++) + return nullptr; // Found more than one cmp. + continue; + } else if (const auto *Intrin = dyn_cast<IntrinsicInst>(V)) { + switch (Intrin->getIntrinsicID()) { + // These intrinsics don't escape or compare the pointer. Memset is safe + // because we don't allow ptrtoint. Memcpy and memmove are safe because + // we don't allow stores, so src cannot point to V. + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: + continue; + default: + return nullptr; + } + } else { + return nullptr; + } + for (const Use &U : V->uses()) { + if (Worklist.size() >= MaxIter) + return nullptr; + Worklist.push_back(&U); + } + } + + Type *CmpTy = CmpInst::makeCmpResultType(Other->getType()); + return replaceInstUsesWith( + ICI, + ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate()))); +} + +/// Fold "icmp pred (X+C), X". +Instruction *InstCombiner::foldICmpAddOpConst(Value *X, const APInt &C, + ICmpInst::Predicate Pred) { + // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0, + // so the values can never be equal. 
Similarly for all other "or equals" + // operators. + assert(!!C && "C should not be zero!"); + + // (X+1) <u X --> X >u (MAXUINT-1) --> X == 255 + // (X+2) <u X --> X >u (MAXUINT-2) --> X > 253 + // (X+MAXUINT) <u X --> X >u (MAXUINT-MAXUINT) --> X != 0 + if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) { + Constant *R = ConstantInt::get(X->getType(), + APInt::getMaxValue(C.getBitWidth()) - C); + return new ICmpInst(ICmpInst::ICMP_UGT, X, R); + } + + // (X+1) >u X --> X <u (0-1) --> X != 255 + // (X+2) >u X --> X <u (0-2) --> X <u 254 + // (X+MAXUINT) >u X --> X <u (0-MAXUINT) --> X <u 1 --> X == 0 + if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) + return new ICmpInst(ICmpInst::ICMP_ULT, X, + ConstantInt::get(X->getType(), -C)); + + APInt SMax = APInt::getSignedMaxValue(C.getBitWidth()); + + // (X+ 1) <s X --> X >s (MAXSINT-1) --> X == 127 + // (X+ 2) <s X --> X >s (MAXSINT-2) --> X >s 125 + // (X+MAXSINT) <s X --> X >s (MAXSINT-MAXSINT) --> X >s 0 + // (X+MINSINT) <s X --> X >s (MAXSINT-MINSINT) --> X >s -1 + // (X+ -2) <s X --> X >s (MAXSINT- -2) --> X >s 126 + // (X+ -1) <s X --> X >s (MAXSINT- -1) --> X != 127 + if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) + return new ICmpInst(ICmpInst::ICMP_SGT, X, + ConstantInt::get(X->getType(), SMax - C)); + + // (X+ 1) >s X --> X <s (MAXSINT-(1-1)) --> X != 127 + // (X+ 2) >s X --> X <s (MAXSINT-(2-1)) --> X <s 126 + // (X+MAXSINT) >s X --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1 + // (X+MINSINT) >s X --> X <s (MAXSINT-(MINSINT-1)) --> X <s -2 + // (X+ -2) >s X --> X <s (MAXSINT-(-2-1)) --> X <s -126 + // (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128 + + assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE); + return new ICmpInst(ICmpInst::ICMP_SLT, X, + ConstantInt::get(X->getType(), SMax - (C - 1))); +} + +/// Handle "(icmp eq/ne (ashr/lshr AP2, A), AP1)" -> +/// (icmp eq/ne A, Log2(AP2/AP1)) -> +/// (icmp eq/ne A, Log2(AP2) - Log2(AP1)). +Instruction *InstCombiner::foldICmpShrConstConst(ICmpInst &I, Value *A, + const APInt &AP1, + const APInt &AP2) { + assert(I.isEquality() && "Cannot fold icmp gt/lt"); + + auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) { + if (I.getPredicate() == I.ICMP_NE) + Pred = CmpInst::getInversePredicate(Pred); + return new ICmpInst(Pred, LHS, RHS); + }; + + // Don't bother doing any work for cases which InstSimplify handles. + if (AP2.isNullValue()) + return nullptr; + + bool IsAShr = isa<AShrOperator>(I.getOperand(0)); + if (IsAShr) { + if (AP2.isAllOnesValue()) + return nullptr; + if (AP2.isNegative() != AP1.isNegative()) + return nullptr; + if (AP2.sgt(AP1)) + return nullptr; + } + + if (!AP1) + // 'A' must be large enough to shift out the highest set bit. + return getICmp(I.ICMP_UGT, A, + ConstantInt::get(A->getType(), AP2.logBase2())); + + if (AP1 == AP2) + return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType())); + + int Shift; + if (IsAShr && AP1.isNegative()) + Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes(); + else + Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros(); + + if (Shift > 0) { + if (IsAShr && AP1 == AP2.ashr(Shift)) { + // There are multiple solutions if we are comparing against -1 and the LHS + // of the ashr is not a power of two. 
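+      // As an illustration (made-up i8 constants): "(ashr i8 -32, A) == -1" is
+      // true for every in-range shift amount A >= 5, so it becomes "A uge 5"
+      // instead of an equality test.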
+ if (AP1.isAllOnesValue() && !AP2.isPowerOf2()) + return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift)); + return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); + } else if (AP1 == AP2.lshr(Shift)) { + return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); + } + } + + // Shifting const2 will never be equal to const1. + // FIXME: This should always be handled by InstSimplify? + auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE); + return replaceInstUsesWith(I, TorF); +} + +/// Handle "(icmp eq/ne (shl AP2, A), AP1)" -> +/// (icmp eq/ne A, TrailingZeros(AP1) - TrailingZeros(AP2)). +Instruction *InstCombiner::foldICmpShlConstConst(ICmpInst &I, Value *A, + const APInt &AP1, + const APInt &AP2) { + assert(I.isEquality() && "Cannot fold icmp gt/lt"); + + auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) { + if (I.getPredicate() == I.ICMP_NE) + Pred = CmpInst::getInversePredicate(Pred); + return new ICmpInst(Pred, LHS, RHS); + }; + + // Don't bother doing any work for cases which InstSimplify handles. + if (AP2.isNullValue()) + return nullptr; + + unsigned AP2TrailingZeros = AP2.countTrailingZeros(); + + if (!AP1 && AP2TrailingZeros != 0) + return getICmp( + I.ICMP_UGE, A, + ConstantInt::get(A->getType(), AP2.getBitWidth() - AP2TrailingZeros)); + + if (AP1 == AP2) + return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType())); + + // Get the distance between the lowest bits that are set. + int Shift = AP1.countTrailingZeros() - AP2TrailingZeros; + + if (Shift > 0 && AP2.shl(Shift) == AP1) + return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); + + // Shifting const2 will never be equal to const1. + // FIXME: This should always be handled by InstSimplify? + auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE); + return replaceInstUsesWith(I, TorF); +} + +/// The caller has matched a pattern of the form: +/// I = icmp ugt (add (add A, B), CI2), CI1 +/// If this is of the form: +/// sum = a + b +/// if (sum+128 >u 255) +/// Then replace it with llvm.sadd.with.overflow.i8. +/// +static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, + ConstantInt *CI2, ConstantInt *CI1, + InstCombiner &IC) { + // The transformation we're trying to do here is to transform this into an + // llvm.sadd.with.overflow. To do this, we have to replace the original add + // with a narrower add, and discard the add-with-constant that is part of the + // range check (if we can't eliminate it, this isn't profitable). + + // In order to eliminate the add-with-constant, the compare can be its only + // use. + Instruction *AddWithCst = cast<Instruction>(I.getOperand(0)); + if (!AddWithCst->hasOneUse()) + return nullptr; + + // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow. + if (!CI2->getValue().isPowerOf2()) + return nullptr; + unsigned NewWidth = CI2->getValue().countTrailingZeros(); + if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31) + return nullptr; + + // The width of the new add formed is 1 more than the bias. + ++NewWidth; + + // Check to see that CI1 is an all-ones value with NewWidth bits. + if (CI1->getBitWidth() == NewWidth || + CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth)) + return nullptr; + + // This is only really a signed overflow check if the inputs have been + // sign-extended; check for that condition. For example, if CI2 is 2^31 and + // the operands of the add are 64 bits wide, we need at least 33 sign bits. 
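+  //
+  // For instance (hypothetical widths): i32 operands that were sign-extended
+  // from i8, with CI2 == 128 and CI1 == 255, give NewWidth == 8 and
+  // NeededSignBits == 25, which sext from i8 provides exactly.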
+ unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1; + if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits || + IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits) + return nullptr; + + // In order to replace the original add with a narrower + // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant + // and truncates that discard the high bits of the add. Verify that this is + // the case. + Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0)); + for (User *U : OrigAdd->users()) { + if (U == AddWithCst) + continue; + + // Only accept truncates for now. We would really like a nice recursive + // predicate like SimplifyDemandedBits, but which goes downwards the use-def + // chain to see which bits of a value are actually demanded. If the + // original add had another add which was then immediately truncated, we + // could still do the transformation. + TruncInst *TI = dyn_cast<TruncInst>(U); + if (!TI || TI->getType()->getPrimitiveSizeInBits() > NewWidth) + return nullptr; + } + + // If the pattern matches, truncate the inputs to the narrower type and + // use the sadd_with_overflow intrinsic to efficiently compute both the + // result and the overflow bit. + Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth); + Function *F = Intrinsic::getDeclaration( + I.getModule(), Intrinsic::sadd_with_overflow, NewType); + + InstCombiner::BuilderTy &Builder = IC.Builder; + + // Put the new code above the original add, in case there are any uses of the + // add between the add and the compare. + Builder.SetInsertPoint(OrigAdd); + + Value *TruncA = Builder.CreateTrunc(A, NewType, A->getName() + ".trunc"); + Value *TruncB = Builder.CreateTrunc(B, NewType, B->getName() + ".trunc"); + CallInst *Call = Builder.CreateCall(F, {TruncA, TruncB}, "sadd"); + Value *Add = Builder.CreateExtractValue(Call, 0, "sadd.result"); + Value *ZExt = Builder.CreateZExt(Add, OrigAdd->getType()); + + // The inner add was the result of the narrow add, zero extended to the + // wider type. Replace it with the result computed by the intrinsic. + IC.replaceInstUsesWith(*OrigAdd, ZExt); + + // The original icmp gets replaced with the overflow value. + return ExtractValueInst::Create(Call, 1, "sadd.overflow"); +} + +/// If we have: +/// icmp eq/ne (urem/srem %x, %y), 0 +/// iff %y is a power-of-two, we can replace this with a bit test: +/// icmp eq/ne (and %x, (add %y, -1)), 0 +Instruction *InstCombiner::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) { + // This fold is only valid for equality predicates. + if (!I.isEquality()) + return nullptr; + ICmpInst::Predicate Pred; + Value *X, *Y, *Zero; + if (!match(&I, m_ICmp(Pred, m_OneUse(m_IRem(m_Value(X), m_Value(Y))), + m_CombineAnd(m_Zero(), m_Value(Zero))))) + return nullptr; + if (!isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, 0, &I)) + return nullptr; + // This may increase instruction count, we don't enforce that Y is a constant. + Value *Mask = Builder.CreateAdd(Y, Constant::getAllOnesValue(Y->getType())); + Value *Masked = Builder.CreateAnd(X, Mask); + return ICmpInst::Create(Instruction::ICmp, Pred, Masked, Zero); +} + +/// Fold equality-comparison between zero and any (maybe truncated) right-shift +/// by one-less-than-bitwidth into a sign test on the original value. 
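+/// For example (illustrative IR):
+///   icmp eq (lshr i32 %x, 31), 0  -->  icmp sge i32 %x, 0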
+Instruction *InstCombiner::foldSignBitTest(ICmpInst &I) { + Instruction *Val; + ICmpInst::Predicate Pred; + if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero()))) + return nullptr; + + Value *X; + Type *XTy; + + Constant *C; + if (match(Val, m_TruncOrSelf(m_Shr(m_Value(X), m_Constant(C))))) { + XTy = X->getType(); + unsigned XBitWidth = XTy->getScalarSizeInBits(); + if (!match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(XBitWidth, XBitWidth - 1)))) + return nullptr; + } else if (isa<BinaryOperator>(Val) && + (X = reassociateShiftAmtsOfTwoSameDirectionShifts( + cast<BinaryOperator>(Val), SQ.getWithInstruction(Val), + /*AnalyzeForSignBitExtraction=*/true))) { + XTy = X->getType(); + } else + return nullptr; + + return ICmpInst::Create(Instruction::ICmp, + Pred == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_SGE + : ICmpInst::ICMP_SLT, + X, ConstantInt::getNullValue(XTy)); +} + +// Handle icmp pred X, 0 +Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) { + CmpInst::Predicate Pred = Cmp.getPredicate(); + if (!match(Cmp.getOperand(1), m_Zero())) + return nullptr; + + // (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0) + if (Pred == ICmpInst::ICMP_SGT) { + Value *A, *B; + SelectPatternResult SPR = matchSelectPattern(Cmp.getOperand(0), A, B); + if (SPR.Flavor == SPF_SMIN) { + if (isKnownPositive(A, DL, 0, &AC, &Cmp, &DT)) + return new ICmpInst(Pred, B, Cmp.getOperand(1)); + if (isKnownPositive(B, DL, 0, &AC, &Cmp, &DT)) + return new ICmpInst(Pred, A, Cmp.getOperand(1)); + } + } + + if (Instruction *New = foldIRemByPowerOfTwoToBitTest(Cmp)) + return New; + + // Given: + // icmp eq/ne (urem %x, %y), 0 + // Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem': + // icmp eq/ne %x, 0 + Value *X, *Y; + if (match(Cmp.getOperand(0), m_URem(m_Value(X), m_Value(Y))) && + ICmpInst::isEquality(Pred)) { + KnownBits XKnown = computeKnownBits(X, 0, &Cmp); + KnownBits YKnown = computeKnownBits(Y, 0, &Cmp); + if (XKnown.countMaxPopulation() == 1 && YKnown.countMinPopulation() >= 2) + return new ICmpInst(Pred, X, Cmp.getOperand(1)); + } + + return nullptr; +} + +/// Fold icmp Pred X, C. +/// TODO: This code structure does not make sense. The saturating add fold +/// should be moved to some other helper and extended as noted below (it is also +/// possible that code has been made unnecessary - do we canonicalize IR to +/// overflow/saturating intrinsics or not?). +Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) { + // Match the following pattern, which is a common idiom when writing + // overflow-safe integer arithmetic functions. The source performs an addition + // in wider type and explicitly checks for overflow using comparisons against + // INT_MIN and INT_MAX. Simplify by using the sadd_with_overflow intrinsic. + // + // TODO: This could probably be generalized to handle other overflow-safe + // operations if we worked out the formulas to compute the appropriate magic + // constants. + // + // sum = a + b + // if (sum+128 >u 255) ... 
-> llvm.sadd.with.overflow.i8 + CmpInst::Predicate Pred = Cmp.getPredicate(); + Value *Op0 = Cmp.getOperand(0), *Op1 = Cmp.getOperand(1); + Value *A, *B; + ConstantInt *CI, *CI2; // I = icmp ugt (add (add A, B), CI2), CI + if (Pred == ICmpInst::ICMP_UGT && match(Op1, m_ConstantInt(CI)) && + match(Op0, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2)))) + if (Instruction *Res = processUGT_ADDCST_ADD(Cmp, A, B, CI2, CI, *this)) + return Res; + + return nullptr; +} + +/// Canonicalize icmp instructions based on dominating conditions. +Instruction *InstCombiner::foldICmpWithDominatingICmp(ICmpInst &Cmp) { + // This is a cheap/incomplete check for dominance - just match a single + // predecessor with a conditional branch. + BasicBlock *CmpBB = Cmp.getParent(); + BasicBlock *DomBB = CmpBB->getSinglePredecessor(); + if (!DomBB) + return nullptr; + + Value *DomCond; + BasicBlock *TrueBB, *FalseBB; + if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB))) + return nullptr; + + assert((TrueBB == CmpBB || FalseBB == CmpBB) && + "Predecessor block does not point to successor?"); + + // The branch should get simplified. Don't bother simplifying this condition. + if (TrueBB == FalseBB) + return nullptr; + + // Try to simplify this compare to T/F based on the dominating condition. + Optional<bool> Imp = isImpliedCondition(DomCond, &Cmp, DL, TrueBB == CmpBB); + if (Imp) + return replaceInstUsesWith(Cmp, ConstantInt::get(Cmp.getType(), *Imp)); + + CmpInst::Predicate Pred = Cmp.getPredicate(); + Value *X = Cmp.getOperand(0), *Y = Cmp.getOperand(1); + ICmpInst::Predicate DomPred; + const APInt *C, *DomC; + if (match(DomCond, m_ICmp(DomPred, m_Specific(X), m_APInt(DomC))) && + match(Y, m_APInt(C))) { + // We have 2 compares of a variable with constants. Calculate the constant + // ranges of those compares to see if we can transform the 2nd compare: + // DomBB: + // DomCond = icmp DomPred X, DomC + // br DomCond, CmpBB, FalseBB + // CmpBB: + // Cmp = icmp Pred X, C + ConstantRange CR = ConstantRange::makeAllowedICmpRegion(Pred, *C); + ConstantRange DominatingCR = + (CmpBB == TrueBB) ? ConstantRange::makeExactICmpRegion(DomPred, *DomC) + : ConstantRange::makeExactICmpRegion( + CmpInst::getInversePredicate(DomPred), *DomC); + ConstantRange Intersection = DominatingCR.intersectWith(CR); + ConstantRange Difference = DominatingCR.difference(CR); + if (Intersection.isEmptySet()) + return replaceInstUsesWith(Cmp, Builder.getFalse()); + if (Difference.isEmptySet()) + return replaceInstUsesWith(Cmp, Builder.getTrue()); + + // Canonicalizing a sign bit comparison that gets used in a branch, + // pessimizes codegen by generating branch on zero instruction instead + // of a test and branch. So we avoid canonicalizing in such situations + // because test and branch instruction has better branch displacement + // than compare and branch instruction. + bool UnusedBit; + bool IsSignBit = isSignBitCheck(Pred, *C, UnusedBit); + if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp))) + return nullptr; + + if (const APInt *EqC = Intersection.getSingleElement()) + return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*EqC)); + if (const APInt *NeC = Difference.getSingleElement()) + return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*NeC)); + } + + return nullptr; +} + +/// Fold icmp (trunc X, Y), C. 
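+/// For example (illustrative IR): if the truncated-out bits of %x are all
+/// known zero and the trunc has one use,
+///   icmp eq (trunc i32 %x to i8), 42  -->  icmp eq i32 %x, 42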
+Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp, + TruncInst *Trunc, + const APInt &C) { + ICmpInst::Predicate Pred = Cmp.getPredicate(); + Value *X = Trunc->getOperand(0); + if (C.isOneValue() && C.getBitWidth() > 1) { + // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1 + Value *V = nullptr; + if (Pred == ICmpInst::ICMP_SLT && match(X, m_Signum(m_Value(V)))) + return new ICmpInst(ICmpInst::ICMP_SLT, V, + ConstantInt::get(V->getType(), 1)); + } + + if (Cmp.isEquality() && Trunc->hasOneUse()) { + // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all + // of the high bits truncated out of x are known. + unsigned DstBits = Trunc->getType()->getScalarSizeInBits(), + SrcBits = X->getType()->getScalarSizeInBits(); + KnownBits Known = computeKnownBits(X, 0, &Cmp); + + // If all the high bits are known, we can do this xform. + if ((Known.Zero | Known.One).countLeadingOnes() >= SrcBits - DstBits) { + // Pull in the high bits from known-ones set. + APInt NewRHS = C.zext(SrcBits); + NewRHS |= Known.One & APInt::getHighBitsSet(SrcBits, SrcBits - DstBits); + return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), NewRHS)); + } + } + + return nullptr; +} + +/// Fold icmp (xor X, Y), C. +Instruction *InstCombiner::foldICmpXorConstant(ICmpInst &Cmp, + BinaryOperator *Xor, + const APInt &C) { + Value *X = Xor->getOperand(0); + Value *Y = Xor->getOperand(1); + const APInt *XorC; + if (!match(Y, m_APInt(XorC))) + return nullptr; + + // If this is a comparison that tests the signbit (X < 0) or (x > -1), + // fold the xor. + ICmpInst::Predicate Pred = Cmp.getPredicate(); + bool TrueIfSigned = false; + if (isSignBitCheck(Cmp.getPredicate(), C, TrueIfSigned)) { + + // If the sign bit of the XorCst is not set, there is no change to + // the operation, just stop using the Xor. + if (!XorC->isNegative()) { + Cmp.setOperand(0, X); + Worklist.Add(Xor); + return &Cmp; + } + + // Emit the opposite comparison. + if (TrueIfSigned) + return new ICmpInst(ICmpInst::ICMP_SGT, X, + ConstantInt::getAllOnesValue(X->getType())); + else + return new ICmpInst(ICmpInst::ICMP_SLT, X, + ConstantInt::getNullValue(X->getType())); + } + + if (Xor->hasOneUse()) { + // (icmp u/s (xor X SignMask), C) -> (icmp s/u X, (xor C SignMask)) + if (!Cmp.isEquality() && XorC->isSignMask()) { + Pred = Cmp.isSigned() ? Cmp.getUnsignedPredicate() + : Cmp.getSignedPredicate(); + return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), C ^ *XorC)); + } + + // (icmp u/s (xor X ~SignMask), C) -> (icmp s/u X, (xor C ~SignMask)) + if (!Cmp.isEquality() && XorC->isMaxSignedValue()) { + Pred = Cmp.isSigned() ? Cmp.getUnsignedPredicate() + : Cmp.getSignedPredicate(); + Pred = Cmp.getSwappedPredicate(Pred); + return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), C ^ *XorC)); + } + } + + // Mask constant magic can eliminate an 'xor' with unsigned compares. 
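+  // For instance (made-up i8 constants):
+  //   icmp ugt (xor i8 %x, -32), 31  -->  icmp ult i8 %x, -32
+  // because flipping the top three bits and asking for "> 31" only asks whether
+  // those bits of %x are not all set, i.e. %x <u 224.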
+ if (Pred == ICmpInst::ICMP_UGT) { + // (xor X, ~C) >u C --> X <u ~C (when C+1 is a power of 2) + if (*XorC == ~C && (C + 1).isPowerOf2()) + return new ICmpInst(ICmpInst::ICMP_ULT, X, Y); + // (xor X, C) >u C --> X >u C (when C+1 is a power of 2) + if (*XorC == C && (C + 1).isPowerOf2()) + return new ICmpInst(ICmpInst::ICMP_UGT, X, Y); + } + if (Pred == ICmpInst::ICMP_ULT) { + // (xor X, -C) <u C --> X >u ~C (when C is a power of 2) + if (*XorC == -C && C.isPowerOf2()) + return new ICmpInst(ICmpInst::ICMP_UGT, X, + ConstantInt::get(X->getType(), ~C)); + // (xor X, C) <u C --> X >u ~C (when -C is a power of 2) + if (*XorC == C && (-C).isPowerOf2()) + return new ICmpInst(ICmpInst::ICMP_UGT, X, + ConstantInt::get(X->getType(), ~C)); + } + return nullptr; +} + +/// Fold icmp (and (sh X, Y), C2), C1. +Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And, + const APInt &C1, const APInt &C2) { + BinaryOperator *Shift = dyn_cast<BinaryOperator>(And->getOperand(0)); + if (!Shift || !Shift->isShift()) + return nullptr; + + // If this is: (X >> C3) & C2 != C1 (where any shift and any compare could + // exist), turn it into (X & (C2 << C3)) != (C1 << C3). This happens a LOT in + // code produced by the clang front-end, for bitfield access. + // This seemingly simple opportunity to fold away a shift turns out to be + // rather complicated. See PR17827 for details. + unsigned ShiftOpcode = Shift->getOpcode(); + bool IsShl = ShiftOpcode == Instruction::Shl; + const APInt *C3; + if (match(Shift->getOperand(1), m_APInt(C3))) { + bool CanFold = false; + if (ShiftOpcode == Instruction::Shl) { + // For a left shift, we can fold if the comparison is not signed. We can + // also fold a signed comparison if the mask value and comparison value + // are not negative. These constraints may not be obvious, but we can + // prove that they are correct using an SMT solver. + if (!Cmp.isSigned() || (!C2.isNegative() && !C1.isNegative())) + CanFold = true; + } else { + bool IsAshr = ShiftOpcode == Instruction::AShr; + // For a logical right shift, we can fold if the comparison is not signed. + // We can also fold a signed comparison if the shifted mask value and the + // shifted comparison value are not negative. These constraints may not be + // obvious, but we can prove that they are correct using an SMT solver. + // For an arithmetic shift right we can do the same, if we ensure + // the And doesn't use any bits being shifted in. Normally these would + // be turned into lshr by SimplifyDemandedBits, but not if there is an + // additional user. + if (!IsAshr || (C2.shl(*C3).lshr(*C3) == C2)) { + if (!Cmp.isSigned() || + (!C2.shl(*C3).isNegative() && !C1.shl(*C3).isNegative())) + CanFold = true; + } + } + + if (CanFold) { + APInt NewCst = IsShl ? C1.lshr(*C3) : C1.shl(*C3); + APInt SameAsC1 = IsShl ? NewCst.shl(*C3) : NewCst.lshr(*C3); + // Check to see if we are shifting out any of the bits being compared. + if (SameAsC1 != C1) { + // If we shifted bits out, the fold is not going to work out. As a + // special case, check to see if this means that the result is always + // true or false now. + if (Cmp.getPredicate() == ICmpInst::ICMP_EQ) + return replaceInstUsesWith(Cmp, ConstantInt::getFalse(Cmp.getType())); + if (Cmp.getPredicate() == ICmpInst::ICMP_NE) + return replaceInstUsesWith(Cmp, ConstantInt::getTrue(Cmp.getType())); + } else { + Cmp.setOperand(1, ConstantInt::get(And->getType(), NewCst)); + APInt NewAndCst = IsShl ? 
C2.lshr(*C3) : C2.shl(*C3); + And->setOperand(1, ConstantInt::get(And->getType(), NewAndCst)); + And->setOperand(0, Shift->getOperand(0)); + Worklist.Add(Shift); // Shift is dead. + return &Cmp; + } + } + } + + // Turn ((X >> Y) & C2) == 0 into (X & (C2 << Y)) == 0. The latter is + // preferable because it allows the C2 << Y expression to be hoisted out of a + // loop if Y is invariant and X is not. + if (Shift->hasOneUse() && C1.isNullValue() && Cmp.isEquality() && + !Shift->isArithmeticShift() && !isa<Constant>(Shift->getOperand(0))) { + // Compute C2 << Y. + Value *NewShift = + IsShl ? Builder.CreateLShr(And->getOperand(1), Shift->getOperand(1)) + : Builder.CreateShl(And->getOperand(1), Shift->getOperand(1)); + + // Compute X & (C2 << Y). + Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift); + Cmp.setOperand(0, NewAnd); + return &Cmp; + } + + return nullptr; +} + +/// Fold icmp (and X, C2), C1. +Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp, + BinaryOperator *And, + const APInt &C1) { + bool isICMP_NE = Cmp.getPredicate() == ICmpInst::ICMP_NE; + + // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1 + // TODO: We canonicalize to the longer form for scalars because we have + // better analysis/folds for icmp, and codegen may be better with icmp. + if (isICMP_NE && Cmp.getType()->isVectorTy() && C1.isNullValue() && + match(And->getOperand(1), m_One())) + return new TruncInst(And->getOperand(0), Cmp.getType()); + + const APInt *C2; + Value *X; + if (!match(And, m_And(m_Value(X), m_APInt(C2)))) + return nullptr; + + // Don't perform the following transforms if the AND has multiple uses + if (!And->hasOneUse()) + return nullptr; + + if (Cmp.isEquality() && C1.isNullValue()) { + // Restrict this fold to single-use 'and' (PR10267). + // Replace (and X, (1 << size(X)-1) != 0) with X s< 0 + if (C2->isSignMask()) { + Constant *Zero = Constant::getNullValue(X->getType()); + auto NewPred = isICMP_NE ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE; + return new ICmpInst(NewPred, X, Zero); + } + + // Restrict this fold only for single-use 'and' (PR10267). + // ((%x & C) == 0) --> %x u< (-C) iff (-C) is power of two. + if ((~(*C2) + 1).isPowerOf2()) { + Constant *NegBOC = + ConstantExpr::getNeg(cast<Constant>(And->getOperand(1))); + auto NewPred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT; + return new ICmpInst(NewPred, X, NegBOC); + } + } + + // If the LHS is an 'and' of a truncate and we can widen the and/compare to + // the input width without changing the value produced, eliminate the cast: + // + // icmp (and (trunc W), C2), C1 -> icmp (and W, C2'), C1' + // + // We can do this transformation if the constants do not have their sign bits + // set or if it is an equality comparison. Extending a relational comparison + // when we're checking the sign bit would not work. + Value *W; + if (match(And->getOperand(0), m_OneUse(m_Trunc(m_Value(W)))) && + (Cmp.isEquality() || (!C1.isNegative() && !C2->isNegative()))) { + // TODO: Is this a good transform for vectors? Wider types may reduce + // throughput. Should this transform be limited (even for scalars) by using + // shouldChangeType()? 
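+    // Worked example (illustrative; assuming a scalar, single-use and/trunc
+    // pair): e.g.
+    //   icmp ult (and (trunc i64 %w to i32), 240), 96
+    //     -->  icmp ult (and i64 %w, 240), 96
+    // The mask clears every bit above the narrow width, so the widened
+    // compare sees the same value.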
+ if (!Cmp.getType()->isVectorTy()) { + Type *WideType = W->getType(); + unsigned WideScalarBits = WideType->getScalarSizeInBits(); + Constant *ZextC1 = ConstantInt::get(WideType, C1.zext(WideScalarBits)); + Constant *ZextC2 = ConstantInt::get(WideType, C2->zext(WideScalarBits)); + Value *NewAnd = Builder.CreateAnd(W, ZextC2, And->getName()); + return new ICmpInst(Cmp.getPredicate(), NewAnd, ZextC1); + } + } + + if (Instruction *I = foldICmpAndShift(Cmp, And, C1, *C2)) + return I; + + // (icmp pred (and (or (lshr A, B), A), 1), 0) --> + // (icmp pred (and A, (or (shl 1, B), 1), 0)) + // + // iff pred isn't signed + if (!Cmp.isSigned() && C1.isNullValue() && And->getOperand(0)->hasOneUse() && + match(And->getOperand(1), m_One())) { + Constant *One = cast<Constant>(And->getOperand(1)); + Value *Or = And->getOperand(0); + Value *A, *B, *LShr; + if (match(Or, m_Or(m_Value(LShr), m_Value(A))) && + match(LShr, m_LShr(m_Specific(A), m_Value(B)))) { + unsigned UsesRemoved = 0; + if (And->hasOneUse()) + ++UsesRemoved; + if (Or->hasOneUse()) + ++UsesRemoved; + if (LShr->hasOneUse()) + ++UsesRemoved; + + // Compute A & ((1 << B) | 1) + Value *NewOr = nullptr; + if (auto *C = dyn_cast<Constant>(B)) { + if (UsesRemoved >= 1) + NewOr = ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One); + } else { + if (UsesRemoved >= 3) + NewOr = Builder.CreateOr(Builder.CreateShl(One, B, LShr->getName(), + /*HasNUW=*/true), + One, Or->getName()); + } + if (NewOr) { + Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName()); + Cmp.setOperand(0, NewAnd); + return &Cmp; + } + } + } + + return nullptr; +} + +/// Fold icmp (and X, Y), C. +Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp, + BinaryOperator *And, + const APInt &C) { + if (Instruction *I = foldICmpAndConstConst(Cmp, And, C)) + return I; + + // TODO: These all require that Y is constant too, so refactor with the above. + + // Try to optimize things like "A[i] & 42 == 0" to index computations. + Value *X = And->getOperand(0); + Value *Y = And->getOperand(1); + if (auto *LI = dyn_cast<LoadInst>(X)) + if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) + if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) + if (GV->isConstant() && GV->hasDefinitiveInitializer() && + !LI->isVolatile() && isa<ConstantInt>(Y)) { + ConstantInt *C2 = cast<ConstantInt>(Y); + if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, Cmp, C2)) + return Res; + } + + if (!Cmp.isEquality()) + return nullptr; + + // X & -C == -C -> X > u ~C + // X & -C != -C -> X <= u ~C + // iff C is a power of 2 + if (Cmp.getOperand(1) == Y && (-C).isPowerOf2()) { + auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT + : CmpInst::ICMP_ULE; + return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1)))); + } + + // (X & C2) == 0 -> (trunc X) >= 0 + // (X & C2) != 0 -> (trunc X) < 0 + // iff C2 is a power of 2 and it masks the sign bit of a legal integer type. + const APInt *C2; + if (And->hasOneUse() && C.isNullValue() && match(Y, m_APInt(C2))) { + int32_t ExactLogBase2 = C2->exactLogBase2(); + if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) { + Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1); + if (And->getType()->isVectorTy()) + NTy = VectorType::get(NTy, And->getType()->getVectorNumElements()); + Value *Trunc = Builder.CreateTrunc(X, NTy); + auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? 
CmpInst::ICMP_SGE + : CmpInst::ICMP_SLT; + return new ICmpInst(NewPred, Trunc, Constant::getNullValue(NTy)); + } + } + + return nullptr; +} + +/// Fold icmp (or X, Y), C. +Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or, + const APInt &C) { + ICmpInst::Predicate Pred = Cmp.getPredicate(); + if (C.isOneValue()) { + // icmp slt signum(V) 1 --> icmp slt V, 1 + Value *V = nullptr; + if (Pred == ICmpInst::ICMP_SLT && match(Or, m_Signum(m_Value(V)))) + return new ICmpInst(ICmpInst::ICMP_SLT, V, + ConstantInt::get(V->getType(), 1)); + } + + Value *OrOp0 = Or->getOperand(0), *OrOp1 = Or->getOperand(1); + if (Cmp.isEquality() && Cmp.getOperand(1) == OrOp1) { + // X | C == C --> X <=u C + // X | C != C --> X >u C + // iff C+1 is a power of 2 (C is a bitmask of the low bits) + if ((C + 1).isPowerOf2()) { + Pred = (Pred == CmpInst::ICMP_EQ) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT; + return new ICmpInst(Pred, OrOp0, OrOp1); + } + // More general: are all bits outside of a mask constant set or not set? + // X | C == C --> (X & ~C) == 0 + // X | C != C --> (X & ~C) != 0 + if (Or->hasOneUse()) { + Value *A = Builder.CreateAnd(OrOp0, ~C); + return new ICmpInst(Pred, A, ConstantInt::getNullValue(OrOp0->getType())); + } + } + + if (!Cmp.isEquality() || !C.isNullValue() || !Or->hasOneUse()) + return nullptr; + + Value *P, *Q; + if (match(Or, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) { + // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0 + // -> and (icmp eq P, null), (icmp eq Q, null). + Value *CmpP = + Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType())); + Value *CmpQ = + Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType())); + auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or; + return BinaryOperator::Create(BOpc, CmpP, CmpQ); + } + + // Are we using xors to bitwise check for a pair of (in)equalities? Convert to + // a shorter form that has more potential to be folded even further. + Value *X1, *X2, *X3, *X4; + if (match(OrOp0, m_OneUse(m_Xor(m_Value(X1), m_Value(X2)))) && + match(OrOp1, m_OneUse(m_Xor(m_Value(X3), m_Value(X4))))) { + // ((X1 ^ X2) || (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4) + // ((X1 ^ X2) || (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4) + Value *Cmp12 = Builder.CreateICmp(Pred, X1, X2); + Value *Cmp34 = Builder.CreateICmp(Pred, X3, X4); + auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or; + return BinaryOperator::Create(BOpc, Cmp12, Cmp34); + } + + return nullptr; +} + +/// Fold icmp (mul X, Y), C. +Instruction *InstCombiner::foldICmpMulConstant(ICmpInst &Cmp, + BinaryOperator *Mul, + const APInt &C) { + const APInt *MulC; + if (!match(Mul->getOperand(1), m_APInt(MulC))) + return nullptr; + + // If this is a test of the sign bit and the multiply is sign-preserving with + // a constant operand, use the multiply LHS operand instead. + ICmpInst::Predicate Pred = Cmp.getPredicate(); + if (isSignTest(Pred, C) && Mul->hasNoSignedWrap()) { + if (MulC->isNegative()) + Pred = ICmpInst::getSwappedPredicate(Pred); + return new ICmpInst(Pred, Mul->getOperand(0), + Constant::getNullValue(Mul->getType())); + } + + return nullptr; +} + +/// Fold icmp (shl 1, Y), C. 
+static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl, + const APInt &C) { + Value *Y; + if (!match(Shl, m_Shl(m_One(), m_Value(Y)))) + return nullptr; + + Type *ShiftType = Shl->getType(); + unsigned TypeBits = C.getBitWidth(); + bool CIsPowerOf2 = C.isPowerOf2(); + ICmpInst::Predicate Pred = Cmp.getPredicate(); + if (Cmp.isUnsigned()) { + // (1 << Y) pred C -> Y pred Log2(C) + if (!CIsPowerOf2) { + // (1 << Y) < 30 -> Y <= 4 + // (1 << Y) <= 30 -> Y <= 4 + // (1 << Y) >= 30 -> Y > 4 + // (1 << Y) > 30 -> Y > 4 + if (Pred == ICmpInst::ICMP_ULT) + Pred = ICmpInst::ICMP_ULE; + else if (Pred == ICmpInst::ICMP_UGE) + Pred = ICmpInst::ICMP_UGT; + } + + // (1 << Y) >= 2147483648 -> Y >= 31 -> Y == 31 + // (1 << Y) < 2147483648 -> Y < 31 -> Y != 31 + unsigned CLog2 = C.logBase2(); + if (CLog2 == TypeBits - 1) { + if (Pred == ICmpInst::ICMP_UGE) + Pred = ICmpInst::ICMP_EQ; + else if (Pred == ICmpInst::ICMP_ULT) + Pred = ICmpInst::ICMP_NE; + } + return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, CLog2)); + } else if (Cmp.isSigned()) { + Constant *BitWidthMinusOne = ConstantInt::get(ShiftType, TypeBits - 1); + if (C.isAllOnesValue()) { + // (1 << Y) <= -1 -> Y == 31 + if (Pred == ICmpInst::ICMP_SLE) + return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne); + + // (1 << Y) > -1 -> Y != 31 + if (Pred == ICmpInst::ICMP_SGT) + return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne); + } else if (!C) { + // (1 << Y) < 0 -> Y == 31 + // (1 << Y) <= 0 -> Y == 31 + if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) + return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne); + + // (1 << Y) >= 0 -> Y != 31 + // (1 << Y) > 0 -> Y != 31 + if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) + return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne); + } + } else if (Cmp.isEquality() && CIsPowerOf2) { + return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, C.logBase2())); + } + + return nullptr; +} + +/// Fold icmp (shl X, Y), C. +Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp, + BinaryOperator *Shl, + const APInt &C) { + const APInt *ShiftVal; + if (Cmp.isEquality() && match(Shl->getOperand(0), m_APInt(ShiftVal))) + return foldICmpShlConstConst(Cmp, Shl->getOperand(1), C, *ShiftVal); + + const APInt *ShiftAmt; + if (!match(Shl->getOperand(1), m_APInt(ShiftAmt))) + return foldICmpShlOne(Cmp, Shl, C); + + // Check that the shift amount is in range. If not, don't perform undefined + // shifts. When the shift is visited, it will be simplified. + unsigned TypeBits = C.getBitWidth(); + if (ShiftAmt->uge(TypeBits)) + return nullptr; + + ICmpInst::Predicate Pred = Cmp.getPredicate(); + Value *X = Shl->getOperand(0); + Type *ShType = Shl->getType(); + + // NSW guarantees that we are only shifting out sign bits from the high bits, + // so we can ASHR the compare constant without needing a mask and eliminate + // the shift. 
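+  // Worked example (illustrative; assuming i8 operands):
+  //   icmp sgt (shl nsw i8 %x, 2), 20  -->  icmp sgt i8 %x, 5
+  // since 20 ashr 2 == 5 and the nsw shift is a signed multiply by 4 that
+  // cannot overflow.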
+ if (Shl->hasNoSignedWrap()) { + if (Pred == ICmpInst::ICMP_SGT) { + // icmp Pred (shl nsw X, ShiftAmt), C --> icmp Pred X, (C >>s ShiftAmt) + APInt ShiftedC = C.ashr(*ShiftAmt); + return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC)); + } + if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) && + C.ashr(*ShiftAmt).shl(*ShiftAmt) == C) { + APInt ShiftedC = C.ashr(*ShiftAmt); + return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC)); + } + if (Pred == ICmpInst::ICMP_SLT) { + // SLE is the same as above, but SLE is canonicalized to SLT, so convert: + // (X << S) <=s C is equiv to X <=s (C >> S) for all C + // (X << S) <s (C + 1) is equiv to X <s (C >> S) + 1 if C <s SMAX + // (X << S) <s C is equiv to X <s ((C - 1) >> S) + 1 if C >s SMIN + assert(!C.isMinSignedValue() && "Unexpected icmp slt"); + APInt ShiftedC = (C - 1).ashr(*ShiftAmt) + 1; + return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC)); + } + // If this is a signed comparison to 0 and the shift is sign preserving, + // use the shift LHS operand instead; isSignTest may change 'Pred', so only + // do that if we're sure to not continue on in this function. + if (isSignTest(Pred, C)) + return new ICmpInst(Pred, X, Constant::getNullValue(ShType)); + } + + // NUW guarantees that we are only shifting out zero bits from the high bits, + // so we can LSHR the compare constant without needing a mask and eliminate + // the shift. + if (Shl->hasNoUnsignedWrap()) { + if (Pred == ICmpInst::ICMP_UGT) { + // icmp Pred (shl nuw X, ShiftAmt), C --> icmp Pred X, (C >>u ShiftAmt) + APInt ShiftedC = C.lshr(*ShiftAmt); + return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC)); + } + if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) && + C.lshr(*ShiftAmt).shl(*ShiftAmt) == C) { + APInt ShiftedC = C.lshr(*ShiftAmt); + return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC)); + } + if (Pred == ICmpInst::ICMP_ULT) { + // ULE is the same as above, but ULE is canonicalized to ULT, so convert: + // (X << S) <=u C is equiv to X <=u (C >> S) for all C + // (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u + // (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0 + assert(C.ugt(0) && "ult 0 should have been eliminated"); + APInt ShiftedC = (C - 1).lshr(*ShiftAmt) + 1; + return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC)); + } + } + + if (Cmp.isEquality() && Shl->hasOneUse()) { + // Strength-reduce the shift into an 'and'. + Constant *Mask = ConstantInt::get( + ShType, + APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue())); + Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask"); + Constant *LShrC = ConstantInt::get(ShType, C.lshr(*ShiftAmt)); + return new ICmpInst(Pred, And, LShrC); + } + + // Otherwise, if this is a comparison of the sign bit, simplify to and/test. + bool TrueIfSigned = false; + if (Shl->hasOneUse() && isSignBitCheck(Pred, C, TrueIfSigned)) { + // (X << 31) <s 0 --> (X & 1) != 0 + Constant *Mask = ConstantInt::get( + ShType, + APInt::getOneBitSet(TypeBits, TypeBits - ShiftAmt->getZExtValue() - 1)); + Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask"); + return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ, + And, Constant::getNullValue(ShType)); + } + + // Simplify 'shl' inequality test into 'and' equality test. 
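+  // Worked example (illustrative; assuming a single-use i8 shift):
+  //   icmp ugt (shl i8 %x, 3), 7  -->  icmp ne (and i8 %x, 31), 0
+  // %x << 3 is either 0 or at least 8, and it is nonzero exactly when one of
+  // the low 5 bits of %x is set (31 == ~7 lshr 3).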
+ if (Cmp.isUnsigned() && Shl->hasOneUse()) { + // (X l<< C2) u<=/u> C1 iff C1+1 is power of two -> X & (~C1 l>> C2) ==/!= 0 + if ((C + 1).isPowerOf2() && + (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT)) { + Value *And = Builder.CreateAnd(X, (~C).lshr(ShiftAmt->getZExtValue())); + return new ICmpInst(Pred == ICmpInst::ICMP_ULE ? ICmpInst::ICMP_EQ + : ICmpInst::ICMP_NE, + And, Constant::getNullValue(ShType)); + } + // (X l<< C2) u</u>= C1 iff C1 is power of two -> X & (-C1 l>> C2) ==/!= 0 + if (C.isPowerOf2() && + (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) { + Value *And = + Builder.CreateAnd(X, (~(C - 1)).lshr(ShiftAmt->getZExtValue())); + return new ICmpInst(Pred == ICmpInst::ICMP_ULT ? ICmpInst::ICMP_EQ + : ICmpInst::ICMP_NE, + And, Constant::getNullValue(ShType)); + } + } + + // Transform (icmp pred iM (shl iM %v, N), C) + // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (C>>N)) + // Transform the shl to a trunc if (trunc (C>>N)) has no loss and M-N. + // This enables us to get rid of the shift in favor of a trunc that may be + // free on the target. It has the additional benefit of comparing to a + // smaller constant that may be more target-friendly. + unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1); + if (Shl->hasOneUse() && Amt != 0 && C.countTrailingZeros() >= Amt && + DL.isLegalInteger(TypeBits - Amt)) { + Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt); + if (ShType->isVectorTy()) + TruncTy = VectorType::get(TruncTy, ShType->getVectorNumElements()); + Constant *NewC = + ConstantInt::get(TruncTy, C.ashr(*ShiftAmt).trunc(TypeBits - Amt)); + return new ICmpInst(Pred, Builder.CreateTrunc(X, TruncTy), NewC); + } + + return nullptr; +} + +/// Fold icmp ({al}shr X, Y), C. +Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp, + BinaryOperator *Shr, + const APInt &C) { + // An exact shr only shifts out zero bits, so: + // icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0 + Value *X = Shr->getOperand(0); + CmpInst::Predicate Pred = Cmp.getPredicate(); + if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() && + C.isNullValue()) + return new ICmpInst(Pred, X, Cmp.getOperand(1)); + + const APInt *ShiftVal; + if (Cmp.isEquality() && match(Shr->getOperand(0), m_APInt(ShiftVal))) + return foldICmpShrConstConst(Cmp, Shr->getOperand(1), C, *ShiftVal); + + const APInt *ShiftAmt; + if (!match(Shr->getOperand(1), m_APInt(ShiftAmt))) + return nullptr; + + // Check that the shift amount is in range. If not, don't perform undefined + // shifts. When the shift is visited it will be simplified. + unsigned TypeBits = C.getBitWidth(); + unsigned ShAmtVal = ShiftAmt->getLimitedValue(TypeBits); + if (ShAmtVal >= TypeBits || ShAmtVal == 0) + return nullptr; + + bool IsAShr = Shr->getOpcode() == Instruction::AShr; + bool IsExact = Shr->isExact(); + Type *ShrTy = Shr->getType(); + // TODO: If we could guarantee that InstSimplify would handle all of the + // constant-value-based preconditions in the folds below, then we could assert + // those conditions rather than checking them. This is difficult because of + // undef/poison (PR34838). 
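+  // Worked example (illustrative; assuming i8 operands): the ashr case below
+  // gives
+  //   icmp slt (ashr i8 %x, 2), 3  -->  icmp slt i8 %x, 12
+  // because 3 << 2 == 12 survives the round-trip check (12 ashr 2 == 3).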
+ if (IsAShr) { + if (Pred == CmpInst::ICMP_SLT || (Pred == CmpInst::ICMP_SGT && IsExact)) { + // icmp slt (ashr X, ShAmtC), C --> icmp slt X, (C << ShAmtC) + // icmp sgt (ashr exact X, ShAmtC), C --> icmp sgt X, (C << ShAmtC) + APInt ShiftedC = C.shl(ShAmtVal); + if (ShiftedC.ashr(ShAmtVal) == C) + return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC)); + } + if (Pred == CmpInst::ICMP_SGT) { + // icmp sgt (ashr X, ShAmtC), C --> icmp sgt X, ((C + 1) << ShAmtC) - 1 + APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1; + if (!C.isMaxSignedValue() && !(C + 1).shl(ShAmtVal).isMinSignedValue() && + (ShiftedC + 1).ashr(ShAmtVal) == (C + 1)) + return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC)); + } + } else { + if (Pred == CmpInst::ICMP_ULT || (Pred == CmpInst::ICMP_UGT && IsExact)) { + // icmp ult (lshr X, ShAmtC), C --> icmp ult X, (C << ShAmtC) + // icmp ugt (lshr exact X, ShAmtC), C --> icmp ugt X, (C << ShAmtC) + APInt ShiftedC = C.shl(ShAmtVal); + if (ShiftedC.lshr(ShAmtVal) == C) + return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC)); + } + if (Pred == CmpInst::ICMP_UGT) { + // icmp ugt (lshr X, ShAmtC), C --> icmp ugt X, ((C + 1) << ShAmtC) - 1 + APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1; + if ((ShiftedC + 1).lshr(ShAmtVal) == (C + 1)) + return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC)); + } + } + + if (!Cmp.isEquality()) + return nullptr; + + // Handle equality comparisons of shift-by-constant. + + // If the comparison constant changes with the shift, the comparison cannot + // succeed (bits of the comparison constant cannot match the shifted value). + // This should be known by InstSimplify and already be folded to true/false. + assert(((IsAShr && C.shl(ShAmtVal).ashr(ShAmtVal) == C) || + (!IsAShr && C.shl(ShAmtVal).lshr(ShAmtVal) == C)) && + "Expected icmp+shr simplify did not occur."); + + // If the bits shifted out are known zero, compare the unshifted value: + // (X & 4) >> 1 == 2 --> (X & 4) == 4. + if (Shr->isExact()) + return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, C << ShAmtVal)); + + if (Shr->hasOneUse()) { + // Canonicalize the shift into an 'and': + // icmp eq/ne (shr X, ShAmt), C --> icmp eq/ne (and X, HiMask), (C << ShAmt) + APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal)); + Constant *Mask = ConstantInt::get(ShrTy, Val); + Value *And = Builder.CreateAnd(X, Mask, Shr->getName() + ".mask"); + return new ICmpInst(Pred, And, ConstantInt::get(ShrTy, C << ShAmtVal)); + } + + return nullptr; +} + +Instruction *InstCombiner::foldICmpSRemConstant(ICmpInst &Cmp, + BinaryOperator *SRem, + const APInt &C) { + // Match an 'is positive' or 'is negative' comparison of remainder by a + // constant power-of-2 value: + // (X % pow2C) sgt/slt 0 + const ICmpInst::Predicate Pred = Cmp.getPredicate(); + if (Pred != ICmpInst::ICMP_SGT && Pred != ICmpInst::ICMP_SLT) + return nullptr; + + // TODO: The one-use check is standard because we do not typically want to + // create longer instruction sequences, but this might be a special-case + // because srem is not good for analysis or codegen. + if (!SRem->hasOneUse()) + return nullptr; + + const APInt *DivisorC; + if (!C.isNullValue() || !match(SRem->getOperand(1), m_Power2(DivisorC))) + return nullptr; + + // Mask off the sign bit and the modulo bits (low-bits). 
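+  // Rationale (illustrative annotation): X srem pow2C is zero exactly when
+  // the low log2(pow2C) bits of X are zero, and otherwise has the sign of X.
+  // So testing the sign bit of X together with those low bits, as the mask
+  // built below does, decides the sign of the remainder.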
+ Type *Ty = SRem->getType(); + APInt SignMask = APInt::getSignMask(Ty->getScalarSizeInBits()); + Constant *MaskC = ConstantInt::get(Ty, SignMask | (*DivisorC - 1)); + Value *And = Builder.CreateAnd(SRem->getOperand(0), MaskC); + + // For 'is positive?' check that the sign-bit is clear and at least 1 masked + // bit is set. Example: + // (i8 X % 32) s> 0 --> (X & 159) s> 0 + if (Pred == ICmpInst::ICMP_SGT) + return new ICmpInst(ICmpInst::ICMP_SGT, And, ConstantInt::getNullValue(Ty)); + + // For 'is negative?' check that the sign-bit is set and at least 1 masked + // bit is set. Example: + // (i16 X % 4) s< 0 --> (X & 32771) u> 32768 + return new ICmpInst(ICmpInst::ICMP_UGT, And, ConstantInt::get(Ty, SignMask)); +} + +/// Fold icmp (udiv X, Y), C. +Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp, + BinaryOperator *UDiv, + const APInt &C) { + const APInt *C2; + if (!match(UDiv->getOperand(0), m_APInt(C2))) + return nullptr; + + assert(*C2 != 0 && "udiv 0, X should have been simplified already."); + + // (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1)) + Value *Y = UDiv->getOperand(1); + if (Cmp.getPredicate() == ICmpInst::ICMP_UGT) { + assert(!C.isMaxValue() && + "icmp ugt X, UINT_MAX should have been simplified already."); + return new ICmpInst(ICmpInst::ICMP_ULE, Y, + ConstantInt::get(Y->getType(), C2->udiv(C + 1))); + } + + // (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C) + if (Cmp.getPredicate() == ICmpInst::ICMP_ULT) { + assert(C != 0 && "icmp ult X, 0 should have been simplified already."); + return new ICmpInst(ICmpInst::ICMP_UGT, Y, + ConstantInt::get(Y->getType(), C2->udiv(C))); + } + + return nullptr; +} + +/// Fold icmp ({su}div X, Y), C. +Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp, + BinaryOperator *Div, + const APInt &C) { + // Fold: icmp pred ([us]div X, C2), C -> range test + // Fold this div into the comparison, producing a range check. + // Determine, based on the divide type, what the range is being + // checked. If there is an overflow on the low or high side, remember + // it, otherwise compute the range [low, hi) bounding the new value. + // See: InsertRangeTest above for the kinds of replacements possible. + const APInt *C2; + if (!match(Div->getOperand(1), m_APInt(C2))) + return nullptr; + + // FIXME: If the operand types don't match the type of the divide + // then don't attempt this transform. The code below doesn't have the + // logic to deal with a signed divide and an unsigned compare (and + // vice versa). This is because (x /s C2) <s C produces different + // results than (x /s C2) <u C or (x /u C2) <s C or even + // (x /u C2) <u C. Simply casting the operands and result won't + // work. :( The if statement below tests that condition and bails + // if it finds it. + bool DivIsSigned = Div->getOpcode() == Instruction::SDiv; + if (!Cmp.isEquality() && DivIsSigned != Cmp.isSigned()) + return nullptr; + + // The ProdOV computation fails on divide by 0 and divide by -1. Cases with + // INT_MIN will also fail if the divisor is 1. Although folds of all these + // division-by-constant cases should be present, we can not assert that they + // have happened before we reach this icmp instruction. + if (C2->isNullValue() || C2->isOneValue() || + (DivIsSigned && C2->isAllOnesValue())) + return nullptr; + + // Compute Prod = C * C2. We are essentially solving an equation of + // form X / C2 = C. We solve for X by multiplying C2 and C. + // By solving for X, we can turn this into a range check instead of computing + // a divide. 
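+  // Worked example (illustrative; assuming i8 operands):
+  //   icmp ult (udiv i8 %x, 5), 3  -->  icmp ult i8 %x, 15
+  // because quotient 3 corresponds to the interval [15, 20), and everything
+  // below that interval has a smaller quotient.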
+ APInt Prod = C * *C2; + + // Determine if the product overflows by seeing if the product is not equal to + // the divide. Make sure we do the same kind of divide as in the LHS + // instruction that we're folding. + bool ProdOV = (DivIsSigned ? Prod.sdiv(*C2) : Prod.udiv(*C2)) != C; + + ICmpInst::Predicate Pred = Cmp.getPredicate(); + + // If the division is known to be exact, then there is no remainder from the + // divide, so the covered range size is unit, otherwise it is the divisor. + APInt RangeSize = Div->isExact() ? APInt(C2->getBitWidth(), 1) : *C2; + + // Figure out the interval that is being checked. For example, a comparison + // like "X /u 5 == 0" is really checking that X is in the interval [0, 5). + // Compute this interval based on the constants involved and the signedness of + // the compare/divide. This computes a half-open interval, keeping track of + // whether either value in the interval overflows. After analysis each + // overflow variable is set to 0 if it's corresponding bound variable is valid + // -1 if overflowed off the bottom end, or +1 if overflowed off the top end. + int LoOverflow = 0, HiOverflow = 0; + APInt LoBound, HiBound; + + if (!DivIsSigned) { // udiv + // e.g. X/5 op 3 --> [15, 20) + LoBound = Prod; + HiOverflow = LoOverflow = ProdOV; + if (!HiOverflow) { + // If this is not an exact divide, then many values in the range collapse + // to the same result value. + HiOverflow = addWithOverflow(HiBound, LoBound, RangeSize, false); + } + } else if (C2->isStrictlyPositive()) { // Divisor is > 0. + if (C.isNullValue()) { // (X / pos) op 0 + // Can't overflow. e.g. X/2 op 0 --> [-1, 2) + LoBound = -(RangeSize - 1); + HiBound = RangeSize; + } else if (C.isStrictlyPositive()) { // (X / pos) op pos + LoBound = Prod; // e.g. X/5 op 3 --> [15, 20) + HiOverflow = LoOverflow = ProdOV; + if (!HiOverflow) + HiOverflow = addWithOverflow(HiBound, Prod, RangeSize, true); + } else { // (X / pos) op neg + // e.g. X/5 op -3 --> [-15-4, -15+1) --> [-19, -14) + HiBound = Prod + 1; + LoOverflow = HiOverflow = ProdOV ? -1 : 0; + if (!LoOverflow) { + APInt DivNeg = -RangeSize; + LoOverflow = addWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0; + } + } + } else if (C2->isNegative()) { // Divisor is < 0. + if (Div->isExact()) + RangeSize.negate(); + if (C.isNullValue()) { // (X / neg) op 0 + // e.g. X/-5 op 0 --> [-4, 5) + LoBound = RangeSize + 1; + HiBound = -RangeSize; + if (HiBound == *C2) { // -INTMIN = INTMIN + HiOverflow = 1; // [INTMIN+1, overflow) + HiBound = APInt(); // e.g. X/INTMIN = 0 --> X > INTMIN + } + } else if (C.isStrictlyPositive()) { // (X / neg) op pos + // e.g. X/-5 op 3 --> [-19, -14) + HiBound = Prod + 1; + HiOverflow = LoOverflow = ProdOV ? -1 : 0; + if (!LoOverflow) + LoOverflow = addWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0; + } else { // (X / neg) op neg + LoBound = Prod; // e.g. X/-5 op -3 --> [15, 20) + LoOverflow = HiOverflow = ProdOV; + if (!HiOverflow) + HiOverflow = subWithOverflow(HiBound, Prod, RangeSize, true); + } + + // Dividing by a negative swaps the condition. LT <-> GT + Pred = ICmpInst::getSwappedPredicate(Pred); + } + + Value *X = Div->getOperand(0); + switch (Pred) { + default: llvm_unreachable("Unhandled icmp opcode!"); + case ICmpInst::ICMP_EQ: + if (LoOverflow && HiOverflow) + return replaceInstUsesWith(Cmp, Builder.getFalse()); + if (HiOverflow) + return new ICmpInst(DivIsSigned ? 
ICmpInst::ICMP_SGE : + ICmpInst::ICMP_UGE, X, + ConstantInt::get(Div->getType(), LoBound)); + if (LoOverflow) + return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : + ICmpInst::ICMP_ULT, X, + ConstantInt::get(Div->getType(), HiBound)); + return replaceInstUsesWith( + Cmp, insertRangeTest(X, LoBound, HiBound, DivIsSigned, true)); + case ICmpInst::ICMP_NE: + if (LoOverflow && HiOverflow) + return replaceInstUsesWith(Cmp, Builder.getTrue()); + if (HiOverflow) + return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : + ICmpInst::ICMP_ULT, X, + ConstantInt::get(Div->getType(), LoBound)); + if (LoOverflow) + return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE : + ICmpInst::ICMP_UGE, X, + ConstantInt::get(Div->getType(), HiBound)); + return replaceInstUsesWith(Cmp, + insertRangeTest(X, LoBound, HiBound, + DivIsSigned, false)); + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_SLT: + if (LoOverflow == +1) // Low bound is greater than input range. + return replaceInstUsesWith(Cmp, Builder.getTrue()); + if (LoOverflow == -1) // Low bound is less than input range. + return replaceInstUsesWith(Cmp, Builder.getFalse()); + return new ICmpInst(Pred, X, ConstantInt::get(Div->getType(), LoBound)); + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_SGT: + if (HiOverflow == +1) // High bound greater than input range. + return replaceInstUsesWith(Cmp, Builder.getFalse()); + if (HiOverflow == -1) // High bound less than input range. + return replaceInstUsesWith(Cmp, Builder.getTrue()); + if (Pred == ICmpInst::ICMP_UGT) + return new ICmpInst(ICmpInst::ICMP_UGE, X, + ConstantInt::get(Div->getType(), HiBound)); + return new ICmpInst(ICmpInst::ICMP_SGE, X, + ConstantInt::get(Div->getType(), HiBound)); + } + + return nullptr; +} + +/// Fold icmp (sub X, Y), C. +Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp, + BinaryOperator *Sub, + const APInt &C) { + Value *X = Sub->getOperand(0), *Y = Sub->getOperand(1); + ICmpInst::Predicate Pred = Cmp.getPredicate(); + const APInt *C2; + APInt SubResult; + + // icmp eq/ne (sub C, Y), C -> icmp eq/ne Y, 0 + if (match(X, m_APInt(C2)) && *C2 == C && Cmp.isEquality()) + return new ICmpInst(Cmp.getPredicate(), Y, + ConstantInt::get(Y->getType(), 0)); + + // (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C) + if (match(X, m_APInt(C2)) && + ((Cmp.isUnsigned() && Sub->hasNoUnsignedWrap()) || + (Cmp.isSigned() && Sub->hasNoSignedWrap())) && + !subWithOverflow(SubResult, *C2, C, Cmp.isSigned())) + return new ICmpInst(Cmp.getSwappedPredicate(), Y, + ConstantInt::get(Y->getType(), SubResult)); + + // The following transforms are only worth it if the only user of the subtract + // is the icmp. 
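+  // Worked example (illustrative; assuming i8 operands): one of the folds
+  // below is
+  //   icmp ult (sub i8 7, %y), 4  -->  icmp eq (or i8 %y, 3), 7
+  // since 7 - %y lands in [0, 4) exactly for %y in {4, 5, 6, 7}, i.e. when
+  // or'ing in the low two bits of %y yields 7.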
+ if (!Sub->hasOneUse()) + return nullptr; + + if (Sub->hasNoSignedWrap()) { + // (icmp sgt (sub nsw X, Y), -1) -> (icmp sge X, Y) + if (Pred == ICmpInst::ICMP_SGT && C.isAllOnesValue()) + return new ICmpInst(ICmpInst::ICMP_SGE, X, Y); + + // (icmp sgt (sub nsw X, Y), 0) -> (icmp sgt X, Y) + if (Pred == ICmpInst::ICMP_SGT && C.isNullValue()) + return new ICmpInst(ICmpInst::ICMP_SGT, X, Y); + + // (icmp slt (sub nsw X, Y), 0) -> (icmp slt X, Y) + if (Pred == ICmpInst::ICMP_SLT && C.isNullValue()) + return new ICmpInst(ICmpInst::ICMP_SLT, X, Y); + + // (icmp slt (sub nsw X, Y), 1) -> (icmp sle X, Y) + if (Pred == ICmpInst::ICMP_SLT && C.isOneValue()) + return new ICmpInst(ICmpInst::ICMP_SLE, X, Y); + } + + if (!match(X, m_APInt(C2))) + return nullptr; + + // C2 - Y <u C -> (Y | (C - 1)) == C2 + // iff (C2 & (C - 1)) == C - 1 and C is a power of 2 + if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() && + (*C2 & (C - 1)) == (C - 1)) + return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateOr(Y, C - 1), X); + + // C2 - Y >u C -> (Y | C) != C2 + // iff C2 & C == C and C + 1 is a power of 2 + if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == C) + return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateOr(Y, C), X); + + return nullptr; +} + +/// Fold icmp (add X, Y), C. +Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, + BinaryOperator *Add, + const APInt &C) { + Value *Y = Add->getOperand(1); + const APInt *C2; + if (Cmp.isEquality() || !match(Y, m_APInt(C2))) + return nullptr; + + // Fold icmp pred (add X, C2), C. + Value *X = Add->getOperand(0); + Type *Ty = Add->getType(); + CmpInst::Predicate Pred = Cmp.getPredicate(); + + if (!Add->hasOneUse()) + return nullptr; + + // If the add does not wrap, we can always adjust the compare by subtracting + // the constants. Equality comparisons are handled elsewhere. SGE/SLE/UGE/ULE + // are canonicalized to SGT/SLT/UGT/ULT. + if ((Add->hasNoSignedWrap() && + (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT)) || + (Add->hasNoUnsignedWrap() && + (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT))) { + bool Overflow; + APInt NewC = + Cmp.isSigned() ? C.ssub_ov(*C2, Overflow) : C.usub_ov(*C2, Overflow); + // If there is overflow, the result must be true or false. + // TODO: Can we assert there is no overflow because InstSimplify always + // handles those cases? 
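+    // Worked example (illustrative; assuming a single-use i8 add):
+    //   icmp ugt (add nuw i8 %x, 5), 20  -->  icmp ugt i8 %x, 15
+    // because the non-wrapping add lets us subtract 5 from both sides.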
+ if (!Overflow) + // icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2) + return new ICmpInst(Pred, X, ConstantInt::get(Ty, NewC)); + } + + auto CR = ConstantRange::makeExactICmpRegion(Pred, C).subtract(*C2); + const APInt &Upper = CR.getUpper(); + const APInt &Lower = CR.getLower(); + if (Cmp.isSigned()) { + if (Lower.isSignMask()) + return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantInt::get(Ty, Upper)); + if (Upper.isSignMask()) + return new ICmpInst(ICmpInst::ICMP_SGE, X, ConstantInt::get(Ty, Lower)); + } else { + if (Lower.isMinValue()) + return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantInt::get(Ty, Upper)); + if (Upper.isMinValue()) + return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower)); + } + + // X+C <u C2 -> (X & -C2) == C + // iff C & (C2-1) == 0 + // C2 is a power of 2 + if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() && (*C2 & (C - 1)) == 0) + return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateAnd(X, -C), + ConstantExpr::getNeg(cast<Constant>(Y))); + + // X+C >u C2 -> (X & ~C2) != C + // iff C & C2 == 0 + // C2+1 is a power of 2 + if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == 0) + return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateAnd(X, ~C), + ConstantExpr::getNeg(cast<Constant>(Y))); + + return nullptr; +} + +bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, + Value *&RHS, ConstantInt *&Less, + ConstantInt *&Equal, + ConstantInt *&Greater) { + // TODO: Generalize this to work with other comparison idioms or ensure + // they get canonicalized into this form. + + // select i1 (a == b), + // i32 Equal, + // i32 (select i1 (a < b), i32 Less, i32 Greater) + // where Equal, Less and Greater are placeholders for any three constants. + ICmpInst::Predicate PredA; + if (!match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) || + !ICmpInst::isEquality(PredA)) + return false; + Value *EqualVal = SI->getTrueValue(); + Value *UnequalVal = SI->getFalseValue(); + // We still can get non-canonical predicate here, so canonicalize. + if (PredA == ICmpInst::ICMP_NE) + std::swap(EqualVal, UnequalVal); + if (!match(EqualVal, m_ConstantInt(Equal))) + return false; + ICmpInst::Predicate PredB; + Value *LHS2, *RHS2; + if (!match(UnequalVal, m_Select(m_ICmp(PredB, m_Value(LHS2), m_Value(RHS2)), + m_ConstantInt(Less), m_ConstantInt(Greater)))) + return false; + // We can get predicate mismatch here, so canonicalize if possible: + // First, ensure that 'LHS' match. + if (LHS2 != LHS) { + // x sgt y <--> y slt x + std::swap(LHS2, RHS2); + PredB = ICmpInst::getSwappedPredicate(PredB); + } + if (LHS2 != LHS) + return false; + // We also need to canonicalize 'RHS'. + if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) { + // x sgt C-1 <--> x sge C <--> not(x slt C) + auto FlippedStrictness = + getFlippedStrictnessPredicateAndConstant(PredB, cast<Constant>(RHS2)); + if (!FlippedStrictness) + return false; + assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check"); + RHS2 = FlippedStrictness->second; + // And kind-of perform the result swap. + std::swap(Less, Greater); + PredB = ICmpInst::ICMP_SLT; + } + return PredB == ICmpInst::ICMP_SLT && RHS == RHS2; +} + +Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp, + SelectInst *Select, + ConstantInt *C) { + + assert(C && "Cmp RHS should be a constant int!"); + // If we're testing a constant value against the result of a three way + // comparison, the result can be expressed directly in terms of the + // original values being compared. 
Note: We could possibly be more + // aggressive here and remove the hasOneUse test. The original select is + // really likely to simplify or sink when we remove a test of the result. + Value *OrigLHS, *OrigRHS; + ConstantInt *C1LessThan, *C2Equal, *C3GreaterThan; + if (Cmp.hasOneUse() && + matchThreeWayIntCompare(Select, OrigLHS, OrigRHS, C1LessThan, C2Equal, + C3GreaterThan)) { + assert(C1LessThan && C2Equal && C3GreaterThan); + + bool TrueWhenLessThan = + ConstantExpr::getCompare(Cmp.getPredicate(), C1LessThan, C) + ->isAllOnesValue(); + bool TrueWhenEqual = + ConstantExpr::getCompare(Cmp.getPredicate(), C2Equal, C) + ->isAllOnesValue(); + bool TrueWhenGreaterThan = + ConstantExpr::getCompare(Cmp.getPredicate(), C3GreaterThan, C) + ->isAllOnesValue(); + + // This generates the new instruction that will replace the original Cmp + // Instruction. Instead of enumerating the various combinations when + // TrueWhenLessThan, TrueWhenEqual and TrueWhenGreaterThan are true versus + // false, we rely on chaining of ORs and future passes of InstCombine to + // simplify the OR further (i.e. a s< b || a == b becomes a s<= b). + + // When none of the three constants satisfy the predicate for the RHS (C), + // the entire original Cmp can be simplified to a false. + Value *Cond = Builder.getFalse(); + if (TrueWhenLessThan) + Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SLT, + OrigLHS, OrigRHS)); + if (TrueWhenEqual) + Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_EQ, + OrigLHS, OrigRHS)); + if (TrueWhenGreaterThan) + Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SGT, + OrigLHS, OrigRHS)); + + return replaceInstUsesWith(Cmp, Cond); + } + return nullptr; +} + +static Instruction *foldICmpBitCast(ICmpInst &Cmp, + InstCombiner::BuilderTy &Builder) { + auto *Bitcast = dyn_cast<BitCastInst>(Cmp.getOperand(0)); + if (!Bitcast) + return nullptr; + + ICmpInst::Predicate Pred = Cmp.getPredicate(); + Value *Op1 = Cmp.getOperand(1); + Value *BCSrcOp = Bitcast->getOperand(0); + + // Make sure the bitcast doesn't change the number of vector elements. + if (Bitcast->getSrcTy()->getScalarSizeInBits() == + Bitcast->getDestTy()->getScalarSizeInBits()) { + // Zero-equality and sign-bit checks are preserved through sitofp + bitcast. 
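+    // Rationale (illustrative annotation): sitofp maps 0 to +0.0 and keeps
+    // the sign of any nonzero input, and a same-width bitcast keeps the bit
+    // pattern, so eq/ne/slt/sgt tests against zero see the same answer on the
+    // original integer.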
+ Value *X; + if (match(BCSrcOp, m_SIToFP(m_Value(X)))) { + // icmp eq (bitcast (sitofp X)), 0 --> icmp eq X, 0 + // icmp ne (bitcast (sitofp X)), 0 --> icmp ne X, 0 + // icmp slt (bitcast (sitofp X)), 0 --> icmp slt X, 0 + // icmp sgt (bitcast (sitofp X)), 0 --> icmp sgt X, 0 + if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_SLT || + Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT) && + match(Op1, m_Zero())) + return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType())); + + // icmp slt (bitcast (sitofp X)), 1 --> icmp slt X, 1 + if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_One())) + return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), 1)); + + // icmp sgt (bitcast (sitofp X)), -1 --> icmp sgt X, -1 + if (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes())) + return new ICmpInst(Pred, X, + ConstantInt::getAllOnesValue(X->getType())); + } + + // Zero-equality checks are preserved through unsigned floating-point casts: + // icmp eq (bitcast (uitofp X)), 0 --> icmp eq X, 0 + // icmp ne (bitcast (uitofp X)), 0 --> icmp ne X, 0 + if (match(BCSrcOp, m_UIToFP(m_Value(X)))) + if (Cmp.isEquality() && match(Op1, m_Zero())) + return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType())); + } + + // Test to see if the operands of the icmp are casted versions of other + // values. If the ptr->ptr cast can be stripped off both arguments, do so. + if (Bitcast->getType()->isPointerTy() && + (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) { + // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast + // so eliminate it as well. + if (auto *BC2 = dyn_cast<BitCastInst>(Op1)) + Op1 = BC2->getOperand(0); + + Op1 = Builder.CreateBitCast(Op1, BCSrcOp->getType()); + return new ICmpInst(Pred, BCSrcOp, Op1); + } + + // Folding: icmp <pred> iN X, C + // where X = bitcast <M x iK> (shufflevector <M x iK> %vec, undef, SC)) to iN + // and C is a splat of a K-bit pattern + // and SC is a constant vector = <C', C', C', ..., C'> + // Into: + // %E = extractelement <M x iK> %vec, i32 C' + // icmp <pred> iK %E, trunc(C) + const APInt *C; + if (!match(Cmp.getOperand(1), m_APInt(C)) || + !Bitcast->getType()->isIntegerTy() || + !Bitcast->getSrcTy()->isIntOrIntVectorTy()) + return nullptr; + + Value *Vec; + Constant *Mask; + if (match(BCSrcOp, + m_ShuffleVector(m_Value(Vec), m_Undef(), m_Constant(Mask)))) { + // Check whether every element of Mask is the same constant + if (auto *Elem = dyn_cast_or_null<ConstantInt>(Mask->getSplatValue())) { + auto *VecTy = cast<VectorType>(BCSrcOp->getType()); + auto *EltTy = cast<IntegerType>(VecTy->getElementType()); + if (C->isSplat(EltTy->getBitWidth())) { + // Fold the icmp based on the value of C + // If C is M copies of an iK sized bit pattern, + // then: + // => %E = extractelement <N x iK> %vec, i32 Elem + // icmp <pred> iK %SplatVal, <pattern> + Value *Extract = Builder.CreateExtractElement(Vec, Elem); + Value *NewC = ConstantInt::get(EltTy, C->trunc(EltTy->getBitWidth())); + return new ICmpInst(Pred, Extract, NewC); + } + } + } + return nullptr; +} + +/// Try to fold integer comparisons with a constant operand: icmp Pred X, C +/// where X is some kind of instruction. 
+Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) { + const APInt *C; + if (!match(Cmp.getOperand(1), m_APInt(C))) + return nullptr; + + if (auto *BO = dyn_cast<BinaryOperator>(Cmp.getOperand(0))) { + switch (BO->getOpcode()) { + case Instruction::Xor: + if (Instruction *I = foldICmpXorConstant(Cmp, BO, *C)) + return I; + break; + case Instruction::And: + if (Instruction *I = foldICmpAndConstant(Cmp, BO, *C)) + return I; + break; + case Instruction::Or: + if (Instruction *I = foldICmpOrConstant(Cmp, BO, *C)) + return I; + break; + case Instruction::Mul: + if (Instruction *I = foldICmpMulConstant(Cmp, BO, *C)) + return I; + break; + case Instruction::Shl: + if (Instruction *I = foldICmpShlConstant(Cmp, BO, *C)) + return I; + break; + case Instruction::LShr: + case Instruction::AShr: + if (Instruction *I = foldICmpShrConstant(Cmp, BO, *C)) + return I; + break; + case Instruction::SRem: + if (Instruction *I = foldICmpSRemConstant(Cmp, BO, *C)) + return I; + break; + case Instruction::UDiv: + if (Instruction *I = foldICmpUDivConstant(Cmp, BO, *C)) + return I; + LLVM_FALLTHROUGH; + case Instruction::SDiv: + if (Instruction *I = foldICmpDivConstant(Cmp, BO, *C)) + return I; + break; + case Instruction::Sub: + if (Instruction *I = foldICmpSubConstant(Cmp, BO, *C)) + return I; + break; + case Instruction::Add: + if (Instruction *I = foldICmpAddConstant(Cmp, BO, *C)) + return I; + break; + default: + break; + } + // TODO: These folds could be refactored to be part of the above calls. + if (Instruction *I = foldICmpBinOpEqualityWithConstant(Cmp, BO, *C)) + return I; + } + + // Match against CmpInst LHS being instructions other than binary operators. + + if (auto *SI = dyn_cast<SelectInst>(Cmp.getOperand(0))) { + // For now, we only support constant integers while folding the + // ICMP(SELECT)) pattern. We can extend this to support vector of integers + // similar to the cases handled by binary ops above. + if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1))) + if (Instruction *I = foldICmpSelectConstant(Cmp, SI, ConstRHS)) + return I; + } + + if (auto *TI = dyn_cast<TruncInst>(Cmp.getOperand(0))) { + if (Instruction *I = foldICmpTruncConstant(Cmp, TI, *C)) + return I; + } + + if (auto *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0))) + if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, II, *C)) + return I; + + return nullptr; +} + +/// Fold an icmp equality instruction with binary operator LHS and constant RHS: +/// icmp eq/ne BO, C. +Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp, + BinaryOperator *BO, + const APInt &C) { + // TODO: Some of these folds could work with arbitrary constants, but this + // function is limited to scalar and vector splat constants. + if (!Cmp.isEquality()) + return nullptr; + + ICmpInst::Predicate Pred = Cmp.getPredicate(); + bool isICMP_NE = Pred == ICmpInst::ICMP_NE; + Constant *RHS = cast<Constant>(Cmp.getOperand(1)); + Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1); + + switch (BO->getOpcode()) { + case Instruction::SRem: + // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one. + if (C.isNullValue() && BO->hasOneUse()) { + const APInt *BOC; + if (match(BOp1, m_APInt(BOC)) && BOC->sgt(1) && BOC->isPowerOf2()) { + Value *NewRem = Builder.CreateURem(BOp0, BOp1, BO->getName()); + return new ICmpInst(Pred, NewRem, + Constant::getNullValue(BO->getType())); + } + } + break; + case Instruction::Add: { + // Replace ((add A, B) != C) with (A != C-B) if B & C are constants. 
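+    // Worked example (illustrative): icmp ne (add i8 %a, 3), 10
+    //   -->  icmp ne i8 %a, 7   (equality is unaffected by wrapping).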
+ const APInt *BOC; + if (match(BOp1, m_APInt(BOC))) { + if (BO->hasOneUse()) { + Constant *SubC = ConstantExpr::getSub(RHS, cast<Constant>(BOp1)); + return new ICmpInst(Pred, BOp0, SubC); + } + } else if (C.isNullValue()) { + // Replace ((add A, B) != 0) with (A != -B) if A or B is + // efficiently invertible, or if the add has just this one use. + if (Value *NegVal = dyn_castNegVal(BOp1)) + return new ICmpInst(Pred, BOp0, NegVal); + if (Value *NegVal = dyn_castNegVal(BOp0)) + return new ICmpInst(Pred, NegVal, BOp1); + if (BO->hasOneUse()) { + Value *Neg = Builder.CreateNeg(BOp1); + Neg->takeName(BO); + return new ICmpInst(Pred, BOp0, Neg); + } + } + break; + } + case Instruction::Xor: + if (BO->hasOneUse()) { + if (Constant *BOC = dyn_cast<Constant>(BOp1)) { + // For the xor case, we can xor two constants together, eliminating + // the explicit xor. + return new ICmpInst(Pred, BOp0, ConstantExpr::getXor(RHS, BOC)); + } else if (C.isNullValue()) { + // Replace ((xor A, B) != 0) with (A != B) + return new ICmpInst(Pred, BOp0, BOp1); + } + } + break; + case Instruction::Sub: + if (BO->hasOneUse()) { + const APInt *BOC; + if (match(BOp0, m_APInt(BOC))) { + // Replace ((sub BOC, B) != C) with (B != BOC-C). + Constant *SubC = ConstantExpr::getSub(cast<Constant>(BOp0), RHS); + return new ICmpInst(Pred, BOp1, SubC); + } else if (C.isNullValue()) { + // Replace ((sub A, B) != 0) with (A != B). + return new ICmpInst(Pred, BOp0, BOp1); + } + } + break; + case Instruction::Or: { + const APInt *BOC; + if (match(BOp1, m_APInt(BOC)) && BO->hasOneUse() && RHS->isAllOnesValue()) { + // Comparing if all bits outside of a constant mask are set? + // Replace (X | C) == -1 with (X & ~C) == ~C. + // This removes the -1 constant. + Constant *NotBOC = ConstantExpr::getNot(cast<Constant>(BOp1)); + Value *And = Builder.CreateAnd(BOp0, NotBOC); + return new ICmpInst(Pred, And, NotBOC); + } + break; + } + case Instruction::And: { + const APInt *BOC; + if (match(BOp1, m_APInt(BOC))) { + // If we have ((X & C) == C), turn it into ((X & C) != 0). + if (C == *BOC && C.isPowerOf2()) + return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, + BO, Constant::getNullValue(RHS->getType())); + } + break; + } + case Instruction::Mul: + if (C.isNullValue() && BO->hasNoSignedWrap()) { + const APInt *BOC; + if (match(BOp1, m_APInt(BOC)) && !BOC->isNullValue()) { + // The trivial case (mul X, 0) is handled by InstSimplify. + // General case : (mul X, C) != 0 iff X != 0 + // (mul X, C) == 0 iff X == 0 + return new ICmpInst(Pred, BOp0, Constant::getNullValue(RHS->getType())); + } + } + break; + case Instruction::UDiv: + if (C.isNullValue()) { + // (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A) + auto NewPred = isICMP_NE ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT; + return new ICmpInst(NewPred, BOp1, BOp0); + } + break; + default: + break; + } + return nullptr; +} + +/// Fold an equality icmp with LLVM intrinsic and constant operand. 
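+// Worked example (illustrative; using shorthand for a single-use cttz call on
+// an i8 value): the ctlz/cttz case below rewrites
+//   icmp eq (cttz i8 %x), 2  -->  icmp eq (and i8 %x, 7), 4
+// i.e. bit 2 of %x must be set and bits 0..1 must be clear.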
+Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp, + IntrinsicInst *II, + const APInt &C) { + Type *Ty = II->getType(); + unsigned BitWidth = C.getBitWidth(); + switch (II->getIntrinsicID()) { + case Intrinsic::bswap: + Worklist.Add(II); + Cmp.setOperand(0, II->getArgOperand(0)); + Cmp.setOperand(1, ConstantInt::get(Ty, C.byteSwap())); + return &Cmp; + + case Intrinsic::ctlz: + case Intrinsic::cttz: { + // ctz(A) == bitwidth(A) -> A == 0 and likewise for != + if (C == BitWidth) { + Worklist.Add(II); + Cmp.setOperand(0, II->getArgOperand(0)); + Cmp.setOperand(1, ConstantInt::getNullValue(Ty)); + return &Cmp; + } + + // ctz(A) == C -> A & Mask1 == Mask2, where Mask2 only has bit C set + // and Mask1 has bits 0..C+1 set. Similar for ctl, but for high bits. + // Limit to one use to ensure we don't increase instruction count. + unsigned Num = C.getLimitedValue(BitWidth); + if (Num != BitWidth && II->hasOneUse()) { + bool IsTrailing = II->getIntrinsicID() == Intrinsic::cttz; + APInt Mask1 = IsTrailing ? APInt::getLowBitsSet(BitWidth, Num + 1) + : APInt::getHighBitsSet(BitWidth, Num + 1); + APInt Mask2 = IsTrailing + ? APInt::getOneBitSet(BitWidth, Num) + : APInt::getOneBitSet(BitWidth, BitWidth - Num - 1); + Cmp.setOperand(0, Builder.CreateAnd(II->getArgOperand(0), Mask1)); + Cmp.setOperand(1, ConstantInt::get(Ty, Mask2)); + Worklist.Add(II); + return &Cmp; + } + break; + } + + case Intrinsic::ctpop: { + // popcount(A) == 0 -> A == 0 and likewise for != + // popcount(A) == bitwidth(A) -> A == -1 and likewise for != + bool IsZero = C.isNullValue(); + if (IsZero || C == BitWidth) { + Worklist.Add(II); + Cmp.setOperand(0, II->getArgOperand(0)); + auto *NewOp = + IsZero ? Constant::getNullValue(Ty) : Constant::getAllOnesValue(Ty); + Cmp.setOperand(1, NewOp); + return &Cmp; + } + break; + } + + case Intrinsic::uadd_sat: { + // uadd.sat(a, b) == 0 -> (a | b) == 0 + if (C.isNullValue()) { + Value *Or = Builder.CreateOr(II->getArgOperand(0), II->getArgOperand(1)); + return replaceInstUsesWith(Cmp, Builder.CreateICmp( + Cmp.getPredicate(), Or, Constant::getNullValue(Ty))); + + } + break; + } + + case Intrinsic::usub_sat: { + // usub.sat(a, b) == 0 -> a <= b + if (C.isNullValue()) { + ICmpInst::Predicate NewPred = Cmp.getPredicate() == ICmpInst::ICMP_EQ + ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT; + return ICmpInst::Create(Instruction::ICmp, NewPred, + II->getArgOperand(0), II->getArgOperand(1)); + } + break; + } + default: + break; + } + + return nullptr; +} + +/// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C. 
+Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp, + IntrinsicInst *II, + const APInt &C) { + if (Cmp.isEquality()) + return foldICmpEqIntrinsicWithConstant(Cmp, II, C); + + Type *Ty = II->getType(); + unsigned BitWidth = C.getBitWidth(); + switch (II->getIntrinsicID()) { + case Intrinsic::ctlz: { + // ctlz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX < 0b00010000 + if (Cmp.getPredicate() == ICmpInst::ICMP_UGT && C.ult(BitWidth)) { + unsigned Num = C.getLimitedValue(); + APInt Limit = APInt::getOneBitSet(BitWidth, BitWidth - Num - 1); + return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_ULT, + II->getArgOperand(0), ConstantInt::get(Ty, Limit)); + } + + // ctlz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX > 0b00011111 + if (Cmp.getPredicate() == ICmpInst::ICMP_ULT && + C.uge(1) && C.ule(BitWidth)) { + unsigned Num = C.getLimitedValue(); + APInt Limit = APInt::getLowBitsSet(BitWidth, BitWidth - Num); + return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_UGT, + II->getArgOperand(0), ConstantInt::get(Ty, Limit)); + } + break; + } + case Intrinsic::cttz: { + // Limit to one use to ensure we don't increase instruction count. + if (!II->hasOneUse()) + return nullptr; + + // cttz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX & 0b00001111 == 0 + if (Cmp.getPredicate() == ICmpInst::ICMP_UGT && C.ult(BitWidth)) { + APInt Mask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue() + 1); + return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ, + Builder.CreateAnd(II->getArgOperand(0), Mask), + ConstantInt::getNullValue(Ty)); + } + + // cttz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX & 0b00000111 != 0 + if (Cmp.getPredicate() == ICmpInst::ICMP_ULT && + C.uge(1) && C.ule(BitWidth)) { + APInt Mask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue()); + return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_NE, + Builder.CreateAnd(II->getArgOperand(0), Mask), + ConstantInt::getNullValue(Ty)); + } + break; + } + default: + break; + } + + return nullptr; +} + +/// Handle icmp with constant (but not simple integer constant) RHS. +Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + Constant *RHSC = dyn_cast<Constant>(Op1); + Instruction *LHSI = dyn_cast<Instruction>(Op0); + if (!RHSC || !LHSI) + return nullptr; + + switch (LHSI->getOpcode()) { + case Instruction::GetElementPtr: + // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null + if (RHSC->isNullValue() && + cast<GetElementPtrInst>(LHSI)->hasAllZeroIndices()) + return new ICmpInst( + I.getPredicate(), LHSI->getOperand(0), + Constant::getNullValue(LHSI->getOperand(0)->getType())); + break; + case Instruction::PHI: + // Only fold icmp into the PHI if the phi and icmp are in the same + // block. If in the same block, we're encouraging jump threading. If + // not, we are just pessimizing the code by making an i1 phi. + if (LHSI->getParent() == I.getParent()) + if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI))) + return NV; + break; + case Instruction::Select: { + // If either operand of the select is a constant, we can fold the + // comparison into the select arms, which will cause one to be + // constant folded and the select turned into a bitwise or. 
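+    // Worked example (illustrative; assuming the select has a single use):
+    //   icmp ult (select i1 %c, i8 5, i8 %x), 10
+    //     -->  select i1 %c, i1 true, i1 (icmp ult i8 %x, 10)
+    // The constant arm folds away and only one compare remains.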
+ Value *Op1 = nullptr, *Op2 = nullptr; + ConstantInt *CI = nullptr; + if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) { + Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); + CI = dyn_cast<ConstantInt>(Op1); + } + if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) { + Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); + CI = dyn_cast<ConstantInt>(Op2); + } + + // We only want to perform this transformation if it will not lead to + // additional code. This is true if either both sides of the select + // fold to a constant (in which case the icmp is replaced with a select + // which will usually simplify) or this is the only user of the + // select (in which case we are trading a select+icmp for a simpler + // select+icmp) or all uses of the select can be replaced based on + // dominance information ("Global cases"). + bool Transform = false; + if (Op1 && Op2) + Transform = true; + else if (Op1 || Op2) { + // Local case + if (LHSI->hasOneUse()) + Transform = true; + // Global cases + else if (CI && !CI->isZero()) + // When Op1 is constant try replacing select with second operand. + // Otherwise Op2 is constant and try replacing select with first + // operand. + Transform = + replacedSelectWithOperand(cast<SelectInst>(LHSI), &I, Op1 ? 2 : 1); + } + if (Transform) { + if (!Op1) + Op1 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC, + I.getName()); + if (!Op2) + Op2 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(2), RHSC, + I.getName()); + return SelectInst::Create(LHSI->getOperand(0), Op1, Op2); + } + break; + } + case Instruction::IntToPtr: + // icmp pred inttoptr(X), null -> icmp pred X, 0 + if (RHSC->isNullValue() && + DL.getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType()) + return new ICmpInst( + I.getPredicate(), LHSI->getOperand(0), + Constant::getNullValue(LHSI->getOperand(0)->getType())); + break; + + case Instruction::Load: + // Try to optimize things like "A[i] > 4" to index computations. + if (GetElementPtrInst *GEP = + dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) { + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) + if (GV->isConstant() && GV->hasDefinitiveInitializer() && + !cast<LoadInst>(LHSI)->isVolatile()) + if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I)) + return Res; + } + break; + } + + return nullptr; +} + +/// Some comparisons can be simplified. +/// In this case, we are looking for comparisons that look like +/// a check for a lossy truncation. +/// Folds: +/// icmp SrcPred (x & Mask), x to icmp DstPred x, Mask +/// Where Mask is some pattern that produces all-ones in low bits: +/// (-1 >> y) +/// ((-1 << y) >> y) <- non-canonical, has extra uses +/// ~(-1 << y) +/// ((1 << y) + (-1)) <- non-canonical, has extra uses +/// The Mask can be a constant, too. +/// For some predicates, the operands are commutative. +/// For others, x can only be on a specific side. 
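+/// For example (illustrative): (%x & 15) == %x folds to %x u<= 15, and
+/// (%x & 15) != %x folds to %x u> 15.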
+static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I, + InstCombiner::BuilderTy &Builder) { + ICmpInst::Predicate SrcPred; + Value *X, *M, *Y; + auto m_VariableMask = m_CombineOr( + m_CombineOr(m_Not(m_Shl(m_AllOnes(), m_Value())), + m_Add(m_Shl(m_One(), m_Value()), m_AllOnes())), + m_CombineOr(m_LShr(m_AllOnes(), m_Value()), + m_LShr(m_Shl(m_AllOnes(), m_Value(Y)), m_Deferred(Y)))); + auto m_Mask = m_CombineOr(m_VariableMask, m_LowBitMask()); + if (!match(&I, m_c_ICmp(SrcPred, + m_c_And(m_CombineAnd(m_Mask, m_Value(M)), m_Value(X)), + m_Deferred(X)))) + return nullptr; + + ICmpInst::Predicate DstPred; + switch (SrcPred) { + case ICmpInst::Predicate::ICMP_EQ: + // x & (-1 >> y) == x -> x u<= (-1 >> y) + DstPred = ICmpInst::Predicate::ICMP_ULE; + break; + case ICmpInst::Predicate::ICMP_NE: + // x & (-1 >> y) != x -> x u> (-1 >> y) + DstPred = ICmpInst::Predicate::ICMP_UGT; + break; + case ICmpInst::Predicate::ICMP_UGT: + // x u> x & (-1 >> y) -> x u> (-1 >> y) + assert(X == I.getOperand(0) && "instsimplify took care of commut. variant"); + DstPred = ICmpInst::Predicate::ICMP_UGT; + break; + case ICmpInst::Predicate::ICMP_UGE: + // x & (-1 >> y) u>= x -> x u<= (-1 >> y) + assert(X == I.getOperand(1) && "instsimplify took care of commut. variant"); + DstPred = ICmpInst::Predicate::ICMP_ULE; + break; + case ICmpInst::Predicate::ICMP_ULT: + // x & (-1 >> y) u< x -> x u> (-1 >> y) + assert(X == I.getOperand(1) && "instsimplify took care of commut. variant"); + DstPred = ICmpInst::Predicate::ICMP_UGT; + break; + case ICmpInst::Predicate::ICMP_ULE: + // x u<= x & (-1 >> y) -> x u<= (-1 >> y) + assert(X == I.getOperand(0) && "instsimplify took care of commut. variant"); + DstPred = ICmpInst::Predicate::ICMP_ULE; + break; + case ICmpInst::Predicate::ICMP_SGT: + // x s> x & (-1 >> y) -> x s> (-1 >> y) + if (X != I.getOperand(0)) // X must be on LHS of comparison! + return nullptr; // Ignore the other case. + if (!match(M, m_Constant())) // Can not do this fold with non-constant. + return nullptr; + if (!match(M, m_NonNegative())) // Must not have any -1 vector elements. + return nullptr; + DstPred = ICmpInst::Predicate::ICMP_SGT; + break; + case ICmpInst::Predicate::ICMP_SGE: + // x & (-1 >> y) s>= x -> x s<= (-1 >> y) + if (X != I.getOperand(1)) // X must be on RHS of comparison! + return nullptr; // Ignore the other case. + if (!match(M, m_Constant())) // Can not do this fold with non-constant. + return nullptr; + if (!match(M, m_NonNegative())) // Must not have any -1 vector elements. + return nullptr; + DstPred = ICmpInst::Predicate::ICMP_SLE; + break; + case ICmpInst::Predicate::ICMP_SLT: + // x & (-1 >> y) s< x -> x s> (-1 >> y) + if (X != I.getOperand(1)) // X must be on RHS of comparison! + return nullptr; // Ignore the other case. + if (!match(M, m_Constant())) // Can not do this fold with non-constant. + return nullptr; + if (!match(M, m_NonNegative())) // Must not have any -1 vector elements. + return nullptr; + DstPred = ICmpInst::Predicate::ICMP_SGT; + break; + case ICmpInst::Predicate::ICMP_SLE: + // x s<= x & (-1 >> y) -> x s<= (-1 >> y) + if (X != I.getOperand(0)) // X must be on LHS of comparison! + return nullptr; // Ignore the other case. + if (!match(M, m_Constant())) // Can not do this fold with non-constant. + return nullptr; + if (!match(M, m_NonNegative())) // Must not have any -1 vector elements. 
+ return nullptr; + DstPred = ICmpInst::Predicate::ICMP_SLE; + break; + default: + llvm_unreachable("All possible folds are handled."); + } + + return Builder.CreateICmp(DstPred, X, M); +} + +/// Some comparisons can be simplified. +/// In this case, we are looking for comparisons that look like +/// a check for a lossy signed truncation. +/// Folds: (MaskedBits is a constant.) +/// ((%x << MaskedBits) a>> MaskedBits) SrcPred %x +/// Into: +/// (add %x, (1 << (KeptBits-1))) DstPred (1 << KeptBits) +/// Where KeptBits = bitwidth(%x) - MaskedBits +static Value * +foldICmpWithTruncSignExtendedVal(ICmpInst &I, + InstCombiner::BuilderTy &Builder) { + ICmpInst::Predicate SrcPred; + Value *X; + const APInt *C0, *C1; // FIXME: non-splats, potentially with undef. + // We are ok with 'shl' having multiple uses, but 'ashr' must be one-use. + if (!match(&I, m_c_ICmp(SrcPred, + m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C0)), + m_APInt(C1))), + m_Deferred(X)))) + return nullptr; + + // Potential handling of non-splats: for each element: + // * if both are undef, replace with constant 0. + // Because (1<<0) is OK and is 1, and ((1<<0)>>1) is also OK and is 0. + // * if both are not undef, and are different, bailout. + // * else, only one is undef, then pick the non-undef one. + + // The shift amount must be equal. + if (*C0 != *C1) + return nullptr; + const APInt &MaskedBits = *C0; + assert(MaskedBits != 0 && "shift by zero should be folded away already."); + + ICmpInst::Predicate DstPred; + switch (SrcPred) { + case ICmpInst::Predicate::ICMP_EQ: + // ((%x << MaskedBits) a>> MaskedBits) == %x + // => + // (add %x, (1 << (KeptBits-1))) u< (1 << KeptBits) + DstPred = ICmpInst::Predicate::ICMP_ULT; + break; + case ICmpInst::Predicate::ICMP_NE: + // ((%x << MaskedBits) a>> MaskedBits) != %x + // => + // (add %x, (1 << (KeptBits-1))) u>= (1 << KeptBits) + DstPred = ICmpInst::Predicate::ICMP_UGE; + break; + // FIXME: are more folds possible? + default: + return nullptr; + } + + auto *XType = X->getType(); + const unsigned XBitWidth = XType->getScalarSizeInBits(); + const APInt BitWidth = APInt(XBitWidth, XBitWidth); + assert(BitWidth.ugt(MaskedBits) && "shifts should leave some bits untouched"); + + // KeptBits = bitwidth(%x) - MaskedBits + const APInt KeptBits = BitWidth - MaskedBits; + assert(KeptBits.ugt(0) && KeptBits.ult(BitWidth) && "unreachable"); + // ICmpCst = (1 << KeptBits) + const APInt ICmpCst = APInt(XBitWidth, 1).shl(KeptBits); + assert(ICmpCst.isPowerOf2()); + // AddCst = (1 << (KeptBits-1)) + const APInt AddCst = ICmpCst.lshr(1); + assert(AddCst.ult(ICmpCst) && AddCst.isPowerOf2()); + + // T0 = add %x, AddCst + Value *T0 = Builder.CreateAdd(X, ConstantInt::get(XType, AddCst)); + // T1 = T0 DstPred ICmpCst + Value *T1 = Builder.CreateICmp(DstPred, T0, ConstantInt::get(XType, ICmpCst)); + + return T1; +} + +// Given pattern: +// icmp eq/ne (and ((x shift Q), (y oppositeshift K))), 0 +// we should move shifts to the same hand of 'and', i.e. rewrite as +// icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x) +// We are only interested in opposite logical shifts here. +// One of the shifts can be truncated. +// If we can, we want to end up creating 'lshr' shift. 
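+// Illustrative example (i8, no truncation):
+//   icmp eq (and (lshr %x, 1), (shl %y, 2)), 0
+//     -> icmp eq (and (lshr %x, 3), %y), 0, since 3 u< 8.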
+static Value * +foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, + InstCombiner::BuilderTy &Builder) { + if (!I.isEquality() || !match(I.getOperand(1), m_Zero()) || + !I.getOperand(0)->hasOneUse()) + return nullptr; + + auto m_AnyLogicalShift = m_LogicalShift(m_Value(), m_Value()); + + // Look for an 'and' of two logical shifts, one of which may be truncated. + // We use m_TruncOrSelf() on the RHS to correctly handle commutative case. + Instruction *XShift, *MaybeTruncation, *YShift; + if (!match( + I.getOperand(0), + m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)), + m_CombineAnd(m_TruncOrSelf(m_CombineAnd( + m_AnyLogicalShift, m_Instruction(YShift))), + m_Instruction(MaybeTruncation))))) + return nullptr; + + // We potentially looked past 'trunc', but only when matching YShift, + // therefore YShift must have the widest type. + Instruction *WidestShift = YShift; + // Therefore XShift must have the shallowest type. + // Or they both have identical types if there was no truncation. + Instruction *NarrowestShift = XShift; + + Type *WidestTy = WidestShift->getType(); + assert(NarrowestShift->getType() == I.getOperand(0)->getType() && + "We did not look past any shifts while matching XShift though."); + bool HadTrunc = WidestTy != I.getOperand(0)->getType(); + + // If YShift is a 'lshr', swap the shifts around. + if (match(YShift, m_LShr(m_Value(), m_Value()))) + std::swap(XShift, YShift); + + // The shifts must be in opposite directions. + auto XShiftOpcode = XShift->getOpcode(); + if (XShiftOpcode == YShift->getOpcode()) + return nullptr; // Do not care about same-direction shifts here. + + Value *X, *XShAmt, *Y, *YShAmt; + match(XShift, m_BinOp(m_Value(X), m_ZExtOrSelf(m_Value(XShAmt)))); + match(YShift, m_BinOp(m_Value(Y), m_ZExtOrSelf(m_Value(YShAmt)))); + + // If one of the values being shifted is a constant, then we will end with + // and+icmp, and [zext+]shift instrs will be constant-folded. If they are not, + // however, we will need to ensure that we won't increase instruction count. + if (!isa<Constant>(X) && !isa<Constant>(Y)) { + // At least one of the hands of the 'and' should be one-use shift. + if (!match(I.getOperand(0), + m_c_And(m_OneUse(m_AnyLogicalShift), m_Value()))) + return nullptr; + if (HadTrunc) { + // Due to the 'trunc', we will need to widen X. For that either the old + // 'trunc' or the shift amt in the non-truncated shift should be one-use. + if (!MaybeTruncation->hasOneUse() && + !NarrowestShift->getOperand(1)->hasOneUse()) + return nullptr; + } + } + + // We have two shift amounts from two different shifts. The types of those + // shift amounts may not match. If that's the case let's bailout now. + if (XShAmt->getType() != YShAmt->getType()) + return nullptr; + + // Can we fold (XShAmt+YShAmt) ? + auto *NewShAmt = dyn_cast_or_null<Constant>( + SimplifyAddInst(XShAmt, YShAmt, /*isNSW=*/false, + /*isNUW=*/false, SQ.getWithInstruction(&I))); + if (!NewShAmt) + return nullptr; + NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, WidestTy); + unsigned WidestBitWidth = WidestTy->getScalarSizeInBits(); + + // Is the new shift amount smaller than the bit width? + // FIXME: could also rely on ConstantRange. + if (!match(NewShAmt, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, + APInt(WidestBitWidth, WidestBitWidth)))) + return nullptr; + + // An extra legality check is needed if we had trunc-of-lshr. 
+ if (HadTrunc && match(WidestShift, m_LShr(m_Value(), m_Value()))) { + auto CanFold = [NewShAmt, WidestBitWidth, NarrowestShift, SQ, + WidestShift]() { + // It isn't obvious whether it's worth it to analyze non-constants here. + // Also, let's basically give up on non-splat cases, pessimizing vectors. + // If *any* of these preconditions matches we can perform the fold. + Constant *NewShAmtSplat = NewShAmt->getType()->isVectorTy() + ? NewShAmt->getSplatValue() + : NewShAmt; + // If it's edge-case shift (by 0 or by WidestBitWidth-1) we can fold. + if (NewShAmtSplat && + (NewShAmtSplat->isNullValue() || + NewShAmtSplat->getUniqueInteger() == WidestBitWidth - 1)) + return true; + // We consider *min* leading zeros so a single outlier + // blocks the transform as opposed to allowing it. + if (auto *C = dyn_cast<Constant>(NarrowestShift->getOperand(0))) { + KnownBits Known = computeKnownBits(C, SQ.DL); + unsigned MinLeadZero = Known.countMinLeadingZeros(); + // If the value being shifted has at most lowest bit set we can fold. + unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero; + if (MaxActiveBits <= 1) + return true; + // Precondition: NewShAmt u<= countLeadingZeros(C) + if (NewShAmtSplat && NewShAmtSplat->getUniqueInteger().ule(MinLeadZero)) + return true; + } + if (auto *C = dyn_cast<Constant>(WidestShift->getOperand(0))) { + KnownBits Known = computeKnownBits(C, SQ.DL); + unsigned MinLeadZero = Known.countMinLeadingZeros(); + // If the value being shifted has at most lowest bit set we can fold. + unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero; + if (MaxActiveBits <= 1) + return true; + // Precondition: ((WidestBitWidth-1)-NewShAmt) u<= countLeadingZeros(C) + if (NewShAmtSplat) { + APInt AdjNewShAmt = + (WidestBitWidth - 1) - NewShAmtSplat->getUniqueInteger(); + if (AdjNewShAmt.ule(MinLeadZero)) + return true; + } + } + return false; // Can't tell if it's ok. + }; + if (!CanFold()) + return nullptr; + } + + // All good, we can do this fold. + X = Builder.CreateZExt(X, WidestTy); + Y = Builder.CreateZExt(Y, WidestTy); + // The shift is the same that was for X. + Value *T0 = XShiftOpcode == Instruction::BinaryOps::LShr + ? Builder.CreateLShr(X, NewShAmt) + : Builder.CreateShl(X, NewShAmt); + Value *T1 = Builder.CreateAnd(T0, Y); + return Builder.CreateICmp(I.getPredicate(), T1, + Constant::getNullValue(WidestTy)); +} + +/// Fold +/// (-1 u/ x) u< y +/// ((x * y) u/ x) != y +/// to +/// @llvm.umul.with.overflow(x, y) plus extraction of overflow bit +/// Note that the comparison is commutative, while inverted (u>=, ==) predicate +/// will mean that we are looking for the opposite answer. +Value *InstCombiner::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) { + ICmpInst::Predicate Pred; + Value *X, *Y; + Instruction *Mul; + bool NeedNegation; + // Look for: (-1 u/ x) u</u>= y + if (!I.isEquality() && + match(&I, m_c_ICmp(Pred, m_OneUse(m_UDiv(m_AllOnes(), m_Value(X))), + m_Value(Y)))) { + Mul = nullptr; + // Canonicalize as-if y was on RHS. + if (I.getOperand(1) != Y) + Pred = I.getSwappedPredicate(); + + // Are we checking that overflow does not happen, or does happen? + switch (Pred) { + case ICmpInst::Predicate::ICMP_ULT: + NeedNegation = false; + break; // OK + case ICmpInst::Predicate::ICMP_UGE: + NeedNegation = true; + break; // OK + default: + return nullptr; // Wrong predicate. 
+ } + } else // Look for: ((x * y) u/ x) !=/== y + if (I.isEquality() && + match(&I, m_c_ICmp(Pred, m_Value(Y), + m_OneUse(m_UDiv(m_CombineAnd(m_c_Mul(m_Deferred(Y), + m_Value(X)), + m_Instruction(Mul)), + m_Deferred(X)))))) { + NeedNegation = Pred == ICmpInst::Predicate::ICMP_EQ; + } else + return nullptr; + + BuilderTy::InsertPointGuard Guard(Builder); + // If the pattern included (x * y), we'll want to insert new instructions + // right before that original multiplication so that we can replace it. + bool MulHadOtherUses = Mul && !Mul->hasOneUse(); + if (MulHadOtherUses) + Builder.SetInsertPoint(Mul); + + Function *F = Intrinsic::getDeclaration( + I.getModule(), Intrinsic::umul_with_overflow, X->getType()); + CallInst *Call = Builder.CreateCall(F, {X, Y}, "umul"); + + // If the multiplication was used elsewhere, to ensure that we don't leave + // "duplicate" instructions, replace uses of that original multiplication + // with the multiplication result from the with.overflow intrinsic. + if (MulHadOtherUses) + replaceInstUsesWith(*Mul, Builder.CreateExtractValue(Call, 0, "umul.val")); + + Value *Res = Builder.CreateExtractValue(Call, 1, "umul.ov"); + if (NeedNegation) // This technically increases instruction count. + Res = Builder.CreateNot(Res, "umul.not.ov"); + + return Res; +} + +/// Try to fold icmp (binop), X or icmp X, (binop). +/// TODO: A large part of this logic is duplicated in InstSimplify's +/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code +/// duplication. +Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I, const SimplifyQuery &SQ) { + const SimplifyQuery Q = SQ.getWithInstruction(&I); + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + // Special logic for binary operators. + BinaryOperator *BO0 = dyn_cast<BinaryOperator>(Op0); + BinaryOperator *BO1 = dyn_cast<BinaryOperator>(Op1); + if (!BO0 && !BO1) + return nullptr; + + const CmpInst::Predicate Pred = I.getPredicate(); + Value *X; + + // Convert add-with-unsigned-overflow comparisons into a 'not' with compare. + // (Op1 + X) u</u>= Op1 --> ~Op1 u</u>= X + if (match(Op0, m_OneUse(m_c_Add(m_Specific(Op1), m_Value(X)))) && + (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) + return new ICmpInst(Pred, Builder.CreateNot(Op1), X); + // Op0 u>/u<= (Op0 + X) --> X u>/u<= ~Op0 + if (match(Op1, m_OneUse(m_c_Add(m_Specific(Op0), m_Value(X)))) && + (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE)) + return new ICmpInst(Pred, X, Builder.CreateNot(Op0)); + + bool NoOp0WrapProblem = false, NoOp1WrapProblem = false; + if (BO0 && isa<OverflowingBinaryOperator>(BO0)) + NoOp0WrapProblem = + ICmpInst::isEquality(Pred) || + (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) || + (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap()); + if (BO1 && isa<OverflowingBinaryOperator>(BO1)) + NoOp1WrapProblem = + ICmpInst::isEquality(Pred) || + (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) || + (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap()); + + // Analyze the case when either Op0 or Op1 is an add instruction. + // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null). + Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; + if (BO0 && BO0->getOpcode() == Instruction::Add) { + A = BO0->getOperand(0); + B = BO0->getOperand(1); + } + if (BO1 && BO1->getOpcode() == Instruction::Add) { + C = BO1->getOperand(0); + D = BO1->getOperand(1); + } + + // icmp (A+B), A -> icmp B, 0 for equalities or if there is no overflow. 
+ // icmp (A+B), B -> icmp A, 0 for equalities or if there is no overflow. + if ((A == Op1 || B == Op1) && NoOp0WrapProblem) + return new ICmpInst(Pred, A == Op1 ? B : A, + Constant::getNullValue(Op1->getType())); + + // icmp C, (C+D) -> icmp 0, D for equalities or if there is no overflow. + // icmp D, (C+D) -> icmp 0, C for equalities or if there is no overflow. + if ((C == Op0 || D == Op0) && NoOp1WrapProblem) + return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()), + C == Op0 ? D : C); + + // icmp (A+B), (A+D) -> icmp B, D for equalities or if there is no overflow. + if (A && C && (A == C || A == D || B == C || B == D) && NoOp0WrapProblem && + NoOp1WrapProblem) { + // Determine Y and Z in the form icmp (X+Y), (X+Z). + Value *Y, *Z; + if (A == C) { + // C + B == C + D -> B == D + Y = B; + Z = D; + } else if (A == D) { + // D + B == C + D -> B == C + Y = B; + Z = C; + } else if (B == C) { + // A + C == C + D -> A == D + Y = A; + Z = D; + } else { + assert(B == D); + // A + D == C + D -> A == C + Y = A; + Z = C; + } + return new ICmpInst(Pred, Y, Z); + } + + // icmp slt (A + -1), Op1 -> icmp sle A, Op1 + if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT && + match(B, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SLE, A, Op1); + + // icmp sge (A + -1), Op1 -> icmp sgt A, Op1 + if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE && + match(B, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SGT, A, Op1); + + // icmp sle (A + 1), Op1 -> icmp slt A, Op1 + if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && match(B, m_One())) + return new ICmpInst(CmpInst::ICMP_SLT, A, Op1); + + // icmp sgt (A + 1), Op1 -> icmp sge A, Op1 + if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && match(B, m_One())) + return new ICmpInst(CmpInst::ICMP_SGE, A, Op1); + + // icmp sgt Op0, (C + -1) -> icmp sge Op0, C + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT && + match(D, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SGE, Op0, C); + + // icmp sle Op0, (C + -1) -> icmp slt Op0, C + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE && + match(D, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SLT, Op0, C); + + // icmp sge Op0, (C + 1) -> icmp sgt Op0, C + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && match(D, m_One())) + return new ICmpInst(CmpInst::ICMP_SGT, Op0, C); + + // icmp slt Op0, (C + 1) -> icmp sle Op0, C + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One())) + return new ICmpInst(CmpInst::ICMP_SLE, Op0, C); + + // TODO: The subtraction-related identities shown below also hold, but + // canonicalization from (X -nuw 1) to (X + -1) means that the combinations + // wouldn't happen even if they were implemented. 
+  //
+  // icmp ult (A - 1), Op1 -> icmp ule A, Op1
+  // icmp uge (A - 1), Op1 -> icmp ugt A, Op1
+  // icmp ugt Op0, (C - 1) -> icmp uge Op0, C
+  // icmp ule Op0, (C - 1) -> icmp ult Op0, C
+
+  // icmp ule (A + 1), Op1 -> icmp ult A, Op1
+  if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One()))
+    return new ICmpInst(CmpInst::ICMP_ULT, A, Op1);
+
+  // icmp ugt (A + 1), Op1 -> icmp uge A, Op1
+  if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One()))
+    return new ICmpInst(CmpInst::ICMP_UGE, A, Op1);
+
+  // icmp uge Op0, (C + 1) -> icmp ugt Op0, C
+  if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One()))
+    return new ICmpInst(CmpInst::ICMP_UGT, Op0, C);
+
+  // icmp ult Op0, (C + 1) -> icmp ule Op0, C
+  if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One()))
+    return new ICmpInst(CmpInst::ICMP_ULE, Op0, C);
+
+  // if C1 has greater magnitude than C2:
+  //   icmp (A + C1), (C + C2) -> icmp (A + C3), C
+  //   s.t. C3 = C1 - C2
+  //
+  // if C2 has greater magnitude than C1:
+  //   icmp (A + C1), (C + C2) -> icmp A, (C + C3)
+  //   s.t. C3 = C2 - C1
+  if (A && C && NoOp0WrapProblem && NoOp1WrapProblem &&
+      (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned())
+    if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
+      if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) {
+        const APInt &AP1 = C1->getValue();
+        const APInt &AP2 = C2->getValue();
+        if (AP1.isNegative() == AP2.isNegative()) {
+          APInt AP1Abs = C1->getValue().abs();
+          APInt AP2Abs = C2->getValue().abs();
+          if (AP1Abs.uge(AP2Abs)) {
+            ConstantInt *C3 = Builder.getInt(AP1 - AP2);
+            Value *NewAdd = Builder.CreateNSWAdd(A, C3);
+            return new ICmpInst(Pred, NewAdd, C);
+          } else {
+            ConstantInt *C3 = Builder.getInt(AP2 - AP1);
+            Value *NewAdd = Builder.CreateNSWAdd(C, C3);
+            return new ICmpInst(Pred, A, NewAdd);
+          }
+        }
+      }
+
+  // Analyze the case when either Op0 or Op1 is a sub instruction.
+  // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
+  A = nullptr;
+  B = nullptr;
+  C = nullptr;
+  D = nullptr;
+  if (BO0 && BO0->getOpcode() == Instruction::Sub) {
+    A = BO0->getOperand(0);
+    B = BO0->getOperand(1);
+  }
+  if (BO1 && BO1->getOpcode() == Instruction::Sub) {
+    C = BO1->getOperand(0);
+    D = BO1->getOperand(1);
+  }
+
+  // icmp (A-B), A -> icmp 0, B for equalities or if there is no overflow.
+  if (A == Op1 && NoOp0WrapProblem)
+    return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
+  // icmp C, (C-D) -> icmp D, 0 for equalities or if there is no overflow.
+  if (C == Op0 && NoOp1WrapProblem)
+    return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));
+
+  // Convert sub-with-unsigned-overflow comparisons into a comparison of args.
+ // (A - B) u>/u<= A --> B u>/u<= A + if (A == Op1 && (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE)) + return new ICmpInst(Pred, B, A); + // C u</u>= (C - D) --> C u</u>= D + if (C == Op0 && (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) + return new ICmpInst(Pred, C, D); + // (A - B) u>=/u< A --> B u>/u<= A iff B != 0 + if (A == Op1 && (Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_ULT) && + isKnownNonZero(B, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT)) + return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), B, A); + // C u<=/u> (C - D) --> C u</u>= D iff B != 0 + if (C == Op0 && (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT) && + isKnownNonZero(D, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT)) + return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), C, D); + + // icmp (A-B), (C-B) -> icmp A, C for equalities or if there is no overflow. + if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem) + return new ICmpInst(Pred, A, C); + + // icmp (A-B), (A-D) -> icmp D, B for equalities or if there is no overflow. + if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem) + return new ICmpInst(Pred, D, B); + + // icmp (0-X) < cst --> x > -cst + if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) { + Value *X; + if (match(BO0, m_Neg(m_Value(X)))) + if (Constant *RHSC = dyn_cast<Constant>(Op1)) + if (RHSC->isNotMinSignedValue()) + return new ICmpInst(I.getSwappedPredicate(), X, + ConstantExpr::getNeg(RHSC)); + } + + BinaryOperator *SRem = nullptr; + // icmp (srem X, Y), Y + if (BO0 && BO0->getOpcode() == Instruction::SRem && Op1 == BO0->getOperand(1)) + SRem = BO0; + // icmp Y, (srem X, Y) + else if (BO1 && BO1->getOpcode() == Instruction::SRem && + Op0 == BO1->getOperand(1)) + SRem = BO1; + if (SRem) { + // We don't check hasOneUse to avoid increasing register pressure because + // the value we use is the same value this instruction was already using. + switch (SRem == BO0 ? ICmpInst::getSwappedPredicate(Pred) : Pred) { + default: + break; + case ICmpInst::ICMP_EQ: + return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); + case ICmpInst::ICMP_NE: + return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_SGE: + return new ICmpInst(ICmpInst::ICMP_SGT, SRem->getOperand(1), + Constant::getAllOnesValue(SRem->getType())); + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_SLE: + return new ICmpInst(ICmpInst::ICMP_SLT, SRem->getOperand(1), + Constant::getNullValue(SRem->getType())); + } + } + + if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() && BO0->hasOneUse() && + BO1->hasOneUse() && BO0->getOperand(1) == BO1->getOperand(1)) { + switch (BO0->getOpcode()) { + default: + break; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Xor: { + if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + + const APInt *C; + if (match(BO0->getOperand(1), m_APInt(C))) { + // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b + if (C->isSignMask()) { + ICmpInst::Predicate NewPred = + I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); + return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0)); + } + + // icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b + if (BO0->getOpcode() == Instruction::Xor && C->isMaxSignedValue()) { + ICmpInst::Predicate NewPred = + I.isSigned() ? 
I.getUnsignedPredicate() : I.getSignedPredicate(); + NewPred = I.getSwappedPredicate(NewPred); + return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0)); + } + } + break; + } + case Instruction::Mul: { + if (!I.isEquality()) + break; + + const APInt *C; + if (match(BO0->getOperand(1), m_APInt(C)) && !C->isNullValue() && + !C->isOneValue()) { + // icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask) + // Mask = -1 >> count-trailing-zeros(C). + if (unsigned TZs = C->countTrailingZeros()) { + Constant *Mask = ConstantInt::get( + BO0->getType(), + APInt::getLowBitsSet(C->getBitWidth(), C->getBitWidth() - TZs)); + Value *And1 = Builder.CreateAnd(BO0->getOperand(0), Mask); + Value *And2 = Builder.CreateAnd(BO1->getOperand(0), Mask); + return new ICmpInst(Pred, And1, And2); + } + // If there are no trailing zeros in the multiplier, just eliminate + // the multiplies (no masking is needed): + // icmp eq/ne (X * C), (Y * C) --> icmp eq/ne X, Y + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + } + break; + } + case Instruction::UDiv: + case Instruction::LShr: + if (I.isSigned() || !BO0->isExact() || !BO1->isExact()) + break; + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + + case Instruction::SDiv: + if (!I.isEquality() || !BO0->isExact() || !BO1->isExact()) + break; + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + + case Instruction::AShr: + if (!BO0->isExact() || !BO1->isExact()) + break; + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + + case Instruction::Shl: { + bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap(); + bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap(); + if (!NUW && !NSW) + break; + if (!NSW && I.isSigned()) + break; + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + } + } + } + + if (BO0) { + // Transform A & (L - 1) `ult` L --> L != 0 + auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes()); + auto BitwiseAnd = m_c_And(m_Value(), LSubOne); + + if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) { + auto *Zero = Constant::getNullValue(BO0->getType()); + return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero); + } + } + + if (Value *V = foldUnsignedMultiplicationOverflowCheck(I)) + return replaceInstUsesWith(I, V); + + if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder)) + return replaceInstUsesWith(I, V); + + if (Value *V = foldICmpWithTruncSignExtendedVal(I, Builder)) + return replaceInstUsesWith(I, V); + + if (Value *V = foldShiftIntoShiftInAnotherHandOfAndInICmp(I, SQ, Builder)) + return replaceInstUsesWith(I, V); + + return nullptr; +} + +/// Fold icmp Pred min|max(X, Y), X. +static Instruction *foldICmpWithMinMax(ICmpInst &Cmp) { + ICmpInst::Predicate Pred = Cmp.getPredicate(); + Value *Op0 = Cmp.getOperand(0); + Value *X = Cmp.getOperand(1); + + // Canonicalize minimum or maximum operand to LHS of the icmp. 
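+  // For instance (illustrative), icmp sgt %x, (smin %x, %y) is handled below
+  // as icmp slt (smin %x, %y), %x.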
+ if (match(X, m_c_SMin(m_Specific(Op0), m_Value())) || + match(X, m_c_SMax(m_Specific(Op0), m_Value())) || + match(X, m_c_UMin(m_Specific(Op0), m_Value())) || + match(X, m_c_UMax(m_Specific(Op0), m_Value()))) { + std::swap(Op0, X); + Pred = Cmp.getSwappedPredicate(); + } + + Value *Y; + if (match(Op0, m_c_SMin(m_Specific(X), m_Value(Y)))) { + // smin(X, Y) == X --> X s<= Y + // smin(X, Y) s>= X --> X s<= Y + if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SGE) + return new ICmpInst(ICmpInst::ICMP_SLE, X, Y); + + // smin(X, Y) != X --> X s> Y + // smin(X, Y) s< X --> X s> Y + if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SLT) + return new ICmpInst(ICmpInst::ICMP_SGT, X, Y); + + // These cases should be handled in InstSimplify: + // smin(X, Y) s<= X --> true + // smin(X, Y) s> X --> false + return nullptr; + } + + if (match(Op0, m_c_SMax(m_Specific(X), m_Value(Y)))) { + // smax(X, Y) == X --> X s>= Y + // smax(X, Y) s<= X --> X s>= Y + if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SLE) + return new ICmpInst(ICmpInst::ICMP_SGE, X, Y); + + // smax(X, Y) != X --> X s< Y + // smax(X, Y) s> X --> X s< Y + if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SGT) + return new ICmpInst(ICmpInst::ICMP_SLT, X, Y); + + // These cases should be handled in InstSimplify: + // smax(X, Y) s>= X --> true + // smax(X, Y) s< X --> false + return nullptr; + } + + if (match(Op0, m_c_UMin(m_Specific(X), m_Value(Y)))) { + // umin(X, Y) == X --> X u<= Y + // umin(X, Y) u>= X --> X u<= Y + if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_UGE) + return new ICmpInst(ICmpInst::ICMP_ULE, X, Y); + + // umin(X, Y) != X --> X u> Y + // umin(X, Y) u< X --> X u> Y + if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT) + return new ICmpInst(ICmpInst::ICMP_UGT, X, Y); + + // These cases should be handled in InstSimplify: + // umin(X, Y) u<= X --> true + // umin(X, Y) u> X --> false + return nullptr; + } + + if (match(Op0, m_c_UMax(m_Specific(X), m_Value(Y)))) { + // umax(X, Y) == X --> X u>= Y + // umax(X, Y) u<= X --> X u>= Y + if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_ULE) + return new ICmpInst(ICmpInst::ICMP_UGE, X, Y); + + // umax(X, Y) != X --> X u< Y + // umax(X, Y) u> X --> X u< Y + if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_UGT) + return new ICmpInst(ICmpInst::ICMP_ULT, X, Y); + + // These cases should be handled in InstSimplify: + // umax(X, Y) u>= X --> true + // umax(X, Y) u< X --> false + return nullptr; + } + + return nullptr; +} + +Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { + if (!I.isEquality()) + return nullptr; + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + const CmpInst::Predicate Pred = I.getPredicate(); + Value *A, *B, *C, *D; + if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) { + if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0 + Value *OtherVal = A == Op1 ? 
B : A; + return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType())); + } + + if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) { + // A^c1 == C^c2 --> A == C^(c1^c2) + ConstantInt *C1, *C2; + if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) && + Op1->hasOneUse()) { + Constant *NC = Builder.getInt(C1->getValue() ^ C2->getValue()); + Value *Xor = Builder.CreateXor(C, NC); + return new ICmpInst(Pred, A, Xor); + } + + // A^B == A^D -> B == D + if (A == C) + return new ICmpInst(Pred, B, D); + if (A == D) + return new ICmpInst(Pred, B, C); + if (B == C) + return new ICmpInst(Pred, A, D); + if (B == D) + return new ICmpInst(Pred, A, C); + } + } + + if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) { + // A == (A^B) -> B == 0 + Value *OtherVal = A == Op0 ? B : A; + return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType())); + } + + // (X&Z) == (Y&Z) -> (X^Y) & Z == 0 + if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) && + match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) { + Value *X = nullptr, *Y = nullptr, *Z = nullptr; + + if (A == C) { + X = B; + Y = D; + Z = A; + } else if (A == D) { + X = B; + Y = C; + Z = A; + } else if (B == C) { + X = A; + Y = D; + Z = B; + } else if (B == D) { + X = A; + Y = C; + Z = B; + } + + if (X) { // Build (X^Y) & Z + Op1 = Builder.CreateXor(X, Y); + Op1 = Builder.CreateAnd(Op1, Z); + I.setOperand(0, Op1); + I.setOperand(1, Constant::getNullValue(Op1->getType())); + return &I; + } + } + + // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B) + // and (B & (1<<X)-1) == (zext A) --> A == (trunc B) + ConstantInt *Cst1; + if ((Op0->hasOneUse() && match(Op0, m_ZExt(m_Value(A))) && + match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) || + (Op1->hasOneUse() && match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) && + match(Op1, m_ZExt(m_Value(A))))) { + APInt Pow2 = Cst1->getValue() + 1; + if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) && + Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth()) + return new ICmpInst(Pred, A, Builder.CreateTrunc(B, A->getType())); + } + + // (A >> C) == (B >> C) --> (A^B) u< (1 << C) + // For lshr and ashr pairs. + if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) && + match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) || + (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) && + match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) { + unsigned TypeBits = Cst1->getBitWidth(); + unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits); + if (ShAmt < TypeBits && ShAmt != 0) { + ICmpInst::Predicate NewPred = + Pred == ICmpInst::ICMP_NE ? 
ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT; + Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted"); + APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt); + return new ICmpInst(NewPred, Xor, Builder.getInt(CmpVal)); + } + } + + // (A << C) == (B << C) --> ((A^B) & (~0U >> C)) == 0 + if (match(Op0, m_OneUse(m_Shl(m_Value(A), m_ConstantInt(Cst1)))) && + match(Op1, m_OneUse(m_Shl(m_Value(B), m_Specific(Cst1))))) { + unsigned TypeBits = Cst1->getBitWidth(); + unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits); + if (ShAmt < TypeBits && ShAmt != 0) { + Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted"); + APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt); + Value *And = Builder.CreateAnd(Xor, Builder.getInt(AndVal), + I.getName() + ".mask"); + return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType())); + } + } + + // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to + // "icmp (and X, mask), cst" + uint64_t ShAmt = 0; + if (Op0->hasOneUse() && + match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), m_ConstantInt(ShAmt))))) && + match(Op1, m_ConstantInt(Cst1)) && + // Only do this when A has multiple uses. This is most important to do + // when it exposes other optimizations. + !A->hasOneUse()) { + unsigned ASize = cast<IntegerType>(A->getType())->getPrimitiveSizeInBits(); + + if (ShAmt < ASize) { + APInt MaskV = + APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits()); + MaskV <<= ShAmt; + + APInt CmpV = Cst1->getValue().zext(ASize); + CmpV <<= ShAmt; + + Value *Mask = Builder.CreateAnd(A, Builder.getInt(MaskV)); + return new ICmpInst(Pred, Mask, Builder.getInt(CmpV)); + } + } + + // If both operands are byte-swapped or bit-reversed, just compare the + // original values. + // TODO: Move this to a function similar to foldICmpIntrinsicWithConstant() + // and handle more intrinsics. + if ((match(Op0, m_BSwap(m_Value(A))) && match(Op1, m_BSwap(m_Value(B)))) || + (match(Op0, m_BitReverse(m_Value(A))) && + match(Op1, m_BitReverse(m_Value(B))))) + return new ICmpInst(Pred, A, B); + + // Canonicalize checking for a power-of-2-or-zero value: + // (A & (A-1)) == 0 --> ctpop(A) < 2 (two commuted variants) + // ((A-1) & A) != 0 --> ctpop(A) > 1 (two commuted variants) + if (!match(Op0, m_OneUse(m_c_And(m_Add(m_Value(A), m_AllOnes()), + m_Deferred(A)))) || + !match(Op1, m_ZeroInt())) + A = nullptr; + + // (A & -A) == A --> ctpop(A) < 2 (four commuted variants) + // (-A & A) != A --> ctpop(A) > 1 (four commuted variants) + if (match(Op0, m_OneUse(m_c_And(m_Neg(m_Specific(Op1)), m_Specific(Op1))))) + A = Op1; + else if (match(Op1, + m_OneUse(m_c_And(m_Neg(m_Specific(Op0)), m_Specific(Op0))))) + A = Op0; + + if (A) { + Type *Ty = A->getType(); + CallInst *CtPop = Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, A); + return Pred == ICmpInst::ICMP_EQ + ? new ICmpInst(ICmpInst::ICMP_ULT, CtPop, ConstantInt::get(Ty, 2)) + : new ICmpInst(ICmpInst::ICMP_UGT, CtPop, ConstantInt::get(Ty, 1)); + } + + return nullptr; +} + +static Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp, + InstCombiner::BuilderTy &Builder) { + assert(isa<CastInst>(ICmp.getOperand(0)) && "Expected cast for operand 0"); + auto *CastOp0 = cast<CastInst>(ICmp.getOperand(0)); + Value *X; + if (!match(CastOp0, m_ZExtOrSExt(m_Value(X)))) + return nullptr; + + bool IsSignedExt = CastOp0->getOpcode() == Instruction::SExt; + bool IsSignedCmp = ICmp.isSigned(); + if (auto *CastOp1 = dyn_cast<CastInst>(ICmp.getOperand(1))) { + // If the signedness of the two casts doesn't agree (i.e. 
one is a sext + // and the other is a zext), then we can't handle this. + // TODO: This is too strict. We can handle some predicates (equality?). + if (CastOp0->getOpcode() != CastOp1->getOpcode()) + return nullptr; + + // Not an extension from the same type? + Value *Y = CastOp1->getOperand(0); + Type *XTy = X->getType(), *YTy = Y->getType(); + if (XTy != YTy) { + // One of the casts must have one use because we are creating a new cast. + if (!CastOp0->hasOneUse() && !CastOp1->hasOneUse()) + return nullptr; + // Extend the narrower operand to the type of the wider operand. + if (XTy->getScalarSizeInBits() < YTy->getScalarSizeInBits()) + X = Builder.CreateCast(CastOp0->getOpcode(), X, YTy); + else if (YTy->getScalarSizeInBits() < XTy->getScalarSizeInBits()) + Y = Builder.CreateCast(CastOp0->getOpcode(), Y, XTy); + else + return nullptr; + } + + // (zext X) == (zext Y) --> X == Y + // (sext X) == (sext Y) --> X == Y + if (ICmp.isEquality()) + return new ICmpInst(ICmp.getPredicate(), X, Y); + + // A signed comparison of sign extended values simplifies into a + // signed comparison. + if (IsSignedCmp && IsSignedExt) + return new ICmpInst(ICmp.getPredicate(), X, Y); + + // The other three cases all fold into an unsigned comparison. + return new ICmpInst(ICmp.getUnsignedPredicate(), X, Y); + } + + // Below here, we are only folding a compare with constant. + auto *C = dyn_cast<Constant>(ICmp.getOperand(1)); + if (!C) + return nullptr; + + // Compute the constant that would happen if we truncated to SrcTy then + // re-extended to DestTy. + Type *SrcTy = CastOp0->getSrcTy(); + Type *DestTy = CastOp0->getDestTy(); + Constant *Res1 = ConstantExpr::getTrunc(C, SrcTy); + Constant *Res2 = ConstantExpr::getCast(CastOp0->getOpcode(), Res1, DestTy); + + // If the re-extended constant didn't change... + if (Res2 == C) { + if (ICmp.isEquality()) + return new ICmpInst(ICmp.getPredicate(), X, Res1); + + // A signed comparison of sign extended values simplifies into a + // signed comparison. + if (IsSignedExt && IsSignedCmp) + return new ICmpInst(ICmp.getPredicate(), X, Res1); + + // The other three cases all fold into an unsigned comparison. + return new ICmpInst(ICmp.getUnsignedPredicate(), X, Res1); + } + + // The re-extended constant changed, partly changed (in the case of a vector), + // or could not be determined to be equal (in the case of a constant + // expression), so the constant cannot be represented in the shorter type. + // All the cases that fold to true or false will have already been handled + // by SimplifyICmpInst, so only deal with the tricky case. + if (IsSignedCmp || !IsSignedExt || !isa<ConstantInt>(C)) + return nullptr; + + // Is source op positive? + // icmp ult (sext X), C --> icmp sgt X, -1 + if (ICmp.getPredicate() == ICmpInst::ICMP_ULT) + return new ICmpInst(CmpInst::ICMP_SGT, X, Constant::getAllOnesValue(SrcTy)); + + // Is source op negative? + // icmp ugt (sext X), C --> icmp slt X, 0 + assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!"); + return new ICmpInst(CmpInst::ICMP_SLT, X, Constant::getNullValue(SrcTy)); +} + +/// Handle icmp (cast x), (cast or constant). 
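+/// For example (illustrative): icmp ult (zext i8 %a to i32), (zext i8 %b to i32)
+/// folds to icmp ult i8 %a, %b.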
+Instruction *InstCombiner::foldICmpWithCastOp(ICmpInst &ICmp) { + auto *CastOp0 = dyn_cast<CastInst>(ICmp.getOperand(0)); + if (!CastOp0) + return nullptr; + if (!isa<Constant>(ICmp.getOperand(1)) && !isa<CastInst>(ICmp.getOperand(1))) + return nullptr; + + Value *Op0Src = CastOp0->getOperand(0); + Type *SrcTy = CastOp0->getSrcTy(); + Type *DestTy = CastOp0->getDestTy(); + + // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the + // integer type is the same size as the pointer type. + auto CompatibleSizes = [&](Type *SrcTy, Type *DestTy) { + if (isa<VectorType>(SrcTy)) { + SrcTy = cast<VectorType>(SrcTy)->getElementType(); + DestTy = cast<VectorType>(DestTy)->getElementType(); + } + return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth(); + }; + if (CastOp0->getOpcode() == Instruction::PtrToInt && + CompatibleSizes(SrcTy, DestTy)) { + Value *NewOp1 = nullptr; + if (auto *PtrToIntOp1 = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) { + Value *PtrSrc = PtrToIntOp1->getOperand(0); + if (PtrSrc->getType()->getPointerAddressSpace() == + Op0Src->getType()->getPointerAddressSpace()) { + NewOp1 = PtrToIntOp1->getOperand(0); + // If the pointer types don't match, insert a bitcast. + if (Op0Src->getType() != NewOp1->getType()) + NewOp1 = Builder.CreateBitCast(NewOp1, Op0Src->getType()); + } + } else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) { + NewOp1 = ConstantExpr::getIntToPtr(RHSC, SrcTy); + } + + if (NewOp1) + return new ICmpInst(ICmp.getPredicate(), Op0Src, NewOp1); + } + + return foldICmpWithZextOrSext(ICmp, Builder); +} + +static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS) { + switch (BinaryOp) { + default: + llvm_unreachable("Unsupported binary op"); + case Instruction::Add: + case Instruction::Sub: + return match(RHS, m_Zero()); + case Instruction::Mul: + return match(RHS, m_One()); + } +} + +OverflowResult InstCombiner::computeOverflow( + Instruction::BinaryOps BinaryOp, bool IsSigned, + Value *LHS, Value *RHS, Instruction *CxtI) const { + switch (BinaryOp) { + default: + llvm_unreachable("Unsupported binary op"); + case Instruction::Add: + if (IsSigned) + return computeOverflowForSignedAdd(LHS, RHS, CxtI); + else + return computeOverflowForUnsignedAdd(LHS, RHS, CxtI); + case Instruction::Sub: + if (IsSigned) + return computeOverflowForSignedSub(LHS, RHS, CxtI); + else + return computeOverflowForUnsignedSub(LHS, RHS, CxtI); + case Instruction::Mul: + if (IsSigned) + return computeOverflowForSignedMul(LHS, RHS, CxtI); + else + return computeOverflowForUnsignedMul(LHS, RHS, CxtI); + } +} + +bool InstCombiner::OptimizeOverflowCheck( + Instruction::BinaryOps BinaryOp, bool IsSigned, Value *LHS, Value *RHS, + Instruction &OrigI, Value *&Result, Constant *&Overflow) { + if (OrigI.isCommutative() && isa<Constant>(LHS) && !isa<Constant>(RHS)) + std::swap(LHS, RHS); + + // If the overflow check was an add followed by a compare, the insertion point + // may be pointing to the compare. We want to insert the new instructions + // before the add in case there are uses of the add between the add and the + // compare. 
+ Builder.SetInsertPoint(&OrigI); + + if (isNeutralValue(BinaryOp, RHS)) { + Result = LHS; + Overflow = Builder.getFalse(); + return true; + } + + switch (computeOverflow(BinaryOp, IsSigned, LHS, RHS, &OrigI)) { + case OverflowResult::MayOverflow: + return false; + case OverflowResult::AlwaysOverflowsLow: + case OverflowResult::AlwaysOverflowsHigh: + Result = Builder.CreateBinOp(BinaryOp, LHS, RHS); + Result->takeName(&OrigI); + Overflow = Builder.getTrue(); + return true; + case OverflowResult::NeverOverflows: + Result = Builder.CreateBinOp(BinaryOp, LHS, RHS); + Result->takeName(&OrigI); + Overflow = Builder.getFalse(); + if (auto *Inst = dyn_cast<Instruction>(Result)) { + if (IsSigned) + Inst->setHasNoSignedWrap(); + else + Inst->setHasNoUnsignedWrap(); + } + return true; + } + + llvm_unreachable("Unexpected overflow result"); +} + +/// Recognize and process idiom involving test for multiplication +/// overflow. +/// +/// The caller has matched a pattern of the form: +/// I = cmp u (mul(zext A, zext B), V +/// The function checks if this is a test for overflow and if so replaces +/// multiplication with call to 'mul.with.overflow' intrinsic. +/// +/// \param I Compare instruction. +/// \param MulVal Result of 'mult' instruction. It is one of the arguments of +/// the compare instruction. Must be of integer type. +/// \param OtherVal The other argument of compare instruction. +/// \returns Instruction which must replace the compare instruction, NULL if no +/// replacement required. +static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, + Value *OtherVal, InstCombiner &IC) { + // Don't bother doing this transformation for pointers, don't do it for + // vectors. + if (!isa<IntegerType>(MulVal->getType())) + return nullptr; + + assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal); + assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal); + auto *MulInstr = dyn_cast<Instruction>(MulVal); + if (!MulInstr) + return nullptr; + assert(MulInstr->getOpcode() == Instruction::Mul); + + auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)), + *RHS = cast<ZExtOperator>(MulInstr->getOperand(1)); + assert(LHS->getOpcode() == Instruction::ZExt); + assert(RHS->getOpcode() == Instruction::ZExt); + Value *A = LHS->getOperand(0), *B = RHS->getOperand(0); + + // Calculate type and width of the result produced by mul.with.overflow. + Type *TyA = A->getType(), *TyB = B->getType(); + unsigned WidthA = TyA->getPrimitiveSizeInBits(), + WidthB = TyB->getPrimitiveSizeInBits(); + unsigned MulWidth; + Type *MulType; + if (WidthB > WidthA) { + MulWidth = WidthB; + MulType = TyB; + } else { + MulWidth = WidthA; + MulType = TyA; + } + + // In order to replace the original mul with a narrower mul.with.overflow, + // all uses must ignore upper bits of the product. The number of used low + // bits must be not greater than the width of mul.with.overflow. + if (MulVal->hasNUsesOrMore(2)) + for (User *U : MulVal->users()) { + if (U == &I) + continue; + if (TruncInst *TI = dyn_cast<TruncInst>(U)) { + // Check if truncation ignores bits above MulWidth. + unsigned TruncWidth = TI->getType()->getPrimitiveSizeInBits(); + if (TruncWidth > MulWidth) + return nullptr; + } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) { + // Check if AND ignores bits above MulWidth. 
+ if (BO->getOpcode() != Instruction::And) + return nullptr; + if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) { + const APInt &CVal = CI->getValue(); + if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth) + return nullptr; + } else { + // In this case we could have the operand of the binary operation + // being defined in another block, and performing the replacement + // could break the dominance relation. + return nullptr; + } + } else { + // Other uses prohibit this transformation. + return nullptr; + } + } + + // Recognize patterns + switch (I.getPredicate()) { + case ICmpInst::ICMP_EQ: + case ICmpInst::ICMP_NE: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp eq/neq mulval, zext trunc mulval + if (ZExtInst *Zext = dyn_cast<ZExtInst>(OtherVal)) + if (Zext->hasOneUse()) { + Value *ZextArg = Zext->getOperand(0); + if (TruncInst *Trunc = dyn_cast<TruncInst>(ZextArg)) + if (Trunc->getType()->getPrimitiveSizeInBits() == MulWidth) + break; //Recognized + } + + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp eq/neq mulval, and(mulval, mask), mask selects low MulWidth bits. + ConstantInt *CI; + Value *ValToMask; + if (match(OtherVal, m_And(m_Value(ValToMask), m_ConstantInt(CI)))) { + if (ValToMask != MulVal) + return nullptr; + const APInt &CVal = CI->getValue() + 1; + if (CVal.isPowerOf2()) { + unsigned MaskWidth = CVal.logBase2(); + if (MaskWidth == MulWidth) + break; // Recognized + } + } + return nullptr; + + case ICmpInst::ICMP_UGT: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp ugt mulval, max + if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) { + APInt MaxVal = APInt::getMaxValue(MulWidth); + MaxVal = MaxVal.zext(CI->getBitWidth()); + if (MaxVal.eq(CI->getValue())) + break; // Recognized + } + return nullptr; + + case ICmpInst::ICMP_UGE: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp uge mulval, max+1 + if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) { + APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth); + if (MaxVal.eq(CI->getValue())) + break; // Recognized + } + return nullptr; + + case ICmpInst::ICMP_ULE: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp ule mulval, max + if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) { + APInt MaxVal = APInt::getMaxValue(MulWidth); + MaxVal = MaxVal.zext(CI->getBitWidth()); + if (MaxVal.eq(CI->getValue())) + break; // Recognized + } + return nullptr; + + case ICmpInst::ICMP_ULT: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp ule mulval, max + 1 + if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) { + APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth); + if (MaxVal.eq(CI->getValue())) + break; // Recognized + } + return nullptr; + + default: + return nullptr; + } + + InstCombiner::BuilderTy &Builder = IC.Builder; + Builder.SetInsertPoint(MulInstr); + + // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B) + Value *MulA = A, *MulB = B; + if (WidthA < MulWidth) + MulA = Builder.CreateZExt(A, MulType); + if (WidthB < MulWidth) + MulB = Builder.CreateZExt(B, MulType); + Function *F = Intrinsic::getDeclaration( + I.getModule(), Intrinsic::umul_with_overflow, MulType); + CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul"); + IC.Worklist.Add(MulInstr); + + // If there are uses of mul result other than the comparison, we know that + // they are truncation or binary AND. Change them to use result of + // mul.with.overflow and adjust properly mask/size. 
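+  // For instance (illustrative, MulWidth == 16): a user such as
+  // (and i32 %mulval, 65535) becomes a zext to i32 of
+  // (and i16 %umul.value, 65535).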
+ if (MulVal->hasNUsesOrMore(2)) { + Value *Mul = Builder.CreateExtractValue(Call, 0, "umul.value"); + for (auto UI = MulVal->user_begin(), UE = MulVal->user_end(); UI != UE;) { + User *U = *UI++; + if (U == &I || U == OtherVal) + continue; + if (TruncInst *TI = dyn_cast<TruncInst>(U)) { + if (TI->getType()->getPrimitiveSizeInBits() == MulWidth) + IC.replaceInstUsesWith(*TI, Mul); + else + TI->setOperand(0, Mul); + } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) { + assert(BO->getOpcode() == Instruction::And); + // Replace (mul & mask) --> zext (mul.with.overflow & short_mask) + ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1)); + APInt ShortMask = CI->getValue().trunc(MulWidth); + Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask); + Instruction *Zext = + cast<Instruction>(Builder.CreateZExt(ShortAnd, BO->getType())); + IC.Worklist.Add(Zext); + IC.replaceInstUsesWith(*BO, Zext); + } else { + llvm_unreachable("Unexpected Binary operation"); + } + IC.Worklist.Add(cast<Instruction>(U)); + } + } + if (isa<Instruction>(OtherVal)) + IC.Worklist.Add(cast<Instruction>(OtherVal)); + + // The original icmp gets replaced with the overflow value, maybe inverted + // depending on predicate. + bool Inverse = false; + switch (I.getPredicate()) { + case ICmpInst::ICMP_NE: + break; + case ICmpInst::ICMP_EQ: + Inverse = true; + break; + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: + if (I.getOperand(0) == MulVal) + break; + Inverse = true; + break; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: + if (I.getOperand(1) == MulVal) + break; + Inverse = true; + break; + default: + llvm_unreachable("Unexpected predicate"); + } + if (Inverse) { + Value *Res = Builder.CreateExtractValue(Call, 1); + return BinaryOperator::CreateNot(Res); + } + + return ExtractValueInst::Create(Call, 1); +} + +/// When performing a comparison against a constant, it is possible that not all +/// the bits in the LHS are demanded. This helper method computes the mask that +/// IS demanded. +static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth) { + const APInt *RHS; + if (!match(I.getOperand(1), m_APInt(RHS))) + return APInt::getAllOnesValue(BitWidth); + + // If this is a normal comparison, it demands all bits. If it is a sign bit + // comparison, it only demands the sign bit. + bool UnusedBit; + if (isSignBitCheck(I.getPredicate(), *RHS, UnusedBit)) + return APInt::getSignMask(BitWidth); + + switch (I.getPredicate()) { + // For a UGT comparison, we don't care about any bits that + // correspond to the trailing ones of the comparand. The value of these + // bits doesn't impact the outcome of the comparison, because any value + // greater than the RHS must differ in a bit higher than these due to carry. + case ICmpInst::ICMP_UGT: + return APInt::getBitsSetFrom(BitWidth, RHS->countTrailingOnes()); + + // Similarly, for a ULT comparison, we don't care about the trailing zeros. + // Any value less than the RHS must differ in a higher bit because of carries. + case ICmpInst::ICMP_ULT: + return APInt::getBitsSetFrom(BitWidth, RHS->countTrailingZeros()); + + default: + return APInt::getAllOnesValue(BitWidth); + } +} + +/// Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst +/// should be swapped. +/// The decision is based on how many times these two operands are reused +/// as subtract operands and their positions in those instructions. +/// The rationale is that several architectures use the same instruction for +/// both subtract and cmp. 
Thus, it is better if the order of those operands +/// match. +/// \return true if Op0 and Op1 should be swapped. +static bool swapMayExposeCSEOpportunities(const Value *Op0, const Value *Op1) { + // Filter out pointer values as those cannot appear directly in subtract. + // FIXME: we may want to go through inttoptrs or bitcasts. + if (Op0->getType()->isPointerTy()) + return false; + // If a subtract already has the same operands as a compare, swapping would be + // bad. If a subtract has the same operands as a compare but in reverse order, + // then swapping is good. + int GoodToSwap = 0; + for (const User *U : Op0->users()) { + if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0)))) + GoodToSwap++; + else if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1)))) + GoodToSwap--; + } + return GoodToSwap > 0; +} + +/// Check that one use is in the same block as the definition and all +/// other uses are in blocks dominated by a given block. +/// +/// \param DI Definition +/// \param UI Use +/// \param DB Block that must dominate all uses of \p DI outside +/// the parent block +/// \return true when \p UI is the only use of \p DI in the parent block +/// and all other uses of \p DI are in blocks dominated by \p DB. +/// +bool InstCombiner::dominatesAllUses(const Instruction *DI, + const Instruction *UI, + const BasicBlock *DB) const { + assert(DI && UI && "Instruction not defined\n"); + // Ignore incomplete definitions. + if (!DI->getParent()) + return false; + // DI and UI must be in the same block. + if (DI->getParent() != UI->getParent()) + return false; + // Protect from self-referencing blocks. + if (DI->getParent() == DB) + return false; + for (const User *U : DI->users()) { + auto *Usr = cast<Instruction>(U); + if (Usr != UI && !DT.dominates(DB, Usr->getParent())) + return false; + } + return true; +} + +/// Return true when the instruction sequence within a block is select-cmp-br. +static bool isChainSelectCmpBranch(const SelectInst *SI) { + const BasicBlock *BB = SI->getParent(); + if (!BB) + return false; + auto *BI = dyn_cast_or_null<BranchInst>(BB->getTerminator()); + if (!BI || BI->getNumSuccessors() != 2) + return false; + auto *IC = dyn_cast<ICmpInst>(BI->getCondition()); + if (!IC || (IC->getOperand(0) != SI && IC->getOperand(1) != SI)) + return false; + return true; +} + +/// True when a select result is replaced by one of its operands +/// in select-icmp sequence. This will eventually result in the elimination +/// of the select. +/// +/// \param SI Select instruction +/// \param Icmp Compare instruction +/// \param SIOpd Operand that replaces the select +/// +/// Notes: +/// - The replacement is global and requires dominator information +/// - The caller is responsible for the actual replacement +/// +/// Example: +/// +/// entry: +/// %4 = select i1 %3, %C* %0, %C* null +/// %5 = icmp eq %C* %4, null +/// br i1 %5, label %9, label %7 +/// ... +/// ; <label>:7 ; preds = %entry +/// %8 = getelementptr inbounds %C* %4, i64 0, i32 0 +/// ... +/// +/// can be transformed to +/// +/// %5 = icmp eq %C* %0, null +/// %6 = select i1 %3, i1 %5, i1 true +/// br i1 %6, label %9, label %7 +/// ... +/// ; <label>:7 ; preds = %entry +/// %8 = getelementptr inbounds %C* %0, i64 0, i32 0 // replace by %0! +/// +/// Similar when the first operand of the select is a constant or/and +/// the compare is for not equal rather than equal. +/// +/// NOTE: The function is only called when the select and compare constants +/// are equal, the optimization can work only for EQ predicates. 
This is not a +/// major restriction since a NE compare should be 'normalized' to an equal +/// compare, which usually happens in the combiner and test case +/// select-cmp-br.ll checks for it. +bool InstCombiner::replacedSelectWithOperand(SelectInst *SI, + const ICmpInst *Icmp, + const unsigned SIOpd) { + assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!"); + if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) { + BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1); + // The check for the single predecessor is not the best that can be + // done. But it protects efficiently against cases like when SI's + // home block has two successors, Succ and Succ1, and Succ1 predecessor + // of Succ. Then SI can't be replaced by SIOpd because the use that gets + // replaced can be reached on either path. So the uniqueness check + // guarantees that the path all uses of SI (outside SI's parent) are on + // is disjoint from all other paths out of SI. But that information + // is more expensive to compute, and the trade-off here is in favor + // of compile-time. It should also be noticed that we check for a single + // predecessor and not only uniqueness. This to handle the situation when + // Succ and Succ1 points to the same basic block. + if (Succ->getSinglePredecessor() && dominatesAllUses(SI, Icmp, Succ)) { + NumSel++; + SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SI->getParent()); + return true; + } + } + return false; +} + +/// Try to fold the comparison based on range information we can get by checking +/// whether bits are known to be zero or one in the inputs. +Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + Type *Ty = Op0->getType(); + ICmpInst::Predicate Pred = I.getPredicate(); + + // Get scalar or pointer size. + unsigned BitWidth = Ty->isIntOrIntVectorTy() + ? Ty->getScalarSizeInBits() + : DL.getIndexTypeSizeInBits(Ty->getScalarType()); + + if (!BitWidth) + return nullptr; + + KnownBits Op0Known(BitWidth); + KnownBits Op1Known(BitWidth); + + if (SimplifyDemandedBits(&I, 0, + getDemandedBitsLHSMask(I, BitWidth), + Op0Known, 0)) + return &I; + + if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth), + Op1Known, 0)) + return &I; + + // Given the known and unknown bits, compute a range that the LHS could be + // in. Compute the Min, Max and RHS values based on the known bits. For the + // EQ and NE we use unsigned values. + APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0); + APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0); + if (I.isSigned()) { + computeSignedMinMaxValuesFromKnownBits(Op0Known, Op0Min, Op0Max); + computeSignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max); + } else { + computeUnsignedMinMaxValuesFromKnownBits(Op0Known, Op0Min, Op0Max); + computeUnsignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max); + } + + // If Min and Max are known to be the same, then SimplifyDemandedBits figured + // out that the LHS or RHS is a constant. Constant fold this now, so that + // code below can assume that Min != Max. + if (!isa<Constant>(Op0) && Op0Min == Op0Max) + return new ICmpInst(Pred, ConstantExpr::getIntegerValue(Ty, Op0Min), Op1); + if (!isa<Constant>(Op1) && Op1Min == Op1Max) + return new ICmpInst(Pred, Op0, ConstantExpr::getIntegerValue(Ty, Op1Min)); + + // Based on the range information we know about the LHS, see if we can + // simplify this comparison. For example, (x&4) < 8 is always true. 
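+ // As a concrete sketch of that example: if Op0 is known to be (x & 4), its
+ // unsigned range is [0, 4], so for "(x & 4) <u 8" we get Op0Max (4) u<
+ // Op1Min (8) and the compare folds to true. The per-predicate cases below
+ // apply the same min/max reasoning.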
+ switch (Pred) { + default: + llvm_unreachable("Unknown icmp opcode!"); + case ICmpInst::ICMP_EQ: + case ICmpInst::ICMP_NE: { + if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max)) { + return Pred == CmpInst::ICMP_EQ + ? replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())) + : replaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); + } + + // If all bits are known zero except for one, then we know at most one bit + // is set. If the comparison is against zero, then this is a check to see if + // *that* bit is set. + APInt Op0KnownZeroInverted = ~Op0Known.Zero; + if (Op1Known.isZero()) { + // If the LHS is an AND with the same constant, look through it. + Value *LHS = nullptr; + const APInt *LHSC; + if (!match(Op0, m_And(m_Value(LHS), m_APInt(LHSC))) || + *LHSC != Op0KnownZeroInverted) + LHS = Op0; + + Value *X; + if (match(LHS, m_Shl(m_One(), m_Value(X)))) { + APInt ValToCheck = Op0KnownZeroInverted; + Type *XTy = X->getType(); + if (ValToCheck.isPowerOf2()) { + // ((1 << X) & 8) == 0 -> X != 3 + // ((1 << X) & 8) != 0 -> X == 3 + auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros()); + auto NewPred = ICmpInst::getInversePredicate(Pred); + return new ICmpInst(NewPred, X, CmpC); + } else if ((++ValToCheck).isPowerOf2()) { + // ((1 << X) & 7) == 0 -> X >= 3 + // ((1 << X) & 7) != 0 -> X < 3 + auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros()); + auto NewPred = + Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGE : CmpInst::ICMP_ULT; + return new ICmpInst(NewPred, X, CmpC); + } + } + + // Check if the LHS is 8 >>u x and the result is a power of 2 like 1. + const APInt *CI; + if (Op0KnownZeroInverted.isOneValue() && + match(LHS, m_LShr(m_Power2(CI), m_Value(X)))) { + // ((8 >>u X) & 1) == 0 -> X != 3 + // ((8 >>u X) & 1) != 0 -> X == 3 + unsigned CmpVal = CI->countTrailingZeros(); + auto NewPred = ICmpInst::getInversePredicate(Pred); + return new ICmpInst(NewPred, X, ConstantInt::get(X->getType(), CmpVal)); + } + } + break; + } + case ICmpInst::ICMP_ULT: { + if (Op0Max.ult(Op1Min)) // A <u B -> true if max(A) < min(B) + return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); + if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B) + return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); + if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B) + return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1); + + const APInt *CmpC; + if (match(Op1, m_APInt(CmpC))) { + // A <u C -> A == C-1 if min(A)+1 == C + if (*CmpC == Op0Min + 1) + return new ICmpInst(ICmpInst::ICMP_EQ, Op0, + ConstantInt::get(Op1->getType(), *CmpC - 1)); + // X <u C --> X == 0, if the number of zero bits in the bottom of X + // exceeds the log2 of C. 
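+ // For example, if X is known to be a multiple of 8 (at least 3 trailing
+ // zero bits), then "X <u 6" implies X == 0: ceilLogBase2(6) == 3 and the
+ // smallest non-zero value X could take is 8.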
+ if (Op0Known.countMinTrailingZeros() >= CmpC->ceilLogBase2()) + return new ICmpInst(ICmpInst::ICMP_EQ, Op0, + Constant::getNullValue(Op1->getType())); + } + break; + } + case ICmpInst::ICMP_UGT: { + if (Op0Min.ugt(Op1Max)) // A >u B -> true if min(A) > max(B) + return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); + if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= max(B) + return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); + if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B) + return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1); + + const APInt *CmpC; + if (match(Op1, m_APInt(CmpC))) { + // A >u C -> A == C+1 if max(a)-1 == C + if (*CmpC == Op0Max - 1) + return new ICmpInst(ICmpInst::ICMP_EQ, Op0, + ConstantInt::get(Op1->getType(), *CmpC + 1)); + // X >u C --> X != 0, if the number of zero bits in the bottom of X + // exceeds the log2 of C. + if (Op0Known.countMinTrailingZeros() >= CmpC->getActiveBits()) + return new ICmpInst(ICmpInst::ICMP_NE, Op0, + Constant::getNullValue(Op1->getType())); + } + break; + } + case ICmpInst::ICMP_SLT: { + if (Op0Max.slt(Op1Min)) // A <s B -> true if max(A) < min(C) + return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); + if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(C) + return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); + if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B) + return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1); + const APInt *CmpC; + if (match(Op1, m_APInt(CmpC))) { + if (*CmpC == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C + return new ICmpInst(ICmpInst::ICMP_EQ, Op0, + ConstantInt::get(Op1->getType(), *CmpC - 1)); + } + break; + } + case ICmpInst::ICMP_SGT: { + if (Op0Min.sgt(Op1Max)) // A >s B -> true if min(A) > max(B) + return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); + if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B) + return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); + if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B) + return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1); + const APInt *CmpC; + if (match(Op1, m_APInt(CmpC))) { + if (*CmpC == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C + return new ICmpInst(ICmpInst::ICMP_EQ, Op0, + ConstantInt::get(Op1->getType(), *CmpC + 1)); + } + break; + } + case ICmpInst::ICMP_SGE: + assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!"); + if (Op0Min.sge(Op1Max)) // A >=s B -> true if min(A) >= max(B) + return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); + if (Op0Max.slt(Op1Min)) // A >=s B -> false if max(A) < min(B) + return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); + if (Op1Min == Op0Max) // A >=s B -> A == B if max(A) == min(B) + return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1); + break; + case ICmpInst::ICMP_SLE: + assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!"); + if (Op0Max.sle(Op1Min)) // A <=s B -> true if max(A) <= min(B) + return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); + if (Op0Min.sgt(Op1Max)) // A <=s B -> false if min(A) > max(B) + return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); + if (Op1Max == Op0Min) // A <=s B -> A == B if min(A) == max(B) + return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1); + break; + case ICmpInst::ICMP_UGE: + assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!"); + if (Op0Min.uge(Op1Max)) // A >=u B -> true if min(A) >= max(B) + return replaceInstUsesWith(I, 
ConstantInt::getTrue(I.getType())); + if (Op0Max.ult(Op1Min)) // A >=u B -> false if max(A) < min(B) + return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); + if (Op1Min == Op0Max) // A >=u B -> A == B if max(A) == min(B) + return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1); + break; + case ICmpInst::ICMP_ULE: + assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!"); + if (Op0Max.ule(Op1Min)) // A <=u B -> true if max(A) <= min(B) + return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); + if (Op0Min.ugt(Op1Max)) // A <=u B -> false if min(A) > max(B) + return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); + if (Op1Max == Op0Min) // A <=u B -> A == B if min(A) == max(B) + return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1); + break; + } + + // Turn a signed comparison into an unsigned one if both operands are known to + // have the same sign. + if (I.isSigned() && + ((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) || + (Op0Known.One.isNegative() && Op1Known.One.isNegative()))) + return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1); + + return nullptr; +} + +llvm::Optional<std::pair<CmpInst::Predicate, Constant *>> +llvm::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, + Constant *C) { + assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) && + "Only for relational integer predicates."); + + Type *Type = C->getType(); + bool IsSigned = ICmpInst::isSigned(Pred); + + CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred); + bool WillIncrement = + UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT; + + // Check if the constant operand can be safely incremented/decremented + // without overflowing/underflowing. + auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) { + return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned); + }; + + if (auto *CI = dyn_cast<ConstantInt>(C)) { + // Bail out if the constant can't be safely incremented/decremented. + if (!ConstantIsOk(CI)) + return llvm::None; + } else if (Type->isVectorTy()) { + unsigned NumElts = Type->getVectorNumElements(); + for (unsigned i = 0; i != NumElts; ++i) { + Constant *Elt = C->getAggregateElement(i); + if (!Elt) + return llvm::None; + + if (isa<UndefValue>(Elt)) + continue; + + // Bail out if we can't determine if this constant is min/max or if we + // know that this constant is min/max. + auto *CI = dyn_cast<ConstantInt>(Elt); + if (!CI || !ConstantIsOk(CI)) + return llvm::None; + } + } else { + // ConstantExpr? + return llvm::None; + } + + CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred); + + // Increment or decrement the constant. + Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true); + Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne); + + return std::make_pair(NewPred, NewC); +} + +/// If we have an icmp le or icmp ge instruction with a constant operand, turn +/// it into the appropriate icmp lt or icmp gt instruction. This transform +/// allows them to be folded in visitICmpInst. 
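+/// For example (an illustrative sketch, not an exhaustive list):
+///   icmp sle i32 %x, 5   --> icmp slt i32 %x, 6
+///   icmp uge i32 %x, 16  --> icmp ugt i32 %x, 15
+/// The rewrite is skipped when incrementing/decrementing the constant would
+/// overflow or underflow.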
+static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { + ICmpInst::Predicate Pred = I.getPredicate(); + if (ICmpInst::isEquality(Pred) || !ICmpInst::isIntPredicate(Pred) || + isCanonicalPredicate(Pred)) + return nullptr; + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + auto *Op1C = dyn_cast<Constant>(Op1); + if (!Op1C) + return nullptr; + + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, Op1C); + if (!FlippedStrictness) + return nullptr; + + return new ICmpInst(FlippedStrictness->first, Op0, FlippedStrictness->second); +} + +/// Integer compare with boolean values can always be turned into bitwise ops. +static Instruction *canonicalizeICmpBool(ICmpInst &I, + InstCombiner::BuilderTy &Builder) { + Value *A = I.getOperand(0), *B = I.getOperand(1); + assert(A->getType()->isIntOrIntVectorTy(1) && "Bools only"); + + // A boolean compared to true/false can be simplified to Op0/true/false in + // 14 out of the 20 (10 predicates * 2 constants) possible combinations. + // Cases not handled by InstSimplify are always 'not' of Op0. + if (match(B, m_Zero())) { + switch (I.getPredicate()) { + case CmpInst::ICMP_EQ: // A == 0 -> !A + case CmpInst::ICMP_ULE: // A <=u 0 -> !A + case CmpInst::ICMP_SGE: // A >=s 0 -> !A + return BinaryOperator::CreateNot(A); + default: + llvm_unreachable("ICmp i1 X, C not simplified as expected."); + } + } else if (match(B, m_One())) { + switch (I.getPredicate()) { + case CmpInst::ICMP_NE: // A != 1 -> !A + case CmpInst::ICMP_ULT: // A <u 1 -> !A + case CmpInst::ICMP_SGT: // A >s -1 -> !A + return BinaryOperator::CreateNot(A); + default: + llvm_unreachable("ICmp i1 X, C not simplified as expected."); + } + } + + switch (I.getPredicate()) { + default: + llvm_unreachable("Invalid icmp instruction!"); + case ICmpInst::ICMP_EQ: + // icmp eq i1 A, B -> ~(A ^ B) + return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); + + case ICmpInst::ICMP_NE: + // icmp ne i1 A, B -> A ^ B + return BinaryOperator::CreateXor(A, B); + + case ICmpInst::ICMP_UGT: + // icmp ugt -> icmp ult + std::swap(A, B); + LLVM_FALLTHROUGH; + case ICmpInst::ICMP_ULT: + // icmp ult i1 A, B -> ~A & B + return BinaryOperator::CreateAnd(Builder.CreateNot(A), B); + + case ICmpInst::ICMP_SGT: + // icmp sgt -> icmp slt + std::swap(A, B); + LLVM_FALLTHROUGH; + case ICmpInst::ICMP_SLT: + // icmp slt i1 A, B -> A & ~B + return BinaryOperator::CreateAnd(Builder.CreateNot(B), A); + + case ICmpInst::ICMP_UGE: + // icmp uge -> icmp ule + std::swap(A, B); + LLVM_FALLTHROUGH; + case ICmpInst::ICMP_ULE: + // icmp ule i1 A, B -> ~A | B + return BinaryOperator::CreateOr(Builder.CreateNot(A), B); + + case ICmpInst::ICMP_SGE: + // icmp sge -> icmp sle + std::swap(A, B); + LLVM_FALLTHROUGH; + case ICmpInst::ICMP_SLE: + // icmp sle i1 A, B -> A | ~B + return BinaryOperator::CreateOr(Builder.CreateNot(B), A); + } +} + +// Transform pattern like: +// (1 << Y) u<= X or ~(-1 << Y) u< X or ((1 << Y)+(-1)) u< X +// (1 << Y) u> X or ~(-1 << Y) u>= X or ((1 << Y)+(-1)) u>= X +// Into: +// (X l>> Y) != 0 +// (X l>> Y) == 0 +static Instruction *foldICmpWithHighBitMask(ICmpInst &Cmp, + InstCombiner::BuilderTy &Builder) { + ICmpInst::Predicate Pred, NewPred; + Value *X, *Y; + if (match(&Cmp, + m_c_ICmp(Pred, m_OneUse(m_Shl(m_One(), m_Value(Y))), m_Value(X)))) { + // We want X to be the icmp's second operand, so swap predicate if it isn't. 
+ if (Cmp.getOperand(0) == X) + Pred = Cmp.getSwappedPredicate(); + + switch (Pred) { + case ICmpInst::ICMP_ULE: + NewPred = ICmpInst::ICMP_NE; + break; + case ICmpInst::ICMP_UGT: + NewPred = ICmpInst::ICMP_EQ; + break; + default: + return nullptr; + } + } else if (match(&Cmp, m_c_ICmp(Pred, + m_OneUse(m_CombineOr( + m_Not(m_Shl(m_AllOnes(), m_Value(Y))), + m_Add(m_Shl(m_One(), m_Value(Y)), + m_AllOnes()))), + m_Value(X)))) { + // The variant with 'add' is not canonical, (the variant with 'not' is) + // we only get it because it has extra uses, and can't be canonicalized, + + // We want X to be the icmp's second operand, so swap predicate if it isn't. + if (Cmp.getOperand(0) == X) + Pred = Cmp.getSwappedPredicate(); + + switch (Pred) { + case ICmpInst::ICMP_ULT: + NewPred = ICmpInst::ICMP_NE; + break; + case ICmpInst::ICMP_UGE: + NewPred = ICmpInst::ICMP_EQ; + break; + default: + return nullptr; + } + } else + return nullptr; + + Value *NewX = Builder.CreateLShr(X, Y, X->getName() + ".highbits"); + Constant *Zero = Constant::getNullValue(NewX->getType()); + return CmpInst::Create(Instruction::ICmp, NewPred, NewX, Zero); +} + +static Instruction *foldVectorCmp(CmpInst &Cmp, + InstCombiner::BuilderTy &Builder) { + // If both arguments of the cmp are shuffles that use the same mask and + // shuffle within a single vector, move the shuffle after the cmp. + Value *LHS = Cmp.getOperand(0), *RHS = Cmp.getOperand(1); + Value *V1, *V2; + Constant *M; + if (match(LHS, m_ShuffleVector(m_Value(V1), m_Undef(), m_Constant(M))) && + match(RHS, m_ShuffleVector(m_Value(V2), m_Undef(), m_Specific(M))) && + V1->getType() == V2->getType() && + (LHS->hasOneUse() || RHS->hasOneUse())) { + // cmp (shuffle V1, M), (shuffle V2, M) --> shuffle (cmp V1, V2), M + CmpInst::Predicate P = Cmp.getPredicate(); + Value *NewCmp = isa<ICmpInst>(Cmp) ? Builder.CreateICmp(P, V1, V2) + : Builder.CreateFCmp(P, V1, V2); + return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()), M); + } + return nullptr; +} + +Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { + bool Changed = false; + const SimplifyQuery Q = SQ.getWithInstruction(&I); + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + unsigned Op0Cplxity = getComplexity(Op0); + unsigned Op1Cplxity = getComplexity(Op1); + + /// Orders the operands of the compare so that they are listed from most + /// complex to least complex. This puts constants before unary operators, + /// before binary operators. 
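+ // For instance, "icmp ugt i32 7, %x" (constant on the left, %x not a
+ // constant) is swapped to the equivalent "icmp ult i32 %x, 7", so the folds
+ // below only need to match the constant-on-the-right form.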
+ if (Op0Cplxity < Op1Cplxity || + (Op0Cplxity == Op1Cplxity && swapMayExposeCSEOpportunities(Op0, Op1))) { + I.swapOperands(); + std::swap(Op0, Op1); + Changed = true; + } + + if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, Q)) + return replaceInstUsesWith(I, V); + + // Comparing -val or val with non-zero is the same as just comparing val + // ie, abs(val) != 0 -> val != 0 + if (I.getPredicate() == ICmpInst::ICMP_NE && match(Op1, m_Zero())) { + Value *Cond, *SelectTrue, *SelectFalse; + if (match(Op0, m_Select(m_Value(Cond), m_Value(SelectTrue), + m_Value(SelectFalse)))) { + if (Value *V = dyn_castNegVal(SelectTrue)) { + if (V == SelectFalse) + return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1); + } + else if (Value *V = dyn_castNegVal(SelectFalse)) { + if (V == SelectTrue) + return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1); + } + } + } + + if (Op0->getType()->isIntOrIntVectorTy(1)) + if (Instruction *Res = canonicalizeICmpBool(I, Builder)) + return Res; + + if (ICmpInst *NewICmp = canonicalizeCmpWithConstant(I)) + return NewICmp; + + if (Instruction *Res = foldICmpWithConstant(I)) + return Res; + + if (Instruction *Res = foldICmpWithDominatingICmp(I)) + return Res; + + if (Instruction *Res = foldICmpBinOp(I, Q)) + return Res; + + if (Instruction *Res = foldICmpUsingKnownBits(I)) + return Res; + + // Test if the ICmpInst instruction is used exclusively by a select as + // part of a minimum or maximum operation. If so, refrain from doing + // any other folding. This helps out other analyses which understand + // non-obfuscated minimum and maximum idioms, such as ScalarEvolution + // and CodeGen. And in this case, at least one of the comparison + // operands has at least one user besides the compare (the select), + // which would often largely negate the benefit of folding anyway. + // + // Do the same for the other patterns recognized by matchSelectPattern. + if (I.hasOneUse()) + if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) { + Value *A, *B; + SelectPatternResult SPR = matchSelectPattern(SI, A, B); + if (SPR.Flavor != SPF_UNKNOWN) + return nullptr; + } + + // Do this after checking for min/max to prevent infinite looping. + if (Instruction *Res = foldICmpWithZero(I)) + return Res; + + // FIXME: We only do this after checking for min/max to prevent infinite + // looping caused by a reverse canonicalization of these patterns for min/max. + // FIXME: The organization of folds is a mess. These would naturally go into + // canonicalizeCmpWithConstant(), but we can't move all of the above folds + // down here after the min/max restriction. + ICmpInst::Predicate Pred = I.getPredicate(); + const APInt *C; + if (match(Op1, m_APInt(C))) { + // For i32: x >u 2147483647 -> x <s 0 -> true if sign bit set + if (Pred == ICmpInst::ICMP_UGT && C->isMaxSignedValue()) { + Constant *Zero = Constant::getNullValue(Op0->getType()); + return new ICmpInst(ICmpInst::ICMP_SLT, Op0, Zero); + } + + // For i32: x <u 2147483648 -> x >s -1 -> true if sign bit clear + if (Pred == ICmpInst::ICMP_ULT && C->isMinSignedValue()) { + Constant *AllOnes = Constant::getAllOnesValue(Op0->getType()); + return new ICmpInst(ICmpInst::ICMP_SGT, Op0, AllOnes); + } + } + + if (Instruction *Res = foldICmpInstWithConstant(I)) + return Res; + + // Try to match comparison as a sign bit test. Intentionally do this after + // foldICmpInstWithConstant() to potentially let other folds to happen first. 
+ if (Instruction *New = foldSignBitTest(I)) + return New; + + if (Instruction *Res = foldICmpInstWithConstantNotInt(I)) + return Res; + + // If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now. + if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0)) + if (Instruction *NI = foldGEPICmp(GEP, Op1, I.getPredicate(), I)) + return NI; + if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1)) + if (Instruction *NI = foldGEPICmp(GEP, Op0, + ICmpInst::getSwappedPredicate(I.getPredicate()), I)) + return NI; + + // Try to optimize equality comparisons against alloca-based pointers. + if (Op0->getType()->isPointerTy() && I.isEquality()) { + assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?"); + if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op0, DL))) + if (Instruction *New = foldAllocaCmp(I, Alloca, Op1)) + return New; + if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op1, DL))) + if (Instruction *New = foldAllocaCmp(I, Alloca, Op0)) + return New; + } + + if (Instruction *Res = foldICmpBitCast(I, Builder)) + return Res; + + if (Instruction *R = foldICmpWithCastOp(I)) + return R; + + if (Instruction *Res = foldICmpWithMinMax(I)) + return Res; + + { + Value *A, *B; + // Transform (A & ~B) == 0 --> (A & B) != 0 + // and (A & ~B) != 0 --> (A & B) == 0 + // if A is a power of 2. + if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_Zero()) && + isKnownToBeAPowerOfTwo(A, false, 0, &I) && I.isEquality()) + return new ICmpInst(I.getInversePredicate(), Builder.CreateAnd(A, B), + Op1); + + // ~X < ~Y --> Y < X + // ~X < C --> X > ~C + if (match(Op0, m_Not(m_Value(A)))) { + if (match(Op1, m_Not(m_Value(B)))) + return new ICmpInst(I.getPredicate(), B, A); + + const APInt *C; + if (match(Op1, m_APInt(C))) + return new ICmpInst(I.getSwappedPredicate(), A, + ConstantInt::get(Op1->getType(), ~(*C))); + } + + Instruction *AddI = nullptr; + if (match(&I, m_UAddWithOverflow(m_Value(A), m_Value(B), + m_Instruction(AddI))) && + isa<IntegerType>(A->getType())) { + Value *Result; + Constant *Overflow; + if (OptimizeOverflowCheck(Instruction::Add, /*Signed*/false, A, B, + *AddI, Result, Overflow)) { + replaceInstUsesWith(*AddI, Result); + return replaceInstUsesWith(I, Overflow); + } + } + + // (zext a) * (zext b) --> llvm.umul.with.overflow. + if (match(Op0, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) { + if (Instruction *R = processUMulZExtIdiom(I, Op0, Op1, *this)) + return R; + } + if (match(Op1, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) { + if (Instruction *R = processUMulZExtIdiom(I, Op1, Op0, *this)) + return R; + } + } + + if (Instruction *Res = foldICmpEquality(I)) + return Res; + + // The 'cmpxchg' instruction returns an aggregate containing the old value and + // an i1 which indicates whether or not we successfully did the swap. + // + // Replace comparisons between the old value and the expected value with the + // indicator that 'cmpxchg' returns. + // + // N.B. This transform is only valid when the 'cmpxchg' is not permitted to + // spuriously fail. In those cases, the old value may equal the expected + // value but it is possible for the swap to not occur. 
+ if (I.getPredicate() == ICmpInst::ICMP_EQ) + if (auto *EVI = dyn_cast<ExtractValueInst>(Op0)) + if (auto *ACXI = dyn_cast<AtomicCmpXchgInst>(EVI->getAggregateOperand())) + if (EVI->getIndices()[0] == 0 && ACXI->getCompareOperand() == Op1 && + !ACXI->isWeak()) + return ExtractValueInst::Create(ACXI, 1); + + { + Value *X; + const APInt *C; + // icmp X+Cst, X + if (match(Op0, m_Add(m_Value(X), m_APInt(C))) && Op1 == X) + return foldICmpAddOpConst(X, *C, I.getPredicate()); + + // icmp X, X+Cst + if (match(Op1, m_Add(m_Value(X), m_APInt(C))) && Op0 == X) + return foldICmpAddOpConst(X, *C, I.getSwappedPredicate()); + } + + if (Instruction *Res = foldICmpWithHighBitMask(I, Builder)) + return Res; + + if (I.getType()->isVectorTy()) + if (Instruction *Res = foldVectorCmp(I, Builder)) + return Res; + + return Changed ? &I : nullptr; +} + +/// Fold fcmp ([us]itofp x, cst) if possible. +Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI, + Constant *RHSC) { + if (!isa<ConstantFP>(RHSC)) return nullptr; + const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF(); + + // Get the width of the mantissa. We don't want to hack on conversions that + // might lose information from the integer, e.g. "i64 -> float" + int MantissaWidth = LHSI->getType()->getFPMantissaWidth(); + if (MantissaWidth == -1) return nullptr; // Unknown. + + IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); + + bool LHSUnsigned = isa<UIToFPInst>(LHSI); + + if (I.isEquality()) { + FCmpInst::Predicate P = I.getPredicate(); + bool IsExact = false; + APSInt RHSCvt(IntTy->getBitWidth(), LHSUnsigned); + RHS.convertToInteger(RHSCvt, APFloat::rmNearestTiesToEven, &IsExact); + + // If the floating point constant isn't an integer value, we know if we will + // ever compare equal / not equal to it. + if (!IsExact) { + // TODO: Can never be -0.0 and other non-representable values + APFloat RHSRoundInt(RHS); + RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven); + if (RHS.compare(RHSRoundInt) != APFloat::cmpEqual) { + if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ) + return replaceInstUsesWith(I, Builder.getFalse()); + + assert(P == FCmpInst::FCMP_ONE || P == FCmpInst::FCMP_UNE); + return replaceInstUsesWith(I, Builder.getTrue()); + } + } + + // TODO: If the constant is exactly representable, is it always OK to do + // equality compares as integer? + } + + // Check to see that the input is converted from an integer type that is small + // enough that preserves all bits. TODO: check here for "known" sign bits. + // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. + unsigned InputSize = IntTy->getScalarSizeInBits(); + + // Following test does NOT adjust InputSize downwards for signed inputs, + // because the most negative value still requires all the mantissa bits + // to distinguish it from one less than that value. + if ((int)InputSize > MantissaWidth) { + // Conversion would lose accuracy. Check if loss can impact comparison. + int Exp = ilogb(RHS); + if (Exp == APFloat::IEK_Inf) { + int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics())); + if (MaxExponent < (int)InputSize - !LHSUnsigned) + // Conversion could create infinity. + return nullptr; + } else { + // Note that if RHS is zero or NaN, then Exp is negative + // and first condition is trivially false. + if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned) + // Conversion could affect comparison. 
+ return nullptr; + } + } + + // Otherwise, we can potentially simplify the comparison. We know that it + // will always come through as an integer value and we know the constant is + // not a NAN (it would have been previously simplified). + assert(!RHS.isNaN() && "NaN comparison not already folded!"); + + ICmpInst::Predicate Pred; + switch (I.getPredicate()) { + default: llvm_unreachable("Unexpected predicate!"); + case FCmpInst::FCMP_UEQ: + case FCmpInst::FCMP_OEQ: + Pred = ICmpInst::ICMP_EQ; + break; + case FCmpInst::FCMP_UGT: + case FCmpInst::FCMP_OGT: + Pred = LHSUnsigned ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_SGT; + break; + case FCmpInst::FCMP_UGE: + case FCmpInst::FCMP_OGE: + Pred = LHSUnsigned ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_SGE; + break; + case FCmpInst::FCMP_ULT: + case FCmpInst::FCMP_OLT: + Pred = LHSUnsigned ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_SLT; + break; + case FCmpInst::FCMP_ULE: + case FCmpInst::FCMP_OLE: + Pred = LHSUnsigned ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_SLE; + break; + case FCmpInst::FCMP_UNE: + case FCmpInst::FCMP_ONE: + Pred = ICmpInst::ICMP_NE; + break; + case FCmpInst::FCMP_ORD: + return replaceInstUsesWith(I, Builder.getTrue()); + case FCmpInst::FCMP_UNO: + return replaceInstUsesWith(I, Builder.getFalse()); + } + + // Now we know that the APFloat is a normal number, zero or inf. + + // See if the FP constant is too large for the integer. For example, + // comparing an i8 to 300.0. + unsigned IntWidth = IntTy->getScalarSizeInBits(); + + if (!LHSUnsigned) { + // If the RHS value is > SignedMax, fold the comparison. This handles +INF + // and large values. + APFloat SMax(RHS.getSemantics()); + SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true, + APFloat::rmNearestTiesToEven); + if (SMax.compare(RHS) == APFloat::cmpLessThan) { // smax < 13123.0 + if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT || + Pred == ICmpInst::ICMP_SLE) + return replaceInstUsesWith(I, Builder.getTrue()); + return replaceInstUsesWith(I, Builder.getFalse()); + } + } else { + // If the RHS value is > UnsignedMax, fold the comparison. This handles + // +INF and large values. + APFloat UMax(RHS.getSemantics()); + UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false, + APFloat::rmNearestTiesToEven); + if (UMax.compare(RHS) == APFloat::cmpLessThan) { // umax < 13123.0 + if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT || + Pred == ICmpInst::ICMP_ULE) + return replaceInstUsesWith(I, Builder.getTrue()); + return replaceInstUsesWith(I, Builder.getFalse()); + } + } + + if (!LHSUnsigned) { + // See if the RHS value is < SignedMin. + APFloat SMin(RHS.getSemantics()); + SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true, + APFloat::rmNearestTiesToEven); + if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0 + if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT || + Pred == ICmpInst::ICMP_SGE) + return replaceInstUsesWith(I, Builder.getTrue()); + return replaceInstUsesWith(I, Builder.getFalse()); + } + } else { + // See if the RHS value is < UnsignedMin. 
+ APFloat SMin(RHS.getSemantics()); + SMin.convertFromAPInt(APInt::getMinValue(IntWidth), true, + APFloat::rmNearestTiesToEven); + if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0 + if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT || + Pred == ICmpInst::ICMP_UGE) + return replaceInstUsesWith(I, Builder.getTrue()); + return replaceInstUsesWith(I, Builder.getFalse()); + } + } + + // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or + // [0, UMAX], but it may still be fractional. See if it is fractional by + // casting the FP value to the integer value and back, checking for equality. + // Don't do this for zero, because -0.0 is not fractional. + Constant *RHSInt = LHSUnsigned + ? ConstantExpr::getFPToUI(RHSC, IntTy) + : ConstantExpr::getFPToSI(RHSC, IntTy); + if (!RHS.isZero()) { + bool Equal = LHSUnsigned + ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC + : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC; + if (!Equal) { + // If we had a comparison against a fractional value, we have to adjust + // the compare predicate and sometimes the value. RHSC is rounded towards + // zero at this point. + switch (Pred) { + default: llvm_unreachable("Unexpected integer comparison!"); + case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true + return replaceInstUsesWith(I, Builder.getTrue()); + case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false + return replaceInstUsesWith(I, Builder.getFalse()); + case ICmpInst::ICMP_ULE: + // (float)int <= 4.4 --> int <= 4 + // (float)int <= -4.4 --> false + if (RHS.isNegative()) + return replaceInstUsesWith(I, Builder.getFalse()); + break; + case ICmpInst::ICMP_SLE: + // (float)int <= 4.4 --> int <= 4 + // (float)int <= -4.4 --> int < -4 + if (RHS.isNegative()) + Pred = ICmpInst::ICMP_SLT; + break; + case ICmpInst::ICMP_ULT: + // (float)int < -4.4 --> false + // (float)int < 4.4 --> int <= 4 + if (RHS.isNegative()) + return replaceInstUsesWith(I, Builder.getFalse()); + Pred = ICmpInst::ICMP_ULE; + break; + case ICmpInst::ICMP_SLT: + // (float)int < -4.4 --> int < -4 + // (float)int < 4.4 --> int <= 4 + if (!RHS.isNegative()) + Pred = ICmpInst::ICMP_SLE; + break; + case ICmpInst::ICMP_UGT: + // (float)int > 4.4 --> int > 4 + // (float)int > -4.4 --> true + if (RHS.isNegative()) + return replaceInstUsesWith(I, Builder.getTrue()); + break; + case ICmpInst::ICMP_SGT: + // (float)int > 4.4 --> int > 4 + // (float)int > -4.4 --> int >= -4 + if (RHS.isNegative()) + Pred = ICmpInst::ICMP_SGE; + break; + case ICmpInst::ICMP_UGE: + // (float)int >= -4.4 --> true + // (float)int >= 4.4 --> int > 4 + if (RHS.isNegative()) + return replaceInstUsesWith(I, Builder.getTrue()); + Pred = ICmpInst::ICMP_UGT; + break; + case ICmpInst::ICMP_SGE: + // (float)int >= -4.4 --> int >= -4 + // (float)int >= 4.4 --> int > 4 + if (!RHS.isNegative()) + Pred = ICmpInst::ICMP_SGT; + break; + } + } + } + + // Lower this FP comparison into an appropriate integer version of the + // comparison. + return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt); +} + +/// Fold (C / X) < 0.0 --> X < 0.0 if possible. Swap predicate if necessary. +static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI, + Constant *RHSC) { + // When C is not 0.0 and infinities are not allowed: + // (C / X) < 0.0 is a sign-bit test of X + // (C / X) < 0.0 --> X < 0.0 (if C is positive) + // (C / X) < 0.0 --> X > 0.0 (if C is negative, swap the predicate) + // + // Proof: + // Multiply (C / X) < 0.0 by X * X / C. 
+ // - X is non zero, if it is the flag 'ninf' is violated. + // - C defines the sign of X * X * C. Thus it also defines whether to swap + // the predicate. C is also non zero by definition. + // + // Thus X * X / C is non zero and the transformation is valid. [qed] + + FCmpInst::Predicate Pred = I.getPredicate(); + + // Check that predicates are valid. + if ((Pred != FCmpInst::FCMP_OGT) && (Pred != FCmpInst::FCMP_OLT) && + (Pred != FCmpInst::FCMP_OGE) && (Pred != FCmpInst::FCMP_OLE)) + return nullptr; + + // Check that RHS operand is zero. + if (!match(RHSC, m_AnyZeroFP())) + return nullptr; + + // Check fastmath flags ('ninf'). + if (!LHSI->hasNoInfs() || !I.hasNoInfs()) + return nullptr; + + // Check the properties of the dividend. It must not be zero to avoid a + // division by zero (see Proof). + const APFloat *C; + if (!match(LHSI->getOperand(0), m_APFloat(C))) + return nullptr; + + if (C->isZero()) + return nullptr; + + // Get swapped predicate if necessary. + if (C->isNegative()) + Pred = I.getSwappedPredicate(); + + return new FCmpInst(Pred, LHSI->getOperand(1), RHSC, "", &I); +} + +/// Optimize fabs(X) compared with zero. +static Instruction *foldFabsWithFcmpZero(FCmpInst &I) { + Value *X; + if (!match(I.getOperand(0), m_Intrinsic<Intrinsic::fabs>(m_Value(X))) || + !match(I.getOperand(1), m_PosZeroFP())) + return nullptr; + + auto replacePredAndOp0 = [](FCmpInst *I, FCmpInst::Predicate P, Value *X) { + I->setPredicate(P); + I->setOperand(0, X); + return I; + }; + + switch (I.getPredicate()) { + case FCmpInst::FCMP_UGE: + case FCmpInst::FCMP_OLT: + // fabs(X) >= 0.0 --> true + // fabs(X) < 0.0 --> false + llvm_unreachable("fcmp should have simplified"); + + case FCmpInst::FCMP_OGT: + // fabs(X) > 0.0 --> X != 0.0 + return replacePredAndOp0(&I, FCmpInst::FCMP_ONE, X); + + case FCmpInst::FCMP_UGT: + // fabs(X) u> 0.0 --> X u!= 0.0 + return replacePredAndOp0(&I, FCmpInst::FCMP_UNE, X); + + case FCmpInst::FCMP_OLE: + // fabs(X) <= 0.0 --> X == 0.0 + return replacePredAndOp0(&I, FCmpInst::FCMP_OEQ, X); + + case FCmpInst::FCMP_ULE: + // fabs(X) u<= 0.0 --> X u== 0.0 + return replacePredAndOp0(&I, FCmpInst::FCMP_UEQ, X); + + case FCmpInst::FCMP_OGE: + // fabs(X) >= 0.0 --> !isnan(X) + assert(!I.hasNoNaNs() && "fcmp should have simplified"); + return replacePredAndOp0(&I, FCmpInst::FCMP_ORD, X); + + case FCmpInst::FCMP_ULT: + // fabs(X) u< 0.0 --> isnan(X) + assert(!I.hasNoNaNs() && "fcmp should have simplified"); + return replacePredAndOp0(&I, FCmpInst::FCMP_UNO, X); + + case FCmpInst::FCMP_OEQ: + case FCmpInst::FCMP_UEQ: + case FCmpInst::FCMP_ONE: + case FCmpInst::FCMP_UNE: + case FCmpInst::FCMP_ORD: + case FCmpInst::FCMP_UNO: + // Look through the fabs() because it doesn't change anything but the sign. + // fabs(X) == 0.0 --> X == 0.0, + // fabs(X) != 0.0 --> X != 0.0 + // isnan(fabs(X)) --> isnan(X) + // !isnan(fabs(X) --> !isnan(X) + return replacePredAndOp0(&I, I.getPredicate(), X); + + default: + return nullptr; + } +} + +Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { + bool Changed = false; + + /// Orders the operands of the compare so that they are listed from most + /// complex to least complex. This puts constants before unary operators, + /// before binary operators. 
+ if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1))) { + I.swapOperands(); + Changed = true; + } + + const CmpInst::Predicate Pred = I.getPredicate(); + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyFCmpInst(Pred, Op0, Op1, I.getFastMathFlags(), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + // Simplify 'fcmp pred X, X' + Type *OpType = Op0->getType(); + assert(OpType == Op1->getType() && "fcmp with different-typed operands?"); + if (Op0 == Op1) { + switch (Pred) { + default: break; + case FCmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y) + case FCmpInst::FCMP_ULT: // True if unordered or less than + case FCmpInst::FCMP_UGT: // True if unordered or greater than + case FCmpInst::FCMP_UNE: // True if unordered or not equal + // Canonicalize these to be 'fcmp uno %X, 0.0'. + I.setPredicate(FCmpInst::FCMP_UNO); + I.setOperand(1, Constant::getNullValue(OpType)); + return &I; + + case FCmpInst::FCMP_ORD: // True if ordered (no nans) + case FCmpInst::FCMP_OEQ: // True if ordered and equal + case FCmpInst::FCMP_OGE: // True if ordered and greater than or equal + case FCmpInst::FCMP_OLE: // True if ordered and less than or equal + // Canonicalize these to be 'fcmp ord %X, 0.0'. + I.setPredicate(FCmpInst::FCMP_ORD); + I.setOperand(1, Constant::getNullValue(OpType)); + return &I; + } + } + + // If we're just checking for a NaN (ORD/UNO) and have a non-NaN operand, + // then canonicalize the operand to 0.0. + if (Pred == CmpInst::FCMP_ORD || Pred == CmpInst::FCMP_UNO) { + if (!match(Op0, m_PosZeroFP()) && isKnownNeverNaN(Op0, &TLI)) { + I.setOperand(0, ConstantFP::getNullValue(OpType)); + return &I; + } + if (!match(Op1, m_PosZeroFP()) && isKnownNeverNaN(Op1, &TLI)) { + I.setOperand(1, ConstantFP::getNullValue(OpType)); + return &I; + } + } + + // fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y + Value *X, *Y; + if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y)))) + return new FCmpInst(I.getSwappedPredicate(), X, Y, "", &I); + + // Test if the FCmpInst instruction is used exclusively by a select as + // part of a minimum or maximum operation. If so, refrain from doing + // any other folding. This helps out other analyses which understand + // non-obfuscated minimum and maximum idioms, such as ScalarEvolution + // and CodeGen. And in this case, at least one of the comparison + // operands has at least one user besides the compare (the select), + // which would often largely negate the benefit of folding anyway. + if (I.hasOneUse()) + if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) { + Value *A, *B; + SelectPatternResult SPR = matchSelectPattern(SI, A, B); + if (SPR.Flavor != SPF_UNKNOWN) + return nullptr; + } + + // The sign of 0.0 is ignored by fcmp, so canonicalize to +0.0: + // fcmp Pred X, -0.0 --> fcmp Pred X, 0.0 + if (match(Op1, m_AnyZeroFP()) && !match(Op1, m_PosZeroFP())) { + I.setOperand(1, ConstantFP::getNullValue(OpType)); + return &I; + } + + // Handle fcmp with instruction LHS and constant RHS. + Instruction *LHSI; + Constant *RHSC; + if (match(Op0, m_Instruction(LHSI)) && match(Op1, m_Constant(RHSC))) { + switch (LHSI->getOpcode()) { + case Instruction::PHI: + // Only fold fcmp into the PHI if the phi and fcmp are in the same + // block. If in the same block, we're encouraging jump threading. If + // not, we are just pessimizing the code by making an i1 phi. 
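+ // e.g. when the phi and the fcmp share a block, a compare like
+ // "fcmp ogt (phi [1.0, %pred], [%x, %other]), 2.0" can be turned into an
+ // i1 phi of the per-incoming-value results, which the branch folds can
+ // then thread.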
+ if (LHSI->getParent() == I.getParent()) + if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI))) + return NV; + break; + case Instruction::SIToFP: + case Instruction::UIToFP: + if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC)) + return NV; + break; + case Instruction::FDiv: + if (Instruction *NV = foldFCmpReciprocalAndZero(I, LHSI, RHSC)) + return NV; + break; + case Instruction::Load: + if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) + if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) + if (GV->isConstant() && GV->hasDefinitiveInitializer() && + !cast<LoadInst>(LHSI)->isVolatile()) + if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I)) + return Res; + break; + } + } + + if (Instruction *R = foldFabsWithFcmpZero(I)) + return R; + + if (match(Op0, m_FNeg(m_Value(X)))) { + // fcmp pred (fneg X), C --> fcmp swap(pred) X, -C + Constant *C; + if (match(Op1, m_Constant(C))) { + Constant *NegC = ConstantExpr::getFNeg(C); + return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I); + } + } + + if (match(Op0, m_FPExt(m_Value(X)))) { + // fcmp (fpext X), (fpext Y) -> fcmp X, Y + if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType()) + return new FCmpInst(Pred, X, Y, "", &I); + + // fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless + const APFloat *C; + if (match(Op1, m_APFloat(C))) { + const fltSemantics &FPSem = + X->getType()->getScalarType()->getFltSemantics(); + bool Lossy; + APFloat TruncC = *C; + TruncC.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy); + + // Avoid lossy conversions and denormals. + // Zero is a special case that's OK to convert. + APFloat Fabs = TruncC; + Fabs.clearSign(); + if (!Lossy && + ((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) != + APFloat::cmpLessThan) || Fabs.isZero())) { + Constant *NewC = ConstantFP::get(X->getType(), TruncC); + return new FCmpInst(Pred, X, NewC, "", &I); + } + } + } + + if (I.getType()->isVectorTy()) + if (Instruction *Res = foldVectorCmp(I, Builder)) + return Res; + + return Changed ? &I : nullptr; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h new file mode 100644 index 000000000000..1dbc06d92e7a --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -0,0 +1,1006 @@ +//===- InstCombineInternal.h - InstCombine pass internals -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// This file provides internal interfaces used to implement the InstCombine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H +#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetFolder.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/Utils/Local.h" +#include <cassert> +#include <cstdint> + +#define DEBUG_TYPE "instcombine" + +using namespace llvm::PatternMatch; + +namespace llvm { + +class APInt; +class AssumptionCache; +class BlockFrequencyInfo; +class DataLayout; +class DominatorTree; +class GEPOperator; +class GlobalVariable; +class LoopInfo; +class OptimizationRemarkEmitter; +class ProfileSummaryInfo; +class TargetLibraryInfo; +class User; + +/// Assign a complexity or rank value to LLVM Values. This is used to reduce +/// the amount of pattern matching needed for compares and commutative +/// instructions. For example, if we have: +/// icmp ugt X, Constant +/// or +/// xor (add X, Constant), cast Z +/// +/// We do not have to consider the commuted variants of these patterns because +/// canonicalization based on complexity guarantees the above ordering. +/// +/// This routine maps IR values to various complexity ranks: +/// 0 -> undef +/// 1 -> Constants +/// 2 -> Other non-instructions +/// 3 -> Arguments +/// 4 -> Cast and (f)neg/not instructions +/// 5 -> Other instructions +static inline unsigned getComplexity(Value *V) { + if (isa<Instruction>(V)) { + if (isa<CastInst>(V) || match(V, m_Neg(m_Value())) || + match(V, m_Not(m_Value())) || match(V, m_FNeg(m_Value()))) + return 4; + return 5; + } + if (isa<Argument>(V)) + return 3; + return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2; +} + +/// Predicate canonicalization reduces the number of patterns that need to be +/// matched by other transforms. For example, we may swap the operands of a +/// conditional branch or select to create a compare with a canonical (inverted) +/// predicate which is then more likely to be matched with other values. +static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) { + switch (Pred) { + case CmpInst::ICMP_NE: + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_SLE: + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SGE: + // TODO: There are 16 FCMP predicates. Should others be (not) canonical? + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_OGE: + return false; + default: + return true; + } +} + +/// Given an exploded icmp instruction, return true if the comparison only +/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the +/// result of the comparison is true when the input value is signed. 
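+/// For example, with an i8 RHS both "icmp slt %x, 0" and "icmp ugt %x, 127"
+/// are sign-bit checks with TrueIfSigned == true, while "icmp sgt %x, -1"
+/// and "icmp ult %x, -128" are sign-bit checks with TrueIfSigned == false.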
+inline bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS, + bool &TrueIfSigned) { + switch (Pred) { + case ICmpInst::ICMP_SLT: // True if LHS s< 0 + TrueIfSigned = true; + return RHS.isNullValue(); + case ICmpInst::ICMP_SLE: // True if LHS s<= -1 + TrueIfSigned = true; + return RHS.isAllOnesValue(); + case ICmpInst::ICMP_SGT: // True if LHS s> -1 + TrueIfSigned = false; + return RHS.isAllOnesValue(); + case ICmpInst::ICMP_SGE: // True if LHS s>= 0 + TrueIfSigned = false; + return RHS.isNullValue(); + case ICmpInst::ICMP_UGT: + // True if LHS u> RHS and RHS == sign-bit-mask - 1 + TrueIfSigned = true; + return RHS.isMaxSignedValue(); + case ICmpInst::ICMP_UGE: + // True if LHS u>= RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) + TrueIfSigned = true; + return RHS.isMinSignedValue(); + case ICmpInst::ICMP_ULT: + // True if LHS u< RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) + TrueIfSigned = false; + return RHS.isMinSignedValue(); + case ICmpInst::ICMP_ULE: + // True if LHS u<= RHS and RHS == sign-bit-mask - 1 + TrueIfSigned = false; + return RHS.isMaxSignedValue(); + default: + return false; + } +} + +llvm::Optional<std::pair<CmpInst::Predicate, Constant *>> +getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, Constant *C); + +/// Return the source operand of a potentially bitcasted value while optionally +/// checking if it has one use. If there is no bitcast or the one use check is +/// not met, return the input value itself. +static inline Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) { + if (auto *BitCast = dyn_cast<BitCastInst>(V)) + if (!OneUseOnly || BitCast->hasOneUse()) + return BitCast->getOperand(0); + + // V is not a bitcast or V has more than one use and OneUseOnly is true. + return V; +} + +/// Add one to a Constant +static inline Constant *AddOne(Constant *C) { + return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); +} + +/// Subtract one from a Constant +static inline Constant *SubOne(Constant *C) { + return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1)); +} + +/// Return true if the specified value is free to invert (apply ~ to). +/// This happens in cases where the ~ can be eliminated. If WillInvertAllUses +/// is true, work under the assumption that the caller intends to remove all +/// uses of V and only keep uses of ~V. +/// +/// See also: canFreelyInvertAllUsersOf() +static inline bool isFreeToInvert(Value *V, bool WillInvertAllUses) { + // ~(~(X)) -> X. + if (match(V, m_Not(m_Value()))) + return true; + + // Constants can be considered to be not'ed values. + if (match(V, m_AnyIntegralConstant())) + return true; + + // Compares can be inverted if all of their uses are being modified to use the + // ~V. + if (isa<CmpInst>(V)) + return WillInvertAllUses; + + // If `V` is of the form `A + Constant` then `-1 - V` can be folded into `(-1 + // - Constant) - A` if we are willing to invert all of the uses. + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) + if (BO->getOpcode() == Instruction::Add || + BO->getOpcode() == Instruction::Sub) + if (isa<Constant>(BO->getOperand(0)) || isa<Constant>(BO->getOperand(1))) + return WillInvertAllUses; + + // Selects with invertible operands are freely invertible + if (match(V, m_Select(m_Value(), m_Not(m_Value()), m_Not(m_Value())))) + return WillInvertAllUses; + + return false; +} + +/// Given i1 V, can every user of V be freely adapted if V is changed to !V ? 
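+/// For example, a select or branch user can be adapted by swapping its
+/// true/false values or destinations, and an xor user can be adapted when it
+/// is a 'not'.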
+/// +/// See also: isFreeToInvert() +static inline bool canFreelyInvertAllUsersOf(Value *V, Value *IgnoredUser) { + // Look at every user of V. + for (User *U : V->users()) { + if (U == IgnoredUser) + continue; // Don't consider this user. + + auto *I = cast<Instruction>(U); + switch (I->getOpcode()) { + case Instruction::Select: + case Instruction::Br: + break; // Free to invert by swapping true/false values/destinations. + case Instruction::Xor: // Can invert 'xor' if it's a 'not', by ignoring it. + if (!match(I, m_Not(m_Value()))) + return false; // Not a 'not'. + break; + default: + return false; // Don't know, likely not freely invertible. + } + // So far all users were free to invert... + } + return true; // Can freely invert all users! +} + +/// Some binary operators require special handling to avoid poison and undefined +/// behavior. If a constant vector has undef elements, replace those undefs with +/// identity constants if possible because those are always safe to execute. +/// If no identity constant exists, replace undef with some other safe constant. +static inline Constant *getSafeVectorConstantForBinop( + BinaryOperator::BinaryOps Opcode, Constant *In, bool IsRHSConstant) { + assert(In->getType()->isVectorTy() && "Not expecting scalars here"); + + Type *EltTy = In->getType()->getVectorElementType(); + auto *SafeC = ConstantExpr::getBinOpIdentity(Opcode, EltTy, IsRHSConstant); + if (!SafeC) { + // TODO: Should this be available as a constant utility function? It is + // similar to getBinOpAbsorber(). + if (IsRHSConstant) { + switch (Opcode) { + case Instruction::SRem: // X % 1 = 0 + case Instruction::URem: // X %u 1 = 0 + SafeC = ConstantInt::get(EltTy, 1); + break; + case Instruction::FRem: // X % 1.0 (doesn't simplify, but it is safe) + SafeC = ConstantFP::get(EltTy, 1.0); + break; + default: + llvm_unreachable("Only rem opcodes have no identity constant for RHS"); + } + } else { + switch (Opcode) { + case Instruction::Shl: // 0 << X = 0 + case Instruction::LShr: // 0 >>u X = 0 + case Instruction::AShr: // 0 >> X = 0 + case Instruction::SDiv: // 0 / X = 0 + case Instruction::UDiv: // 0 /u X = 0 + case Instruction::SRem: // 0 % X = 0 + case Instruction::URem: // 0 %u X = 0 + case Instruction::Sub: // 0 - X (doesn't simplify, but it is safe) + case Instruction::FSub: // 0.0 - X (doesn't simplify, but it is safe) + case Instruction::FDiv: // 0.0 / X (doesn't simplify, but it is safe) + case Instruction::FRem: // 0.0 % X = 0 + SafeC = Constant::getNullValue(EltTy); + break; + default: + llvm_unreachable("Expected to find identity constant for opcode"); + } + } + } + assert(SafeC && "Must have safe constant for binop"); + unsigned NumElts = In->getType()->getVectorNumElements(); + SmallVector<Constant *, 16> Out(NumElts); + for (unsigned i = 0; i != NumElts; ++i) { + Constant *C = In->getAggregateElement(i); + Out[i] = isa<UndefValue>(C) ? SafeC : C; + } + return ConstantVector::get(Out); +} + +/// The core instruction combiner logic. +/// +/// This class provides both the logic to recursively visit instructions and +/// combine them. +class LLVM_LIBRARY_VISIBILITY InstCombiner + : public InstVisitor<InstCombiner, Instruction *> { + // FIXME: These members shouldn't be public. +public: + /// A worklist of the instructions that need to be simplified. + InstCombineWorklist &Worklist; + + /// An IRBuilder that automatically inserts new instructions into the + /// worklist. 
+ using BuilderTy = IRBuilder<TargetFolder, IRBuilderCallbackInserter>; + BuilderTy &Builder; + +private: + // Mode in which we are running the combiner. + const bool MinimizeSize; + + /// Enable combines that trigger rarely but are costly in compiletime. + const bool ExpensiveCombines; + + AliasAnalysis *AA; + + // Required analyses. + AssumptionCache &AC; + TargetLibraryInfo &TLI; + DominatorTree &DT; + const DataLayout &DL; + const SimplifyQuery SQ; + OptimizationRemarkEmitter &ORE; + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; + + // Optional analyses. When non-null, these can both be used to do better + // combining and will be updated to reflect any changes. + LoopInfo *LI; + + bool MadeIRChange = false; + +public: + InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder, + bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA, + AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, + OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI) + : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize), + ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT), + DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {} + + /// Run the combiner over the entire worklist until it is empty. + /// + /// \returns true if the IR is changed. + bool run(); + + AssumptionCache &getAssumptionCache() const { return AC; } + + const DataLayout &getDataLayout() const { return DL; } + + DominatorTree &getDominatorTree() const { return DT; } + + LoopInfo *getLoopInfo() const { return LI; } + + TargetLibraryInfo &getTargetLibraryInfo() const { return TLI; } + + // Visitation implementation - Implement instruction combining for different + // instruction types. 
The semantics are as follows: + // Return Value: + // null - No change was made + // I - Change was made, I is still valid, I may be dead though + // otherwise - Change was made, replace I with returned instruction + // + Instruction *visitFNeg(UnaryOperator &I); + Instruction *visitAdd(BinaryOperator &I); + Instruction *visitFAdd(BinaryOperator &I); + Value *OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty); + Instruction *visitSub(BinaryOperator &I); + Instruction *visitFSub(BinaryOperator &I); + Instruction *visitMul(BinaryOperator &I); + Instruction *visitFMul(BinaryOperator &I); + Instruction *visitURem(BinaryOperator &I); + Instruction *visitSRem(BinaryOperator &I); + Instruction *visitFRem(BinaryOperator &I); + bool simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I); + Instruction *commonRemTransforms(BinaryOperator &I); + Instruction *commonIRemTransforms(BinaryOperator &I); + Instruction *commonDivTransforms(BinaryOperator &I); + Instruction *commonIDivTransforms(BinaryOperator &I); + Instruction *visitUDiv(BinaryOperator &I); + Instruction *visitSDiv(BinaryOperator &I); + Instruction *visitFDiv(BinaryOperator &I); + Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted); + Instruction *visitAnd(BinaryOperator &I); + Instruction *visitOr(BinaryOperator &I); + Instruction *visitXor(BinaryOperator &I); + Instruction *visitShl(BinaryOperator &I); + Value *reassociateShiftAmtsOfTwoSameDirectionShifts( + BinaryOperator *Sh0, const SimplifyQuery &SQ, + bool AnalyzeForSignBitExtraction = false); + Instruction *canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract( + BinaryOperator &I); + Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr); + Instruction *visitAShr(BinaryOperator &I); + Instruction *visitLShr(BinaryOperator &I); + Instruction *commonShiftTransforms(BinaryOperator &I); + Instruction *visitFCmpInst(FCmpInst &I); + Instruction *visitICmpInst(ICmpInst &I); + Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1, + BinaryOperator &I); + Instruction *commonCastTransforms(CastInst &CI); + Instruction *commonPointerCastTransforms(CastInst &CI); + Instruction *visitTrunc(TruncInst &CI); + Instruction *visitZExt(ZExtInst &CI); + Instruction *visitSExt(SExtInst &CI); + Instruction *visitFPTrunc(FPTruncInst &CI); + Instruction *visitFPExt(CastInst &CI); + Instruction *visitFPToUI(FPToUIInst &FI); + Instruction *visitFPToSI(FPToSIInst &FI); + Instruction *visitUIToFP(CastInst &CI); + Instruction *visitSIToFP(CastInst &CI); + Instruction *visitPtrToInt(PtrToIntInst &CI); + Instruction *visitIntToPtr(IntToPtrInst &CI); + Instruction *visitBitCast(BitCastInst &CI); + Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI); + Instruction *FoldItoFPtoI(Instruction &FI); + Instruction *visitSelectInst(SelectInst &SI); + Instruction *visitCallInst(CallInst &CI); + Instruction *visitInvokeInst(InvokeInst &II); + Instruction *visitCallBrInst(CallBrInst &CBI); + + Instruction *SliceUpIllegalIntegerPHI(PHINode &PN); + Instruction *visitPHINode(PHINode &PN); + Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP); + Instruction *visitAllocaInst(AllocaInst &AI); + Instruction *visitAllocSite(Instruction &FI); + Instruction *visitFree(CallInst &FI); + Instruction *visitLoadInst(LoadInst &LI); + Instruction *visitStoreInst(StoreInst &SI); + Instruction *visitAtomicRMWInst(AtomicRMWInst &SI); + Instruction *visitBranchInst(BranchInst &BI); + Instruction *visitFenceInst(FenceInst &FI); + Instruction 
*visitSwitchInst(SwitchInst &SI);
+  Instruction *visitReturnInst(ReturnInst &RI);
+  Instruction *visitInsertValueInst(InsertValueInst &IV);
+  Instruction *visitInsertElementInst(InsertElementInst &IE);
+  Instruction *visitExtractElementInst(ExtractElementInst &EI);
+  Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
+  Instruction *visitExtractValueInst(ExtractValueInst &EV);
+  Instruction *visitLandingPadInst(LandingPadInst &LI);
+  Instruction *visitVAStartInst(VAStartInst &I);
+  Instruction *visitVACopyInst(VACopyInst &I);
+
+  /// Specify what to return for unhandled instructions.
+  Instruction *visitInstruction(Instruction &I) { return nullptr; }
+
+  /// True when DB dominates all uses of DI except UI.
+  /// UI must be in the same block as DI.
+  /// The routine checks that the DI parent and DB are different.
+  bool dominatesAllUses(const Instruction *DI, const Instruction *UI,
+                        const BasicBlock *DB) const;
+
+  /// Try to replace select with select operand SIOpd in SI-ICmp sequence.
+  bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp,
+                                 const unsigned SIOpd);
+
+  /// Try to replace instruction \p I with value \p V, which are pointers
+  /// in different address spaces.
+  /// \return true if successful.
+  bool replacePointer(Instruction &I, Value *V);
+
+private:
+  bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
+  bool shouldChangeType(Type *From, Type *To) const;
+  Value *dyn_castNegVal(Value *V) const;
+  Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
+                            SmallVectorImpl<Value *> &NewIndices);
+
+  /// Classify whether a cast is worth optimizing.
+  ///
+  /// This is a helper to decide whether the simplification of
+  /// logic(cast(A), cast(B)) to cast(logic(A, B)) should be performed.
+  ///
+  /// \param CI The cast we are interested in.
+  ///
+  /// \return true if this cast actually results in any code being generated and
+  /// if it cannot already be eliminated by some other transformation.
+  bool shouldOptimizeCast(CastInst *CI);
+
+  /// Try to optimize a sequence of instructions checking if an operation
+  /// on LHS and RHS overflows.
+  ///
+  /// If this overflow check is done via one of the overflow check intrinsics,
+  /// then CtxI has to be the call instruction calling that intrinsic. If this
+  /// overflow check is done by arithmetic followed by a compare, then CtxI has
+  /// to be the arithmetic instruction.
+  ///
+  /// If a simplification is possible, stores the simplified result of the
+  /// operation in OperationResult and the result of the overflow check in
+  /// OverflowResult, and returns true. If no simplification is possible,
+  /// returns false.
+  bool OptimizeOverflowCheck(Instruction::BinaryOps BinaryOp, bool IsSigned,
+                             Value *LHS, Value *RHS,
+                             Instruction &CtxI, Value *&OperationResult,
+                             Constant *&OverflowResult);
+
+  Instruction *visitCallBase(CallBase &Call);
+  Instruction *tryOptimizeCall(CallInst *CI);
+  bool transformConstExprCastCall(CallBase &Call);
+  Instruction *transformCallThroughTrampoline(CallBase &Call,
+                                              IntrinsicInst &Tramp);
+
+  Value *simplifyMaskedLoad(IntrinsicInst &II);
+  Instruction *simplifyMaskedStore(IntrinsicInst &II);
+  Instruction *simplifyMaskedGather(IntrinsicInst &II);
+  Instruction *simplifyMaskedScatter(IntrinsicInst &II);
+
+  /// Transform (zext icmp) to bitwise / integer operations in order to
+  /// eliminate it.
+  ///
+  /// \param ICI The icmp of the (zext icmp) pair we are interested in.
+  /// \param CI The zext of the (zext icmp) pair we are interested in.
+ /// \param DoTransform Pass false to just test whether the given (zext icmp) + /// would be transformed. Pass true to actually perform the transformation. + /// + /// \return null if the transformation cannot be performed. If the + /// transformation can be performed the new instruction that replaces the + /// (zext icmp) pair will be returned (if \p DoTransform is false the + /// unmodified \p ICI will be returned in this case). + Instruction *transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, + bool DoTransform = true); + + Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI); + + bool willNotOverflowSignedAdd(const Value *LHS, const Value *RHS, + const Instruction &CxtI) const { + return computeOverflowForSignedAdd(LHS, RHS, &CxtI) == + OverflowResult::NeverOverflows; + } + + bool willNotOverflowUnsignedAdd(const Value *LHS, const Value *RHS, + const Instruction &CxtI) const { + return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) == + OverflowResult::NeverOverflows; + } + + bool willNotOverflowAdd(const Value *LHS, const Value *RHS, + const Instruction &CxtI, bool IsSigned) const { + return IsSigned ? willNotOverflowSignedAdd(LHS, RHS, CxtI) + : willNotOverflowUnsignedAdd(LHS, RHS, CxtI); + } + + bool willNotOverflowSignedSub(const Value *LHS, const Value *RHS, + const Instruction &CxtI) const { + return computeOverflowForSignedSub(LHS, RHS, &CxtI) == + OverflowResult::NeverOverflows; + } + + bool willNotOverflowUnsignedSub(const Value *LHS, const Value *RHS, + const Instruction &CxtI) const { + return computeOverflowForUnsignedSub(LHS, RHS, &CxtI) == + OverflowResult::NeverOverflows; + } + + bool willNotOverflowSub(const Value *LHS, const Value *RHS, + const Instruction &CxtI, bool IsSigned) const { + return IsSigned ? willNotOverflowSignedSub(LHS, RHS, CxtI) + : willNotOverflowUnsignedSub(LHS, RHS, CxtI); + } + + bool willNotOverflowSignedMul(const Value *LHS, const Value *RHS, + const Instruction &CxtI) const { + return computeOverflowForSignedMul(LHS, RHS, &CxtI) == + OverflowResult::NeverOverflows; + } + + bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS, + const Instruction &CxtI) const { + return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) == + OverflowResult::NeverOverflows; + } + + bool willNotOverflowMul(const Value *LHS, const Value *RHS, + const Instruction &CxtI, bool IsSigned) const { + return IsSigned ? willNotOverflowSignedMul(LHS, RHS, CxtI) + : willNotOverflowUnsignedMul(LHS, RHS, CxtI); + } + + bool willNotOverflow(BinaryOperator::BinaryOps Opcode, const Value *LHS, + const Value *RHS, const Instruction &CxtI, + bool IsSigned) const { + switch (Opcode) { + case Instruction::Add: return willNotOverflowAdd(LHS, RHS, CxtI, IsSigned); + case Instruction::Sub: return willNotOverflowSub(LHS, RHS, CxtI, IsSigned); + case Instruction::Mul: return willNotOverflowMul(LHS, RHS, CxtI, IsSigned); + default: llvm_unreachable("Unexpected opcode for overflow query"); + } + } + + Value *EmitGEPOffset(User *GEP); + Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); + Instruction *foldCastedBitwiseLogic(BinaryOperator &I); + Instruction *narrowBinOp(TruncInst &Trunc); + Instruction *narrowMaskedBinOp(BinaryOperator &And); + Instruction *narrowMathIfNoOverflow(BinaryOperator &I); + Instruction *narrowRotate(TruncInst &Trunc); + Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN); + Instruction *matchSAddSubSat(SelectInst &MinMax1); + + /// Determine if a pair of casts can be replaced by a single cast. 
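+  /// For example (illustrative only), a zext from i8 to i32 followed by a
+  /// trunc from i32 to i16 is equivalent to a single zext from i8 to i16.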
+ /// + /// \param CI1 The first of a pair of casts. + /// \param CI2 The second of a pair of casts. + /// + /// \return 0 if the cast pair cannot be eliminated, otherwise returns an + /// Instruction::CastOps value for a cast that can replace the pair, casting + /// CI1->getSrcTy() to CI2->getDstTy(). + /// + /// \see CastInst::isEliminableCastPair + Instruction::CastOps isEliminableCastPair(const CastInst *CI1, + const CastInst *CI2); + + Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI); + Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI); + Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &I); + + /// Optimize (fcmp)&(fcmp) or (fcmp)|(fcmp). + /// NOTE: Unlike most of instcombine, this returns a Value which should + /// already be inserted into the function. + Value *foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd); + + Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS, + bool JoinedByAnd, Instruction &CxtI); + Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D); + Value *getSelectCondition(Value *A, Value *B); + + Instruction *foldIntrinsicWithOverflowCommon(IntrinsicInst *II); + +public: + /// Inserts an instruction \p New before instruction \p Old + /// + /// Also adds the new instruction to the worklist and returns \p New so that + /// it is suitable for use as the return from the visitation patterns. + Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) { + assert(New && !New->getParent() && + "New instruction already inserted into a basic block!"); + BasicBlock *BB = Old.getParent(); + BB->getInstList().insert(Old.getIterator(), New); // Insert inst + Worklist.Add(New); + return New; + } + + /// Same as InsertNewInstBefore, but also sets the debug loc. + Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) { + New->setDebugLoc(Old.getDebugLoc()); + return InsertNewInstBefore(New, Old); + } + + /// A combiner-aware RAUW-like routine. + /// + /// This method is to be used when an instruction is found to be dead, + /// replaceable with another preexisting expression. Here we add all uses of + /// I to the worklist, replace all uses of I with the new value, then return + /// I, so that the inst combiner will know that I was modified. + Instruction *replaceInstUsesWith(Instruction &I, Value *V) { + // If there are no uses to replace, then we return nullptr to indicate that + // no changes were made to the program. + if (I.use_empty()) return nullptr; + + Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist. + + // If we are replacing the instruction with itself, this must be in a + // segment of unreachable code, so just clobber the instruction. + if (&I == V) + V = UndefValue::get(I.getType()); + + LLVM_DEBUG(dbgs() << "IC: Replacing " << I << "\n" + << " with " << *V << '\n'); + + I.replaceAllUsesWith(V); + return &I; + } + + /// Creates a result tuple for an overflow intrinsic \p II with a given + /// \p Result and a constant \p Overflow value. + Instruction *CreateOverflowTuple(IntrinsicInst *II, Value *Result, + Constant *Overflow) { + Constant *V[] = {UndefValue::get(Result->getType()), Overflow}; + StructType *ST = cast<StructType>(II->getType()); + Constant *Struct = ConstantStruct::get(ST, V); + return InsertValueInst::Create(Struct, Result, 0); + } + + /// Create and insert the idiom we use to indicate a block is unreachable + /// without having to rewrite the CFG from within InstCombine. 
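+  /// (As can be seen below, the idiom is simply a 'store i1 true' to an
+  /// undef i1* pointer, which is trivially undefined behavior.)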
+ void CreateNonTerminatorUnreachable(Instruction *InsertAt) { + auto &Ctx = InsertAt->getContext(); + new StoreInst(ConstantInt::getTrue(Ctx), + UndefValue::get(Type::getInt1PtrTy(Ctx)), + InsertAt); + } + + + /// Combiner aware instruction erasure. + /// + /// When dealing with an instruction that has side effects or produces a void + /// value, we can't rely on DCE to delete the instruction. Instead, visit + /// methods should return the value returned by this function. + Instruction *eraseInstFromFunction(Instruction &I) { + LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n'); + assert(I.use_empty() && "Cannot erase instruction that is used!"); + salvageDebugInfo(I); + + // Make sure that we reprocess all operands now that we reduced their + // use counts. + if (I.getNumOperands() < 8) { + for (Use &Operand : I.operands()) + if (auto *Inst = dyn_cast<Instruction>(Operand)) + Worklist.Add(Inst); + } + Worklist.Remove(&I); + I.eraseFromParent(); + MadeIRChange = true; + return nullptr; // Don't do anything with FI + } + + void computeKnownBits(const Value *V, KnownBits &Known, + unsigned Depth, const Instruction *CxtI) const { + llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT); + } + + KnownBits computeKnownBits(const Value *V, unsigned Depth, + const Instruction *CxtI) const { + return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT); + } + + bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false, + unsigned Depth = 0, + const Instruction *CxtI = nullptr) { + return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT); + } + + bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0, + const Instruction *CxtI = nullptr) const { + return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT); + } + + unsigned ComputeNumSignBits(const Value *Op, unsigned Depth = 0, + const Instruction *CxtI = nullptr) const { + return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForUnsignedMul(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForSignedMul(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForSignedMul(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForUnsignedAdd(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForSignedAdd(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForUnsignedSub(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForUnsignedSub(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForSignedSub(const Value *LHS, const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForSignedSub(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflow( + Instruction::BinaryOps BinaryOp, bool IsSigned, + Value *LHS, Value *RHS, Instruction *CxtI) const; + + /// Maximum size of array considered when transforming. + uint64_t MaxArraySizeForCombine = 0; + +private: + /// Performs a few simplifications for operators which are associative + /// or commutative. 
+ bool SimplifyAssociativeOrCommutative(BinaryOperator &I); + + /// Tries to simplify binary operations which some other binary + /// operation distributes over. + /// + /// It does this by either by factorizing out common terms (eg "(A*B)+(A*C)" + /// -> "A*(B+C)") or expanding out if this results in simplifications (eg: "A + /// & (B | C) -> (A&B) | (A&C)" if this is a win). Returns the simplified + /// value, or null if it didn't simplify. + Value *SimplifyUsingDistributiveLaws(BinaryOperator &I); + + /// Tries to simplify add operations using the definition of remainder. + /// + /// The definition of remainder is X % C = X - (X / C ) * C. The add + /// expression X % C0 + (( X / C0 ) % C1) * C0 can be simplified to + /// X % (C0 * C1) + Value *SimplifyAddWithRemainder(BinaryOperator &I); + + // Binary Op helper for select operations where the expression can be + // efficiently reorganized. + Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS, + Value *RHS); + + /// This tries to simplify binary operations by factorizing out common terms + /// (e. g. "(A*B)+(A*C)" -> "A*(B+C)"). + Value *tryFactorization(BinaryOperator &, Instruction::BinaryOps, Value *, + Value *, Value *, Value *); + + /// Match a select chain which produces one of three values based on whether + /// the LHS is less than, equal to, or greater than RHS respectively. + /// Return true if we matched a three way compare idiom. The LHS, RHS, Less, + /// Equal and Greater values are saved in the matching process and returned to + /// the caller. + bool matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, Value *&RHS, + ConstantInt *&Less, ConstantInt *&Equal, + ConstantInt *&Greater); + + /// Attempts to replace V with a simpler value based on the demanded + /// bits. + Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known, + unsigned Depth, Instruction *CxtI); + bool SimplifyDemandedBits(Instruction *I, unsigned Op, + const APInt &DemandedMask, KnownBits &Known, + unsigned Depth = 0); + + /// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne + /// bits. It also tries to handle simplifications that can be done based on + /// DemandedMask, but without modifying the Instruction. + Value *SimplifyMultipleUseDemandedBits(Instruction *I, + const APInt &DemandedMask, + KnownBits &Known, + unsigned Depth, Instruction *CxtI); + + /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded + /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence. + Value *simplifyShrShlDemandedBits( + Instruction *Shr, const APInt &ShrOp1, Instruction *Shl, + const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known); + + /// Tries to simplify operands to an integer instruction based on its + /// demanded bits. + bool SimplifyDemandedInstructionBits(Instruction &Inst); + + Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, + APInt DemandedElts, + int DmaskIdx = -1); + + Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, + APInt &UndefElts, unsigned Depth = 0, + bool AllowMultipleUsers = false); + + /// Canonicalize the position of binops relative to shufflevector. + Instruction *foldVectorBinop(BinaryOperator &Inst); + + /// Given a binary operator, cast instruction, or select which has a PHI node + /// as operand #0, see if we can fold the instruction into the PHI (which is + /// only possible if all operands to the PHI are constants). 
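+  /// For example (illustrative only):
+  ///   %p = phi i32 [ 2, %bb0 ], [ 7, %bb1 ]
+  ///   %r = add i32 %p, 1
+  /// can become a phi of the per-edge folded values [ 3, %bb0 ], [ 8, %bb1 ].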
+ Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN); + + /// Given an instruction with a select as one operand and a constant as the + /// other operand, try to fold the binary operator into the select arguments. + /// This also works for Cast instructions, which obviously do not have a + /// second operand. + Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); + + /// This is a convenience wrapper function for the above two functions. + Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I); + + Instruction *foldAddWithConstant(BinaryOperator &Add); + + /// Try to rotate an operation below a PHI node, using PHI nodes for + /// its operands. + Instruction *FoldPHIArgOpIntoPHI(PHINode &PN); + Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN); + Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN); + Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN); + Instruction *FoldPHIArgZextsIntoPHI(PHINode &PN); + + /// If an integer typed PHI has only one use which is an IntToPtr operation, + /// replace the PHI with an existing pointer typed PHI if it exists. Otherwise + /// insert a new pointer typed PHI and replace the original one. + Instruction *FoldIntegerTypedPHI(PHINode &PN); + + /// Helper function for FoldPHIArgXIntoPHI() to set debug location for the + /// folded operation. + void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN); + + Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, + ICmpInst::Predicate Cond, Instruction &I); + Instruction *foldAllocaCmp(ICmpInst &ICI, const AllocaInst *Alloca, + const Value *Other); + Instruction *foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, + GlobalVariable *GV, CmpInst &ICI, + ConstantInt *AndCst = nullptr); + Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI, + Constant *RHSC); + Instruction *foldICmpAddOpConst(Value *X, const APInt &C, + ICmpInst::Predicate Pred); + Instruction *foldICmpWithCastOp(ICmpInst &ICI); + + Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp); + Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp); + Instruction *foldICmpWithConstant(ICmpInst &Cmp); + Instruction *foldICmpInstWithConstant(ICmpInst &Cmp); + Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp); + Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ); + Instruction *foldICmpEquality(ICmpInst &Cmp); + Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I); + Instruction *foldSignBitTest(ICmpInst &I); + Instruction *foldICmpWithZero(ICmpInst &Cmp); + + Value *foldUnsignedMultiplicationOverflowCheck(ICmpInst &Cmp); + + Instruction *foldICmpSelectConstant(ICmpInst &Cmp, SelectInst *Select, + ConstantInt *C); + Instruction *foldICmpTruncConstant(ICmpInst &Cmp, TruncInst *Trunc, + const APInt &C); + Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And, + const APInt &C); + Instruction *foldICmpXorConstant(ICmpInst &Cmp, BinaryOperator *Xor, + const APInt &C); + Instruction *foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or, + const APInt &C); + Instruction *foldICmpMulConstant(ICmpInst &Cmp, BinaryOperator *Mul, + const APInt &C); + Instruction *foldICmpShlConstant(ICmpInst &Cmp, BinaryOperator *Shl, + const APInt &C); + Instruction *foldICmpShrConstant(ICmpInst &Cmp, BinaryOperator *Shr, + const APInt &C); + Instruction *foldICmpSRemConstant(ICmpInst &Cmp, BinaryOperator *UDiv, + const APInt &C); + Instruction *foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv, + const APInt &C); + Instruction *foldICmpDivConstant(ICmpInst &Cmp, BinaryOperator *Div, + const APInt &C); + 
Instruction *foldICmpSubConstant(ICmpInst &Cmp, BinaryOperator *Sub, + const APInt &C); + Instruction *foldICmpAddConstant(ICmpInst &Cmp, BinaryOperator *Add, + const APInt &C); + Instruction *foldICmpAndConstConst(ICmpInst &Cmp, BinaryOperator *And, + const APInt &C1); + Instruction *foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And, + const APInt &C1, const APInt &C2); + Instruction *foldICmpShrConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1, + const APInt &C2); + Instruction *foldICmpShlConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1, + const APInt &C2); + + Instruction *foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp, + BinaryOperator *BO, + const APInt &C); + Instruction *foldICmpIntrinsicWithConstant(ICmpInst &ICI, IntrinsicInst *II, + const APInt &C); + Instruction *foldICmpEqIntrinsicWithConstant(ICmpInst &ICI, IntrinsicInst *II, + const APInt &C); + + // Helpers of visitSelectInst(). + Instruction *foldSelectExtConst(SelectInst &Sel); + Instruction *foldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI); + Instruction *foldSelectIntoOp(SelectInst &SI, Value *, Value *); + Instruction *foldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1, + Value *A, Value *B, Instruction &Outer, + SelectPatternFlavor SPF2, Value *C); + Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI); + + Instruction *OptAndOp(BinaryOperator *Op, ConstantInt *OpRHS, + ConstantInt *AndRHS, BinaryOperator &TheAnd); + + Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi, + bool isSigned, bool Inside); + Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI); + bool mergeStoreIntoSuccessor(StoreInst &SI); + + /// Given an 'or' instruction, check to see if it is part of a bswap idiom. + /// If so, return the equivalent bswap intrinsic. + Instruction *matchBSwap(BinaryOperator &Or); + + Instruction *SimplifyAnyMemTransfer(AnyMemTransferInst *MI); + Instruction *SimplifyAnyMemSet(AnyMemSetInst *MI); + + Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned); + + /// Returns a value X such that Val = X * Scale, or null if none. + /// + /// If the multiplication is known not to overflow then NoSignedWrap is set. + Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap); +}; + +} // end namespace llvm + +#undef DEBUG_TYPE + +#endif // LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp new file mode 100644 index 000000000000..3a0e05832fcb --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -0,0 +1,1600 @@ +//===- InstCombineLoadStoreAlloca.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the visit functions for load, store and alloca. 
+// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/ConstantRange.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "instcombine" + +STATISTIC(NumDeadStore, "Number of dead stores eliminated"); +STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global"); + +/// pointsToConstantGlobal - Return true if V (possibly indirectly) points to +/// some part of a constant global variable. This intentionally only accepts +/// constant expressions because we can't rewrite arbitrary instructions. +static bool pointsToConstantGlobal(Value *V) { + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + return GV->isConstant(); + + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) { + if (CE->getOpcode() == Instruction::BitCast || + CE->getOpcode() == Instruction::AddrSpaceCast || + CE->getOpcode() == Instruction::GetElementPtr) + return pointsToConstantGlobal(CE->getOperand(0)); + } + return false; +} + +/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived) +/// pointer to an alloca. Ignore any reads of the pointer, return false if we +/// see any stores or other unknown uses. If we see pointer arithmetic, keep +/// track of whether it moves the pointer (with IsOffset) but otherwise traverse +/// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to +/// the alloca, and if the source pointer is a pointer to a constant global, we +/// can optimize this. +static bool +isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, + SmallVectorImpl<Instruction *> &ToDelete) { + // We track lifetime intrinsics as we encounter them. If we decide to go + // ahead and replace the value with the global, this lets the caller quickly + // eliminate the markers. + + SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect; + ValuesToInspect.emplace_back(V, false); + while (!ValuesToInspect.empty()) { + auto ValuePair = ValuesToInspect.pop_back_val(); + const bool IsOffset = ValuePair.second; + for (auto &U : ValuePair.first->uses()) { + auto *I = cast<Instruction>(U.getUser()); + + if (auto *LI = dyn_cast<LoadInst>(I)) { + // Ignore non-volatile loads, they are always ok. + if (!LI->isSimple()) return false; + continue; + } + + if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) { + // If uses of the bitcast are ok, we are ok. + ValuesToInspect.emplace_back(I, IsOffset); + continue; + } + if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { + // If the GEP has all zero indices, it doesn't offset the pointer. If it + // doesn't, it does. + ValuesToInspect.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices()); + continue; + } + + if (auto *Call = dyn_cast<CallBase>(I)) { + // If this is the function being called then we treat it like a load and + // ignore it. + if (Call->isCallee(&U)) + continue; + + unsigned DataOpNo = Call->getDataOperandNo(&U); + bool IsArgOperand = Call->isArgOperand(&U); + + // Inalloca arguments are clobbered by the call. 
+        if (IsArgOperand && Call->isInAllocaArgument(DataOpNo))
+          return false;
+
+        // If this is a readonly/readnone call site, then we know it is just a
+        // load (but one that potentially returns the value itself), so we can
+        // ignore it if we know that the value isn't captured.
+        if (Call->onlyReadsMemory() &&
+            (Call->use_empty() || Call->doesNotCapture(DataOpNo)))
+          continue;
+
+        // If this is being passed as a byval argument, the caller is making a
+        // copy, so it is only a read of the alloca.
+        if (IsArgOperand && Call->isByValArgument(DataOpNo))
+          continue;
+      }
+
+      // Lifetime intrinsics can be handled by the caller.
+      if (I->isLifetimeStartOrEnd()) {
+        assert(I->use_empty() && "Lifetime markers have no result to use!");
+        ToDelete.push_back(I);
+        continue;
+      }
+
+      // If this isn't our memcpy/memmove, reject it as something we can't
+      // handle.
+      MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
+      if (!MI)
+        return false;
+
+      // If the transfer is using the alloca as a source of the transfer, then
+      // ignore it since it is a load (unless the transfer is volatile).
+      if (U.getOperandNo() == 1) {
+        if (MI->isVolatile()) return false;
+        continue;
+      }
+
+      // If we already have seen a copy, reject the second one.
+      if (TheCopy) return false;
+
+      // If the pointer has been offset from the start of the alloca, we can't
+      // safely handle this.
+      if (IsOffset) return false;
+
+      // If the memintrinsic isn't using the alloca as the dest, reject it.
+      if (U.getOperandNo() != 0) return false;
+
+      // If the source of the memcpy/move is not a constant global, reject it.
+      if (!pointsToConstantGlobal(MI->getSource()))
+        return false;
+
+      // Otherwise, the transform is safe. Remember the copy instruction.
+      TheCopy = MI;
+    }
+  }
+  return true;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Return the memcpy/memmove instruction if
+/// the specified alloca is only modified by a copy from a constant global. If
+/// we can prove this, we can replace any uses of the alloca with uses of the
+/// global directly.
+static MemTransferInst *
+isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
+                               SmallVectorImpl<Instruction *> &ToDelete) {
+  MemTransferInst *TheCopy = nullptr;
+  if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete))
+    return TheCopy;
+  return nullptr;
+}
+
+/// Returns true if V is dereferenceable for the size of the alloca.
+static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
+                                           const DataLayout &DL) {
+  if (AI->isArrayAllocation())
+    return false;
+  uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
+  if (!AllocaSize)
+    return false;
+  return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()),
+                                            APInt(64, AllocaSize), DL);
+}
+
+static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
+  // Check for array size of 1 (scalar allocation).
+  if (!AI.isArrayAllocation()) {
+    // i32 1 is the canonical array size for scalar allocations.
+    if (AI.getArraySize()->getType()->isIntegerTy(32))
+      return nullptr;
+
+    // Canonicalize it.
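+    // (Illustrative example: an 'alloca i32, i64 1' has its array size operand
+    // rewritten to the canonical 'i32 1', i.e. a plain 'alloca i32'.)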
+ Value *V = IC.Builder.getInt32(1); + AI.setOperand(0, V); + return &AI; + } + + // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1 + if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { + if (C->getValue().getActiveBits() <= 64) { + Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); + AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName()); + New->setAlignment(MaybeAlign(AI.getAlignment())); + + // Scan to the end of the allocation instructions, to skip over a block of + // allocas if possible...also skip interleaved debug info + // + BasicBlock::iterator It(New); + while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) + ++It; + + // Now that I is pointing to the first non-allocation-inst in the block, + // insert our getelementptr instruction... + // + Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType()); + Value *NullIdx = Constant::getNullValue(IdxTy); + Value *Idx[2] = {NullIdx, NullIdx}; + Instruction *GEP = GetElementPtrInst::CreateInBounds( + NewTy, New, Idx, New->getName() + ".sub"); + IC.InsertNewInstBefore(GEP, *It); + + // Now make everything use the getelementptr instead of the original + // allocation. + return IC.replaceInstUsesWith(AI, GEP); + } + } + + if (isa<UndefValue>(AI.getArraySize())) + return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType())); + + // Ensure that the alloca array size argument has type intptr_t, so that + // any casting is exposed early. + Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType()); + if (AI.getArraySize()->getType() != IntPtrTy) { + Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false); + AI.setOperand(0, V); + return &AI; + } + + return nullptr; +} + +namespace { +// If I and V are pointers in different address space, it is not allowed to +// use replaceAllUsesWith since I and V have different types. A +// non-target-specific transformation should not use addrspacecast on V since +// the two address space may be disjoint depending on target. +// +// This class chases down uses of the old pointer until reaching the load +// instructions, then replaces the old pointer in the load instructions with +// the new pointer. If during the chasing it sees bitcast or GEP, it will +// create new bitcast or GEP with the new pointer and use them in the load +// instruction. 
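+//
+// For example (illustrative only), if %old in addrspace(0) is being replaced
+// by %new in addrspace(1), then
+//   %b = bitcast i32* %old to float*
+//   %v = load float, float* %b
+// is rewritten as
+//   %b1 = bitcast i32 addrspace(1)* %new to float addrspace(1)*
+//   %v  = load float, float addrspace(1)* %b1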
+class PointerReplacer { +public: + PointerReplacer(InstCombiner &IC) : IC(IC) {} + void replacePointer(Instruction &I, Value *V); + +private: + void findLoadAndReplace(Instruction &I); + void replace(Instruction *I); + Value *getReplacement(Value *I); + + SmallVector<Instruction *, 4> Path; + MapVector<Value *, Value *> WorkMap; + InstCombiner &IC; +}; +} // end anonymous namespace + +void PointerReplacer::findLoadAndReplace(Instruction &I) { + for (auto U : I.users()) { + auto *Inst = dyn_cast<Instruction>(&*U); + if (!Inst) + return; + LLVM_DEBUG(dbgs() << "Found pointer user: " << *U << '\n'); + if (isa<LoadInst>(Inst)) { + for (auto P : Path) + replace(P); + replace(Inst); + } else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) { + Path.push_back(Inst); + findLoadAndReplace(*Inst); + Path.pop_back(); + } else { + return; + } + } +} + +Value *PointerReplacer::getReplacement(Value *V) { + auto Loc = WorkMap.find(V); + if (Loc != WorkMap.end()) + return Loc->second; + return nullptr; +} + +void PointerReplacer::replace(Instruction *I) { + if (getReplacement(I)) + return; + + if (auto *LT = dyn_cast<LoadInst>(I)) { + auto *V = getReplacement(LT->getPointerOperand()); + assert(V && "Operand not replaced"); + auto *NewI = new LoadInst(I->getType(), V); + NewI->takeName(LT); + IC.InsertNewInstWith(NewI, *LT); + IC.replaceInstUsesWith(*LT, NewI); + WorkMap[LT] = NewI; + } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { + auto *V = getReplacement(GEP->getPointerOperand()); + assert(V && "Operand not replaced"); + SmallVector<Value *, 8> Indices; + Indices.append(GEP->idx_begin(), GEP->idx_end()); + auto *NewI = GetElementPtrInst::Create( + V->getType()->getPointerElementType(), V, Indices); + IC.InsertNewInstWith(NewI, *GEP); + NewI->takeName(GEP); + WorkMap[GEP] = NewI; + } else if (auto *BC = dyn_cast<BitCastInst>(I)) { + auto *V = getReplacement(BC->getOperand(0)); + assert(V && "Operand not replaced"); + auto *NewT = PointerType::get(BC->getType()->getPointerElementType(), + V->getType()->getPointerAddressSpace()); + auto *NewI = new BitCastInst(V, NewT); + IC.InsertNewInstWith(NewI, *BC); + NewI->takeName(BC); + WorkMap[BC] = NewI; + } else { + llvm_unreachable("should never reach here"); + } +} + +void PointerReplacer::replacePointer(Instruction &I, Value *V) { +#ifndef NDEBUG + auto *PT = cast<PointerType>(I.getType()); + auto *NT = cast<PointerType>(V->getType()); + assert(PT != NT && PT->getElementType() == NT->getElementType() && + "Invalid usage"); +#endif + WorkMap[&I] = V; + findLoadAndReplace(I); +} + +Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { + if (auto *I = simplifyAllocaArraySize(*this, AI)) + return I; + + if (AI.getAllocatedType()->isSized()) { + // If the alignment is 0 (unspecified), assign it the preferred alignment. + if (AI.getAlignment() == 0) + AI.setAlignment( + MaybeAlign(DL.getPrefTypeAlignment(AI.getAllocatedType()))); + + // Move all alloca's of zero byte objects to the entry block and merge them + // together. Note that we only do this for alloca's, because malloc should + // allocate and return a unique pointer, even for a zero byte allocation. + if (DL.getTypeAllocSize(AI.getAllocatedType()) == 0) { + // For a zero sized alloca there is no point in doing an array allocation. + // This is helpful if the array size is a complicated expression not used + // elsewhere. 
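+      // (Illustrative example: 'alloca [0 x i32], i64 %n' becomes
+      // 'alloca [0 x i32], i64 1' regardless of %n.)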
+ if (AI.isArrayAllocation()) { + AI.setOperand(0, ConstantInt::get(AI.getArraySize()->getType(), 1)); + return &AI; + } + + // Get the first instruction in the entry block. + BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock(); + Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg(); + if (FirstInst != &AI) { + // If the entry block doesn't start with a zero-size alloca then move + // this one to the start of the entry block. There is no problem with + // dominance as the array size was forced to a constant earlier already. + AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst); + if (!EntryAI || !EntryAI->getAllocatedType()->isSized() || + DL.getTypeAllocSize(EntryAI->getAllocatedType()) != 0) { + AI.moveBefore(FirstInst); + return &AI; + } + + // If the alignment of the entry block alloca is 0 (unspecified), + // assign it the preferred alignment. + if (EntryAI->getAlignment() == 0) + EntryAI->setAlignment( + MaybeAlign(DL.getPrefTypeAlignment(EntryAI->getAllocatedType()))); + // Replace this zero-sized alloca with the one at the start of the entry + // block after ensuring that the address will be aligned enough for both + // types. + const MaybeAlign MaxAlign( + std::max(EntryAI->getAlignment(), AI.getAlignment())); + EntryAI->setAlignment(MaxAlign); + if (AI.getType() != EntryAI->getType()) + return new BitCastInst(EntryAI, AI.getType()); + return replaceInstUsesWith(AI, EntryAI); + } + } + } + + if (AI.getAlignment()) { + // Check to see if this allocation is only modified by a memcpy/memmove from + // a constant global whose alignment is equal to or exceeds that of the + // allocation. If this is the case, we can change all users to use + // the constant global instead. This is commonly produced by the CFE by + // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' + // is only subsequently read. + SmallVector<Instruction *, 4> ToDelete; + if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) { + unsigned SourceAlign = getOrEnforceKnownAlignment( + Copy->getSource(), AI.getAlignment(), DL, &AI, &AC, &DT); + if (AI.getAlignment() <= SourceAlign && + isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) { + LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n'); + LLVM_DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); + for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) + eraseInstFromFunction(*ToDelete[i]); + Constant *TheSrc = cast<Constant>(Copy->getSource()); + auto *SrcTy = TheSrc->getType(); + auto *DestTy = PointerType::get(AI.getType()->getPointerElementType(), + SrcTy->getPointerAddressSpace()); + Constant *Cast = + ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, DestTy); + if (AI.getType()->getPointerAddressSpace() == + SrcTy->getPointerAddressSpace()) { + Instruction *NewI = replaceInstUsesWith(AI, Cast); + eraseInstFromFunction(*Copy); + ++NumGlobalCopies; + return NewI; + } else { + PointerReplacer PtrReplacer(*this); + PtrReplacer.replacePointer(AI, Cast); + ++NumGlobalCopies; + } + } + } + } + + // At last, use the generic allocation site handler to aggressively remove + // unused allocas. + return visitAllocSite(AI); +} + +// Are we allowed to form a atomic load or store of this type? +static bool isSupportedAtomicType(Type *Ty) { + return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy(); +} + +/// Helper to combine a load to a new type. +/// +/// This just does the work of combining a load to a new type. It handles +/// metadata, etc., and returns the new instruction. 
The \c NewTy should be the +/// loaded *value* type. This will convert it to a pointer, cast the operand to +/// that pointer type, load it, etc. +/// +/// Note that this will create all of the instructions with whatever insert +/// point the \c InstCombiner currently is using. +static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewTy, + const Twine &Suffix = "") { + assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) && + "can't fold an atomic load to requested type"); + + Value *Ptr = LI.getPointerOperand(); + unsigned AS = LI.getPointerAddressSpace(); + Value *NewPtr = nullptr; + if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) && + NewPtr->getType()->getPointerElementType() == NewTy && + NewPtr->getType()->getPointerAddressSpace() == AS)) + NewPtr = IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS)); + + LoadInst *NewLoad = IC.Builder.CreateAlignedLoad( + NewTy, NewPtr, LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix); + NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); + copyMetadataForLoad(*NewLoad, LI); + return NewLoad; +} + +/// Combine a store to a new type. +/// +/// Returns the newly created store instruction. +static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value *V) { + assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) && + "can't fold an atomic store of requested type"); + + Value *Ptr = SI.getPointerOperand(); + unsigned AS = SI.getPointerAddressSpace(); + SmallVector<std::pair<unsigned, MDNode *>, 8> MD; + SI.getAllMetadata(MD); + + StoreInst *NewStore = IC.Builder.CreateAlignedStore( + V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)), + SI.getAlignment(), SI.isVolatile()); + NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID()); + for (const auto &MDPair : MD) { + unsigned ID = MDPair.first; + MDNode *N = MDPair.second; + // Note, essentially every kind of metadata should be preserved here! This + // routine is supposed to clone a store instruction changing *only its + // type*. The only metadata it makes sense to drop is metadata which is + // invalidated when the pointer type changes. This should essentially + // never be the case in LLVM, but we explicitly switch over only known + // metadata to be conservatively correct. If you are adding metadata to + // LLVM which pertains to stores, you almost certainly want to add it + // here. + switch (ID) { + case LLVMContext::MD_dbg: + case LLVMContext::MD_tbaa: + case LLVMContext::MD_prof: + case LLVMContext::MD_fpmath: + case LLVMContext::MD_tbaa_struct: + case LLVMContext::MD_alias_scope: + case LLVMContext::MD_noalias: + case LLVMContext::MD_nontemporal: + case LLVMContext::MD_mem_parallel_loop_access: + case LLVMContext::MD_access_group: + // All of these directly apply. + NewStore->setMetadata(ID, N); + break; + case LLVMContext::MD_invariant_load: + case LLVMContext::MD_nonnull: + case LLVMContext::MD_range: + case LLVMContext::MD_align: + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: + // These don't apply for stores. + break; + } + } + + return NewStore; +} + +/// Returns true if instruction represent minmax pattern like: +/// select ((cmp load V1, load V2), V1, V2). +static bool isMinMaxWithLoads(Value *V) { + assert(V->getType()->isPointerTy() && "Expected pointer type."); + // Ignore possible ty* to ixx* bitcast. + V = peekThroughBitcast(V); + // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax + // pattern. 
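+  // For example (illustrative only):
+  //   %v1 = load i32, i32* %p1
+  //   %v2 = load i32, i32* %p2
+  //   %c  = icmp slt i32 %v1, %v2
+  //   %min = select i1 %c, i32* %p1, i32* %p2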
+ CmpInst::Predicate Pred; + Instruction *L1; + Instruction *L2; + Value *LHS; + Value *RHS; + if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)), + m_Value(LHS), m_Value(RHS)))) + return false; + return (match(L1, m_Load(m_Specific(LHS))) && + match(L2, m_Load(m_Specific(RHS)))) || + (match(L1, m_Load(m_Specific(RHS))) && + match(L2, m_Load(m_Specific(LHS)))); +} + +/// Combine loads to match the type of their uses' value after looking +/// through intervening bitcasts. +/// +/// The core idea here is that if the result of a load is used in an operation, +/// we should load the type most conducive to that operation. For example, when +/// loading an integer and converting that immediately to a pointer, we should +/// instead directly load a pointer. +/// +/// However, this routine must never change the width of a load or the number of +/// loads as that would introduce a semantic change. This combine is expected to +/// be a semantic no-op which just allows loads to more closely model the types +/// of their consuming operations. +/// +/// Currently, we also refuse to change the precise type used for an atomic load +/// or a volatile load. This is debatable, and might be reasonable to change +/// later. However, it is risky in case some backend or other part of LLVM is +/// relying on the exact type loaded to select appropriate atomic operations. +static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) { + // FIXME: We could probably with some care handle both volatile and ordered + // atomic loads here but it isn't clear that this is important. + if (!LI.isUnordered()) + return nullptr; + + if (LI.use_empty()) + return nullptr; + + // swifterror values can't be bitcasted. + if (LI.getPointerOperand()->isSwiftError()) + return nullptr; + + Type *Ty = LI.getType(); + const DataLayout &DL = IC.getDataLayout(); + + // Try to canonicalize loads which are only ever stored to operate over + // integers instead of any other type. We only do this when the loaded type + // is sized and has a size exactly the same as its store size and the store + // size is a legal integer type. + // Do not perform canonicalization if minmax pattern is found (to avoid + // infinite loop). + if (!Ty->isIntegerTy() && Ty->isSized() && + DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) && + DL.typeSizeEqualsStoreSize(Ty) && + !DL.isNonIntegralPointerType(Ty) && + !isMinMaxWithLoads( + peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true))) { + if (all_of(LI.users(), [&LI](User *U) { + auto *SI = dyn_cast<StoreInst>(U); + return SI && SI->getPointerOperand() != &LI && + !SI->getPointerOperand()->isSwiftError(); + })) { + LoadInst *NewLoad = combineLoadToNewType( + IC, LI, + Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty))); + // Replace all the stores with stores of the newly loaded value. + for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) { + auto *SI = cast<StoreInst>(*UI++); + IC.Builder.SetInsertPoint(SI); + combineStoreToNewValue(IC, *SI, NewLoad); + IC.eraseInstFromFunction(*SI); + } + assert(LI.use_empty() && "Failed to remove all users of the load!"); + // Return the old load so the combiner can delete it safely. + return &LI; + } + } + + // Fold away bit casts of the loaded value by loading the desired type. + // We can do this for BitCastInsts as well as casts from and to pointer types, + // as long as those are noops (i.e., the source or dest type have the same + // bitwidth as the target's pointers). 
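+  // For example (illustrative only):
+  //   %x = load i32, i32* %p
+  //   %f = bitcast i32 %x to float
+  // becomes
+  //   %pf = bitcast i32* %p to float*
+  //   %f  = load float, float* %pf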
+ if (LI.hasOneUse()) + if (auto* CI = dyn_cast<CastInst>(LI.user_back())) + if (CI->isNoopCast(DL)) + if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) { + LoadInst *NewLoad = combineLoadToNewType(IC, LI, CI->getDestTy()); + CI->replaceAllUsesWith(NewLoad); + IC.eraseInstFromFunction(*CI); + return &LI; + } + + // FIXME: We should also canonicalize loads of vectors when their elements are + // cast to other types. + return nullptr; +} + +static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { + // FIXME: We could probably with some care handle both volatile and atomic + // stores here but it isn't clear that this is important. + if (!LI.isSimple()) + return nullptr; + + Type *T = LI.getType(); + if (!T->isAggregateType()) + return nullptr; + + StringRef Name = LI.getName(); + assert(LI.getAlignment() && "Alignment must be set at this point"); + + if (auto *ST = dyn_cast<StructType>(T)) { + // If the struct only have one element, we unpack. + auto NumElements = ST->getNumElements(); + if (NumElements == 1) { + LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U), + ".unpack"); + AAMDNodes AAMD; + LI.getAAMetadata(AAMD); + NewLoad->setAAMetadata(AAMD); + return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue( + UndefValue::get(T), NewLoad, 0, Name)); + } + + // We don't want to break loads with padding here as we'd loose + // the knowledge that padding exists for the rest of the pipeline. + const DataLayout &DL = IC.getDataLayout(); + auto *SL = DL.getStructLayout(ST); + if (SL->hasPadding()) + return nullptr; + + auto Align = LI.getAlignment(); + if (!Align) + Align = DL.getABITypeAlignment(ST); + + auto *Addr = LI.getPointerOperand(); + auto *IdxType = Type::getInt32Ty(T->getContext()); + auto *Zero = ConstantInt::get(IdxType, 0); + + Value *V = UndefValue::get(T); + for (unsigned i = 0; i < NumElements; i++) { + Value *Indices[2] = { + Zero, + ConstantInt::get(IdxType, i), + }; + auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), + Name + ".elt"); + auto EltAlign = MinAlign(Align, SL->getElementOffset(i)); + auto *L = IC.Builder.CreateAlignedLoad(ST->getElementType(i), Ptr, + EltAlign, Name + ".unpack"); + // Propagate AA metadata. It'll still be valid on the narrowed load. + AAMDNodes AAMD; + LI.getAAMetadata(AAMD); + L->setAAMetadata(AAMD); + V = IC.Builder.CreateInsertValue(V, L, i); + } + + V->setName(Name); + return IC.replaceInstUsesWith(LI, V); + } + + if (auto *AT = dyn_cast<ArrayType>(T)) { + auto *ET = AT->getElementType(); + auto NumElements = AT->getNumElements(); + if (NumElements == 1) { + LoadInst *NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack"); + AAMDNodes AAMD; + LI.getAAMetadata(AAMD); + NewLoad->setAAMetadata(AAMD); + return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue( + UndefValue::get(T), NewLoad, 0, Name)); + } + + // Bail out if the array is too large. Ideally we would like to optimize + // arrays of arbitrary size but this has a terrible impact on compile time. + // The threshold here is chosen arbitrarily, maybe needs a little bit of + // tuning. 
+ if (NumElements > IC.MaxArraySizeForCombine) + return nullptr; + + const DataLayout &DL = IC.getDataLayout(); + auto EltSize = DL.getTypeAllocSize(ET); + auto Align = LI.getAlignment(); + if (!Align) + Align = DL.getABITypeAlignment(T); + + auto *Addr = LI.getPointerOperand(); + auto *IdxType = Type::getInt64Ty(T->getContext()); + auto *Zero = ConstantInt::get(IdxType, 0); + + Value *V = UndefValue::get(T); + uint64_t Offset = 0; + for (uint64_t i = 0; i < NumElements; i++) { + Value *Indices[2] = { + Zero, + ConstantInt::get(IdxType, i), + }; + auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices), + Name + ".elt"); + auto *L = IC.Builder.CreateAlignedLoad( + AT->getElementType(), Ptr, MinAlign(Align, Offset), Name + ".unpack"); + AAMDNodes AAMD; + LI.getAAMetadata(AAMD); + L->setAAMetadata(AAMD); + V = IC.Builder.CreateInsertValue(V, L, i); + Offset += EltSize; + } + + V->setName(Name); + return IC.replaceInstUsesWith(LI, V); + } + + return nullptr; +} + +// If we can determine that all possible objects pointed to by the provided +// pointer value are, not only dereferenceable, but also definitively less than +// or equal to the provided maximum size, then return true. Otherwise, return +// false (constant global values and allocas fall into this category). +// +// FIXME: This should probably live in ValueTracking (or similar). +static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize, + const DataLayout &DL) { + SmallPtrSet<Value *, 4> Visited; + SmallVector<Value *, 4> Worklist(1, V); + + do { + Value *P = Worklist.pop_back_val(); + P = P->stripPointerCasts(); + + if (!Visited.insert(P).second) + continue; + + if (SelectInst *SI = dyn_cast<SelectInst>(P)) { + Worklist.push_back(SI->getTrueValue()); + Worklist.push_back(SI->getFalseValue()); + continue; + } + + if (PHINode *PN = dyn_cast<PHINode>(P)) { + for (Value *IncValue : PN->incoming_values()) + Worklist.push_back(IncValue); + continue; + } + + if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) { + if (GA->isInterposable()) + return false; + Worklist.push_back(GA->getAliasee()); + continue; + } + + // If we know how big this object is, and it is less than MaxSize, continue + // searching. Otherwise, return false. + if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) { + if (!AI->getAllocatedType()->isSized()) + return false; + + ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize()); + if (!CS) + return false; + + uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType()); + // Make sure that, even if the multiplication below would wrap as an + // uint64_t, we still do the right thing. + if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize)) + return false; + continue; + } + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) { + if (!GV->hasDefinitiveInitializer() || !GV->isConstant()) + return false; + + uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType()); + if (InitSize > MaxSize) + return false; + continue; + } + + return false; + } while (!Worklist.empty()); + + return true; +} + +// If we're indexing into an object of a known size, and the outer index is +// not a constant, but having any value but zero would lead to undefined +// behavior, replace it with zero. +// +// For example, if we have: +// @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4 +// ... +// %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x +// ... = load i32* %arrayidx, align 4 +// Then we know that we can replace %x in the GEP with i64 0. 
+// +// FIXME: We could fold any GEP index to zero that would cause UB if it were +// not zero. Currently, we only handle the first such index. Also, we could +// also search through non-zero constant indices if we kept track of the +// offsets those indices implied. +static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI, + Instruction *MemI, unsigned &Idx) { + if (GEPI->getNumOperands() < 2) + return false; + + // Find the first non-zero index of a GEP. If all indices are zero, return + // one past the last index. + auto FirstNZIdx = [](const GetElementPtrInst *GEPI) { + unsigned I = 1; + for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) { + Value *V = GEPI->getOperand(I); + if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) + if (CI->isZero()) + continue; + + break; + } + + return I; + }; + + // Skip through initial 'zero' indices, and find the corresponding pointer + // type. See if the next index is not a constant. + Idx = FirstNZIdx(GEPI); + if (Idx == GEPI->getNumOperands()) + return false; + if (isa<Constant>(GEPI->getOperand(Idx))) + return false; + + SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx); + Type *AllocTy = + GetElementPtrInst::getIndexedType(GEPI->getSourceElementType(), Ops); + if (!AllocTy || !AllocTy->isSized()) + return false; + const DataLayout &DL = IC.getDataLayout(); + uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy); + + // If there are more indices after the one we might replace with a zero, make + // sure they're all non-negative. If any of them are negative, the overall + // address being computed might be before the base address determined by the + // first non-zero index. + auto IsAllNonNegative = [&]() { + for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) { + KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI); + if (Known.isNonNegative()) + continue; + return false; + } + + return true; + }; + + // FIXME: If the GEP is not inbounds, and there are extra indices after the + // one we'll replace, those could cause the address computation to wrap + // (rendering the IsAllNonNegative() check below insufficient). We can do + // better, ignoring zero indices (and other indices we can prove small + // enough not to wrap). + if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds()) + return false; + + // Note that isObjectSizeLessThanOrEq will return true only if the pointer is + // also known to be dereferenceable. + return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) && + IsAllNonNegative(); +} + +// If we're indexing into an object with a variable index for the memory +// access, but the object has only one element, we can assume that the index +// will always be zero. If we replace the GEP, return it. 
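+// For example (illustrative IR, with a one-element global @g):
+//   %p = getelementptr inbounds [1 x i32], [1 x i32]* @g, i64 0, i64 %i
+//   ... = load i32, i32* %p
+// Here a clone of the GEP with %i replaced by i64 0 can feed the load instead.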
+template <typename T> +static Instruction *replaceGEPIdxWithZero(InstCombiner &IC, Value *Ptr, + T &MemI) { + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) { + unsigned Idx; + if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) { + Instruction *NewGEPI = GEPI->clone(); + NewGEPI->setOperand(Idx, + ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0)); + NewGEPI->insertBefore(GEPI); + MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI); + return NewGEPI; + } + } + + return nullptr; +} + +static bool canSimplifyNullStoreOrGEP(StoreInst &SI) { + if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace())) + return false; + + auto *Ptr = SI.getPointerOperand(); + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) + Ptr = GEPI->getOperand(0); + return (isa<ConstantPointerNull>(Ptr) && + !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace())); +} + +static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) { + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) { + const Value *GEPI0 = GEPI->getOperand(0); + if (isa<ConstantPointerNull>(GEPI0) && + !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace())) + return true; + } + if (isa<UndefValue>(Op) || + (isa<ConstantPointerNull>(Op) && + !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace()))) + return true; + return false; +} + +Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { + Value *Op = LI.getOperand(0); + + // Try to canonicalize the loaded type. + if (Instruction *Res = combineLoadToOperationType(*this, LI)) + return Res; + + // Attempt to improve the alignment. + unsigned KnownAlign = getOrEnforceKnownAlignment( + Op, DL.getPrefTypeAlignment(LI.getType()), DL, &LI, &AC, &DT); + unsigned LoadAlign = LI.getAlignment(); + unsigned EffectiveLoadAlign = + LoadAlign != 0 ? LoadAlign : DL.getABITypeAlignment(LI.getType()); + + if (KnownAlign > EffectiveLoadAlign) + LI.setAlignment(MaybeAlign(KnownAlign)); + else if (LoadAlign == 0) + LI.setAlignment(MaybeAlign(EffectiveLoadAlign)); + + // Replace GEP indices if possible. + if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) { + Worklist.Add(NewGEPI); + return &LI; + } + + if (Instruction *Res = unpackLoadToAggregate(*this, LI)) + return Res; + + // Do really simple store-to-load forwarding and load CSE, to catch cases + // where there are several consecutive memory accesses to the same location, + // separated by a few arithmetic operations. + BasicBlock::iterator BBI(LI); + bool IsLoadCSE = false; + if (Value *AvailableVal = FindAvailableLoadedValue( + &LI, LI.getParent(), BBI, DefMaxInstsToScan, AA, &IsLoadCSE)) { + if (IsLoadCSE) + combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false); + + return replaceInstUsesWith( + LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(), + LI.getName() + ".cast")); + } + + // None of the following transforms are legal for volatile/ordered atomic + // loads. Most of them do apply for unordered atomics. + if (!LI.isUnordered()) return nullptr; + + // load(gep null, ...) -> unreachable + // load null/undef -> unreachable + // TODO: Consider a target hook for valid address spaces for this xforms. + if (canSimplifyNullLoadOrGEP(LI, Op)) { + // Insert a new store to null instruction before the load to indicate + // that this code is not reachable. We do this instead of inserting + // an unreachable instruction directly because we cannot modify the + // CFG. 
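+    // The resulting 'store undef, null' is itself undefined behavior, and
+    // SimplifyCFG later turns such stores into 'unreachable' (see the
+    // store-to-null note in visitStoreInst below).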
+ StoreInst *SI = new StoreInst(UndefValue::get(LI.getType()), + Constant::getNullValue(Op->getType()), &LI); + SI->setDebugLoc(LI.getDebugLoc()); + return replaceInstUsesWith(LI, UndefValue::get(LI.getType())); + } + + if (Op->hasOneUse()) { + // Change select and PHI nodes to select values instead of addresses: this + // helps alias analysis out a lot, allows many others simplifications, and + // exposes redundancy in the code. + // + // Note that we cannot do the transformation unless we know that the + // introduced loads cannot trap! Something like this is valid as long as + // the condition is always false: load (select bool %C, int* null, int* %G), + // but it would not be valid if we transformed it to load from null + // unconditionally. + // + if (SelectInst *SI = dyn_cast<SelectInst>(Op)) { + // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2). + const MaybeAlign Alignment(LI.getAlignment()); + if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(), + Alignment, DL, SI) && + isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(), + Alignment, DL, SI)) { + LoadInst *V1 = + Builder.CreateLoad(LI.getType(), SI->getOperand(1), + SI->getOperand(1)->getName() + ".val"); + LoadInst *V2 = + Builder.CreateLoad(LI.getType(), SI->getOperand(2), + SI->getOperand(2)->getName() + ".val"); + assert(LI.isUnordered() && "implied by above"); + V1->setAlignment(Alignment); + V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); + V2->setAlignment(Alignment); + V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); + return SelectInst::Create(SI->getCondition(), V1, V2); + } + + // load (select (cond, null, P)) -> load P + if (isa<ConstantPointerNull>(SI->getOperand(1)) && + !NullPointerIsDefined(SI->getFunction(), + LI.getPointerAddressSpace())) { + LI.setOperand(0, SI->getOperand(2)); + return &LI; + } + + // load (select (cond, P, null)) -> load P + if (isa<ConstantPointerNull>(SI->getOperand(2)) && + !NullPointerIsDefined(SI->getFunction(), + LI.getPointerAddressSpace())) { + LI.setOperand(0, SI->getOperand(1)); + return &LI; + } + } + } + return nullptr; +} + +/// Look for extractelement/insertvalue sequence that acts like a bitcast. +/// +/// \returns underlying value that was "cast", or nullptr otherwise. +/// +/// For example, if we have: +/// +/// %E0 = extractelement <2 x double> %U, i32 0 +/// %V0 = insertvalue [2 x double] undef, double %E0, 0 +/// %E1 = extractelement <2 x double> %U, i32 1 +/// %V1 = insertvalue [2 x double] %V0, double %E1, 1 +/// +/// and the layout of a <2 x double> is isomorphic to a [2 x double], +/// then %V1 can be safely approximated by a conceptual "bitcast" of %U. +/// Note that %U may contain non-undef values where %V1 has undef. +static Value *likeBitCastFromVector(InstCombiner &IC, Value *V) { + Value *U = nullptr; + while (auto *IV = dyn_cast<InsertValueInst>(V)) { + auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand()); + if (!E) + return nullptr; + auto *W = E->getVectorOperand(); + if (!U) + U = W; + else if (U != W) + return nullptr; + auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand()); + if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin()) + return nullptr; + V = IV->getAggregateOperand(); + } + if (!isa<UndefValue>(V) ||!U) + return nullptr; + + auto *UT = cast<VectorType>(U->getType()); + auto *VT = V->getType(); + // Check that types UT and VT are bitwise isomorphic. 
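+  // That is: the same store size in bits, the same number of elements, and,
+  // for struct types, every field type equal to the vector's element type.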
+  const auto &DL = IC.getDataLayout();
+  if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
+    return nullptr;
+  }
+  if (auto *AT = dyn_cast<ArrayType>(VT)) {
+    if (AT->getNumElements() != UT->getNumElements())
+      return nullptr;
+  } else {
+    auto *ST = cast<StructType>(VT);
+    if (ST->getNumElements() != UT->getNumElements())
+      return nullptr;
+    for (const auto *EltT : ST->elements()) {
+      if (EltT != UT->getElementType())
+        return nullptr;
+    }
+  }
+  return U;
+}
+
+/// Combine stores to match the type of value being stored.
+///
+/// The core idea here is that the memory does not have any intrinsic type and
+/// where we can we should match the type of a store to the type of value being
+/// stored.
+///
+/// However, this routine must never change the width of a store or the number
+/// of stores as that would introduce a semantic change. This combine is
+/// expected to be a semantic no-op which just allows stores to more closely
+/// model the types of their incoming values.
+///
+/// Currently, we also refuse to change the precise type used for an atomic or
+/// volatile store. This is debatable, and might be reasonable to change later.
+/// However, it is risky in case some backend or other part of LLVM is relying
+/// on the exact type stored to select appropriate atomic operations.
+///
+/// \returns true if the store was successfully combined away. This indicates
+/// the caller must erase the store instruction. We have to let the caller erase
+/// the store instruction as otherwise there is no way to signal whether it was
+/// combined or not: IC.EraseInstFromFunction returns a null pointer.
+static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) {
+  // FIXME: We could probably with some care handle both volatile and ordered
+  // atomic stores here but it isn't clear that this is important.
+  if (!SI.isUnordered())
+    return false;
+
+  // swifterror values can't be bitcasted.
+  if (SI.getPointerOperand()->isSwiftError())
+    return false;
+
+  Value *V = SI.getValueOperand();
+
+  // Fold away bit casts of the stored value by storing the original type.
+  if (auto *BC = dyn_cast<BitCastInst>(V)) {
+    V = BC->getOperand(0);
+    if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
+      combineStoreToNewValue(IC, SI, V);
+      return true;
+    }
+  }
+
+  if (Value *U = likeBitCastFromVector(IC, V))
+    if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
+      combineStoreToNewValue(IC, SI, U);
+      return true;
+    }
+
+  // FIXME: We should also canonicalize stores of vectors when their elements
+  // are cast to other types.
+  return false;
+}
+
+static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
+  // FIXME: We could probably with some care handle both volatile and atomic
+  // stores here but it isn't clear that this is important.
+  if (!SI.isSimple())
+    return false;
+
+  Value *V = SI.getValueOperand();
+  Type *T = V->getType();
+
+  if (!T->isAggregateType())
+    return false;
+
+  if (auto *ST = dyn_cast<StructType>(T)) {
+    // If the struct only has one element, we unpack.
+    unsigned Count = ST->getNumElements();
+    if (Count == 1) {
+      V = IC.Builder.CreateExtractValue(V, 0);
+      combineStoreToNewValue(IC, SI, V);
+      return true;
+    }
+
+    // We don't want to break up stores with padding here as we'd lose
+    // the knowledge that padding exists for the rest of the pipeline.
+    const DataLayout &DL = IC.getDataLayout();
+    auto *SL = DL.getStructLayout(ST);
+    if (SL->hasPadding())
+      return false;
+
+    auto Align = SI.getAlignment();
+    if (!Align)
+      Align = DL.getABITypeAlignment(ST);
+
+    SmallString<16> EltName = V->getName();
+    EltName += ".elt";
+    auto *Addr = SI.getPointerOperand();
+    SmallString<16> AddrName = Addr->getName();
+    AddrName += ".repack";
+
+    auto *IdxType = Type::getInt32Ty(ST->getContext());
+    auto *Zero = ConstantInt::get(IdxType, 0);
+    for (unsigned i = 0; i < Count; i++) {
+      Value *Indices[2] = {
+        Zero,
+        ConstantInt::get(IdxType, i),
+      };
+      auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
+                                               AddrName);
+      auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
+      auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
+      llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
+      AAMDNodes AAMD;
+      SI.getAAMetadata(AAMD);
+      NS->setAAMetadata(AAMD);
+    }
+
+    return true;
+  }
+
+  if (auto *AT = dyn_cast<ArrayType>(T)) {
+    // If the array only has one element, we unpack.
+    auto NumElements = AT->getNumElements();
+    if (NumElements == 1) {
+      V = IC.Builder.CreateExtractValue(V, 0);
+      combineStoreToNewValue(IC, SI, V);
+      return true;
+    }
+
+    // Bail out if the array is too large. Ideally we would like to optimize
+    // arrays of arbitrary size but this has a terrible impact on compile time.
+    // The threshold here is chosen arbitrarily, maybe needs a little bit of
+    // tuning.
+    if (NumElements > IC.MaxArraySizeForCombine)
+      return false;
+
+    const DataLayout &DL = IC.getDataLayout();
+    auto EltSize = DL.getTypeAllocSize(AT->getElementType());
+    auto Align = SI.getAlignment();
+    if (!Align)
+      Align = DL.getABITypeAlignment(T);
+
+    SmallString<16> EltName = V->getName();
+    EltName += ".elt";
+    auto *Addr = SI.getPointerOperand();
+    SmallString<16> AddrName = Addr->getName();
+    AddrName += ".repack";
+
+    auto *IdxType = Type::getInt64Ty(T->getContext());
+    auto *Zero = ConstantInt::get(IdxType, 0);
+
+    uint64_t Offset = 0;
+    for (uint64_t i = 0; i < NumElements; i++) {
+      Value *Indices[2] = {
+        Zero,
+        ConstantInt::get(IdxType, i),
+      };
+      auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
+                                               AddrName);
+      auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
+      auto EltAlign = MinAlign(Align, Offset);
+      Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
+      AAMDNodes AAMD;
+      SI.getAAMetadata(AAMD);
+      NS->setAAMetadata(AAMD);
+      Offset += EltSize;
+    }
+
+    return true;
+  }
+
+  return false;
+}
+
+/// equivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+///   %t0 = getelementptr \@a, 0, 3
+///   store i32 0, i32* %t0
+///   %t1 = getelementptr \@a, 0, 3
+///   %t2 = load i32* %t1
+///
+static bool equivalentAddressValues(Value *A, Value *B) {
+  // Test if the values are trivially equivalent.
+  if (A == B) return true;
+
+  // Test if the values come from identical arithmetic instructions.
+  // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
+  // it's only used to compare two uses within the same basic block, which
+  // means that they'll always either have the same value or one of them
+  // will have an undefined value.
+ if (isa<BinaryOperator>(A) || + isa<CastInst>(A) || + isa<PHINode>(A) || + isa<GetElementPtrInst>(A)) + if (Instruction *BI = dyn_cast<Instruction>(B)) + if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI)) + return true; + + // Otherwise they may not be equivalent. + return false; +} + +/// Converts store (bitcast (load (bitcast (select ...)))) to +/// store (load (select ...)), where select is minmax: +/// select ((cmp load V1, load V2), V1, V2). +static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC, + StoreInst &SI) { + // bitcast? + if (!match(SI.getPointerOperand(), m_BitCast(m_Value()))) + return false; + // load? integer? + Value *LoadAddr; + if (!match(SI.getValueOperand(), m_Load(m_BitCast(m_Value(LoadAddr))))) + return false; + auto *LI = cast<LoadInst>(SI.getValueOperand()); + if (!LI->getType()->isIntegerTy()) + return false; + if (!isMinMaxWithLoads(LoadAddr)) + return false; + + if (!all_of(LI->users(), [LI, LoadAddr](User *U) { + auto *SI = dyn_cast<StoreInst>(U); + return SI && SI->getPointerOperand() != LI && + peekThroughBitcast(SI->getPointerOperand()) != LoadAddr && + !SI->getPointerOperand()->isSwiftError(); + })) + return false; + + IC.Builder.SetInsertPoint(LI); + LoadInst *NewLI = combineLoadToNewType( + IC, *LI, LoadAddr->getType()->getPointerElementType()); + // Replace all the stores with stores of the newly loaded value. + for (auto *UI : LI->users()) { + auto *USI = cast<StoreInst>(UI); + IC.Builder.SetInsertPoint(USI); + combineStoreToNewValue(IC, *USI, NewLI); + } + IC.replaceInstUsesWith(*LI, UndefValue::get(LI->getType())); + IC.eraseInstFromFunction(*LI); + return true; +} + +Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { + Value *Val = SI.getOperand(0); + Value *Ptr = SI.getOperand(1); + + // Try to canonicalize the stored type. + if (combineStoreToValueType(*this, SI)) + return eraseInstFromFunction(SI); + + // Attempt to improve the alignment. + const Align KnownAlign = Align(getOrEnforceKnownAlignment( + Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT)); + const MaybeAlign StoreAlign = MaybeAlign(SI.getAlignment()); + const Align EffectiveStoreAlign = + StoreAlign ? *StoreAlign : Align(DL.getABITypeAlignment(Val->getType())); + + if (KnownAlign > EffectiveStoreAlign) + SI.setAlignment(KnownAlign); + else if (!StoreAlign) + SI.setAlignment(EffectiveStoreAlign); + + // Try to canonicalize the stored type. + if (unpackStoreToAggregate(*this, SI)) + return eraseInstFromFunction(SI); + + if (removeBitcastsFromLoadStoreOnMinMax(*this, SI)) + return eraseInstFromFunction(SI); + + // Replace GEP indices if possible. + if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) { + Worklist.Add(NewGEPI); + return &SI; + } + + // Don't hack volatile/ordered stores. + // FIXME: Some bits are legal for ordered atomic stores; needs refactoring. + if (!SI.isUnordered()) return nullptr; + + // If the RHS is an alloca with a single use, zapify the store, making the + // alloca dead. + if (Ptr->hasOneUse()) { + if (isa<AllocaInst>(Ptr)) + return eraseInstFromFunction(SI); + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { + if (isa<AllocaInst>(GEP->getOperand(0))) { + if (GEP->getOperand(0)->hasOneUse()) + return eraseInstFromFunction(SI); + } + } + } + + // If we have a store to a location which is known constant, we can conclude + // that the store must be storing the constant value (else the memory + // wouldn't be constant), and this must be a noop. 
+ if (AA->pointsToConstantMemory(Ptr)) + return eraseInstFromFunction(SI); + + // Do really simple DSE, to catch cases where there are several consecutive + // stores to the same location, separated by a few arithmetic operations. This + // situation often occurs with bitfield accesses. + BasicBlock::iterator BBI(SI); + for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts; + --ScanInsts) { + --BBI; + // Don't count debug info directives, lest they affect codegen, + // and we skip pointer-to-pointer bitcasts, which are NOPs. + if (isa<DbgInfoIntrinsic>(BBI) || + (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) { + ScanInsts++; + continue; + } + + if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) { + // Prev store isn't volatile, and stores to the same location? + if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1), + SI.getOperand(1))) { + ++NumDeadStore; + ++BBI; + eraseInstFromFunction(*PrevSI); + continue; + } + break; + } + + // If this is a load, we have to stop. However, if the loaded value is from + // the pointer we're loading and is producing the pointer we're storing, + // then *this* store is dead (X = load P; store X -> P). + if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) { + if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) { + assert(SI.isUnordered() && "can't eliminate ordering operation"); + return eraseInstFromFunction(SI); + } + + // Otherwise, this is a load from some other location. Stores before it + // may not be dead. + break; + } + + // Don't skip over loads, throws or things that can modify memory. + if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow()) + break; + } + + // store X, null -> turns into 'unreachable' in SimplifyCFG + // store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG + if (canSimplifyNullStoreOrGEP(SI)) { + if (!isa<UndefValue>(Val)) { + SI.setOperand(0, UndefValue::get(Val->getType())); + if (Instruction *U = dyn_cast<Instruction>(Val)) + Worklist.Add(U); // Dropped a use. + } + return nullptr; // Do not modify these! + } + + // store undef, Ptr -> noop + if (isa<UndefValue>(Val)) + return eraseInstFromFunction(SI); + + // If this store is the second-to-last instruction in the basic block + // (excluding debug info and bitcasts of pointers) and if the block ends with + // an unconditional branch, try to move the store to the successor block. + BBI = SI.getIterator(); + do { + ++BBI; + } while (isa<DbgInfoIntrinsic>(BBI) || + (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())); + + if (BranchInst *BI = dyn_cast<BranchInst>(BBI)) + if (BI->isUnconditional()) + mergeStoreIntoSuccessor(SI); + + return nullptr; +} + +/// Try to transform: +/// if () { *P = v1; } else { *P = v2 } +/// or: +/// *P = v1; if () { *P = v2; } +/// into a phi node with a store in the successor. +bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) { + assert(SI.isUnordered() && + "This code has not been audited for volatile or ordered store case."); + + // Check if the successor block has exactly 2 incoming edges. + BasicBlock *StoreBB = SI.getParent(); + BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0); + if (!DestBB->hasNPredecessors(2)) + return false; + + // Capture the other block (the block that doesn't contain our store). + pred_iterator PredIter = pred_begin(DestBB); + if (*PredIter == StoreBB) + ++PredIter; + BasicBlock *OtherBB = *PredIter; + + // Bail out if all of the relevant blocks aren't distinct. 
This can happen, + // for example, if SI is in an infinite loop. + if (StoreBB == DestBB || OtherBB == DestBB) + return false; + + // Verify that the other block ends in a branch and is not otherwise empty. + BasicBlock::iterator BBI(OtherBB->getTerminator()); + BranchInst *OtherBr = dyn_cast<BranchInst>(BBI); + if (!OtherBr || BBI == OtherBB->begin()) + return false; + + // If the other block ends in an unconditional branch, check for the 'if then + // else' case. There is an instruction before the branch. + StoreInst *OtherStore = nullptr; + if (OtherBr->isUnconditional()) { + --BBI; + // Skip over debugging info. + while (isa<DbgInfoIntrinsic>(BBI) || + (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) { + if (BBI==OtherBB->begin()) + return false; + --BBI; + } + // If this isn't a store, isn't a store to the same location, or is not the + // right kind of store, bail out. + OtherStore = dyn_cast<StoreInst>(BBI); + if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) || + !SI.isSameOperationAs(OtherStore)) + return false; + } else { + // Otherwise, the other block ended with a conditional branch. If one of the + // destinations is StoreBB, then we have the if/then case. + if (OtherBr->getSuccessor(0) != StoreBB && + OtherBr->getSuccessor(1) != StoreBB) + return false; + + // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an + // if/then triangle. See if there is a store to the same ptr as SI that + // lives in OtherBB. + for (;; --BBI) { + // Check to see if we find the matching store. + if ((OtherStore = dyn_cast<StoreInst>(BBI))) { + if (OtherStore->getOperand(1) != SI.getOperand(1) || + !SI.isSameOperationAs(OtherStore)) + return false; + break; + } + // If we find something that may be using or overwriting the stored + // value, or if we run out of instructions, we can't do the transform. + if (BBI->mayReadFromMemory() || BBI->mayThrow() || + BBI->mayWriteToMemory() || BBI == OtherBB->begin()) + return false; + } + + // In order to eliminate the store in OtherBr, we have to make sure nothing + // reads or overwrites the stored value in StoreBB. + for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) { + // FIXME: This should really be AA driven. + if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory()) + return false; + } + } + + // Insert a PHI node now if we need it. + Value *MergedVal = OtherStore->getOperand(0); + // The debug locations of the original instructions might differ. Merge them. + DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(), + OtherStore->getDebugLoc()); + if (MergedVal != SI.getOperand(0)) { + PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge"); + PN->addIncoming(SI.getOperand(0), SI.getParent()); + PN->addIncoming(OtherStore->getOperand(0), OtherBB); + MergedVal = InsertNewInstBefore(PN, DestBB->front()); + PN->setDebugLoc(MergedLoc); + } + + // Advance to a place where it is safe to insert the new store and insert it. + BBI = DestBB->getFirstInsertionPt(); + StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), + MaybeAlign(SI.getAlignment()), + SI.getOrdering(), SI.getSyncScopeID()); + InsertNewInstBefore(NewSI, *BBI); + NewSI->setDebugLoc(MergedLoc); + + // If the two stores had AA tags, merge them. + AAMDNodes AATags; + SI.getAAMetadata(AATags); + if (AATags) { + OtherStore->getAAMetadata(AATags, /* Merge = */ true); + NewSI->setAAMetadata(AATags); + } + + // Nuke the old stores. 
+ eraseInstFromFunction(SI); + eraseInstFromFunction(*OtherStore); + return true; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp new file mode 100644 index 000000000000..0b9128a9f5a1 --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -0,0 +1,1482 @@ +//===- InstCombineMulDivRem.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the visit functions for mul, fmul, sdiv, udiv, fdiv, +// srem, urem, frem. +// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <utility> + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "instcombine" + +/// The specific integer value is used in a context where it is known to be +/// non-zero. If this allows us to simplify the computation, do so and return +/// the new operand, otherwise return null. +static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, + Instruction &CxtI) { + // If V has multiple uses, then we would have to do more analysis to determine + // if this is safe. For example, the use could be in dynamically unreached + // code. + if (!V->hasOneUse()) return nullptr; + + bool MadeChange = false; + + // ((1 << A) >>u B) --> (1 << (A-B)) + // Because V cannot be zero, we know that B is less than A. + Value *A = nullptr, *B = nullptr, *One = nullptr; + if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(One), m_Value(A))), m_Value(B))) && + match(One, m_One())) { + A = IC.Builder.CreateSub(A, B); + return IC.Builder.CreateShl(One, A); + } + + // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it + // inexact. Similarly for <<. + BinaryOperator *I = dyn_cast<BinaryOperator>(V); + if (I && I->isLogicalShift() && + IC.isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, &CxtI)) { + // We know that this is an exact/nuw shift and that the input is a + // non-zero context as well. 
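+    // For example, if (16 >>u B) is known to be non-zero, then B is at most 4,
+    // so only zero bits were shifted out and the lshr is exact; the same
+    // reasoning lets the shl be marked nuw.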
+ if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) { + I->setOperand(0, V2); + MadeChange = true; + } + + if (I->getOpcode() == Instruction::LShr && !I->isExact()) { + I->setIsExact(); + MadeChange = true; + } + + if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) { + I->setHasNoUnsignedWrap(); + MadeChange = true; + } + } + + // TODO: Lots more we could do here: + // If V is a phi node, we can call this on each of its operands. + // "select cond, X, 0" can simplify to "X". + + return MadeChange ? V : nullptr; +} + +/// A helper routine of InstCombiner::visitMul(). +/// +/// If C is a scalar/vector of known powers of 2, then this function returns +/// a new scalar/vector obtained from logBase2 of C. +/// Return a null pointer otherwise. +static Constant *getLogBase2(Type *Ty, Constant *C) { + const APInt *IVal; + if (match(C, m_APInt(IVal)) && IVal->isPowerOf2()) + return ConstantInt::get(Ty, IVal->logBase2()); + + if (!Ty->isVectorTy()) + return nullptr; + + SmallVector<Constant *, 4> Elts; + for (unsigned I = 0, E = Ty->getVectorNumElements(); I != E; ++I) { + Constant *Elt = C->getAggregateElement(I); + if (!Elt) + return nullptr; + if (isa<UndefValue>(Elt)) { + Elts.push_back(UndefValue::get(Ty->getScalarType())); + continue; + } + if (!match(Elt, m_APInt(IVal)) || !IVal->isPowerOf2()) + return nullptr; + Elts.push_back(ConstantInt::get(Ty->getScalarType(), IVal->logBase2())); + } + + return ConstantVector::get(Elts); +} + +// TODO: This is a specific form of a much more general pattern. +// We could detect a select with any binop identity constant, or we +// could use SimplifyBinOp to see if either arm of the select reduces. +// But that needs to be done carefully and/or while removing potential +// reverse canonicalizations as in InstCombiner::foldSelectIntoOp(). 
+static Value *foldMulSelectToNegate(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *Cond, *OtherOp; + + // mul (select Cond, 1, -1), OtherOp --> select Cond, OtherOp, -OtherOp + // mul OtherOp, (select Cond, 1, -1) --> select Cond, OtherOp, -OtherOp + if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())), + m_Value(OtherOp)))) + return Builder.CreateSelect(Cond, OtherOp, Builder.CreateNeg(OtherOp)); + + // mul (select Cond, -1, 1), OtherOp --> select Cond, -OtherOp, OtherOp + // mul OtherOp, (select Cond, -1, 1) --> select Cond, -OtherOp, OtherOp + if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())), + m_Value(OtherOp)))) + return Builder.CreateSelect(Cond, Builder.CreateNeg(OtherOp), OtherOp); + + // fmul (select Cond, 1.0, -1.0), OtherOp --> select Cond, OtherOp, -OtherOp + // fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp + if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(1.0), + m_SpecificFP(-1.0))), + m_Value(OtherOp)))) { + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(I.getFastMathFlags()); + return Builder.CreateSelect(Cond, OtherOp, Builder.CreateFNeg(OtherOp)); + } + + // fmul (select Cond, -1.0, 1.0), OtherOp --> select Cond, -OtherOp, OtherOp + // fmul OtherOp, (select Cond, -1.0, 1.0) --> select Cond, -OtherOp, OtherOp + if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(-1.0), + m_SpecificFP(1.0))), + m_Value(OtherOp)))) { + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(I.getFastMathFlags()); + return Builder.CreateSelect(Cond, Builder.CreateFNeg(OtherOp), OtherOp); + } + + return nullptr; +} + +Instruction *InstCombiner::visitMul(BinaryOperator &I) { + if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (SimplifyAssociativeOrCommutative(I)) + return &I; + + if (Instruction *X = foldVectorBinop(I)) + return X; + + if (Value *V = SimplifyUsingDistributiveLaws(I)) + return replaceInstUsesWith(I, V); + + // X * -1 == 0 - X + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (match(Op1, m_AllOnes())) { + BinaryOperator *BO = BinaryOperator::CreateNeg(Op0, I.getName()); + if (I.hasNoSignedWrap()) + BO->setHasNoSignedWrap(); + return BO; + } + + // Also allow combining multiply instructions on vectors. + { + Value *NewOp; + Constant *C1, *C2; + const APInt *IVal; + if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)), + m_Constant(C1))) && + match(C1, m_APInt(IVal))) { + // ((X << C2)*C1) == (X * (C1 << C2)) + Constant *Shl = ConstantExpr::getShl(C1, C2); + BinaryOperator *Mul = cast<BinaryOperator>(I.getOperand(0)); + BinaryOperator *BO = BinaryOperator::CreateMul(NewOp, Shl); + if (I.hasNoUnsignedWrap() && Mul->hasNoUnsignedWrap()) + BO->setHasNoUnsignedWrap(); + if (I.hasNoSignedWrap() && Mul->hasNoSignedWrap() && + Shl->isNotMinSignedValue()) + BO->setHasNoSignedWrap(); + return BO; + } + + if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) { + // Replace X*(2^C) with X << C, where C is either a scalar or a vector. 
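+      // For example, mul i32 %x, 8 becomes shl i32 %x, 3; vector constants
+      // whose elements are all powers of two are handled element-wise.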
+ if (Constant *NewCst = getLogBase2(NewOp->getType(), C1)) { + BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst); + + if (I.hasNoUnsignedWrap()) + Shl->setHasNoUnsignedWrap(); + if (I.hasNoSignedWrap()) { + const APInt *V; + if (match(NewCst, m_APInt(V)) && *V != V->getBitWidth() - 1) + Shl->setHasNoSignedWrap(); + } + + return Shl; + } + } + } + + if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { + // (Y - X) * (-(2**n)) -> (X - Y) * (2**n), for positive nonzero n + // (Y + const) * (-(2**n)) -> (-constY) * (2**n), for positive nonzero n + // The "* (2**n)" thus becomes a potential shifting opportunity. + { + const APInt & Val = CI->getValue(); + const APInt &PosVal = Val.abs(); + if (Val.isNegative() && PosVal.isPowerOf2()) { + Value *X = nullptr, *Y = nullptr; + if (Op0->hasOneUse()) { + ConstantInt *C1; + Value *Sub = nullptr; + if (match(Op0, m_Sub(m_Value(Y), m_Value(X)))) + Sub = Builder.CreateSub(X, Y, "suba"); + else if (match(Op0, m_Add(m_Value(Y), m_ConstantInt(C1)))) + Sub = Builder.CreateSub(Builder.CreateNeg(C1), Y, "subc"); + if (Sub) + return + BinaryOperator::CreateMul(Sub, + ConstantInt::get(Y->getType(), PosVal)); + } + } + } + } + + if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I)) + return FoldedMul; + + if (Value *FoldedMul = foldMulSelectToNegate(I, Builder)) + return replaceInstUsesWith(I, FoldedMul); + + // Simplify mul instructions with a constant RHS. + if (isa<Constant>(Op1)) { + // Canonicalize (X+C1)*CI -> X*CI+C1*CI. + Value *X; + Constant *C1; + if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) { + Value *Mul = Builder.CreateMul(C1, Op1); + // Only go forward with the transform if C1*CI simplifies to a tidier + // constant. + if (!match(Mul, m_Mul(m_Value(), m_Value()))) + return BinaryOperator::CreateAdd(Builder.CreateMul(X, Op1), Mul); + } + } + + // -X * C --> X * -C + Value *X, *Y; + Constant *Op1C; + if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Constant(Op1C))) + return BinaryOperator::CreateMul(X, ConstantExpr::getNeg(Op1C)); + + // -X * -Y --> X * Y + if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Neg(m_Value(Y)))) { + auto *NewMul = BinaryOperator::CreateMul(X, Y); + if (I.hasNoSignedWrap() && + cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap() && + cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap()) + NewMul->setHasNoSignedWrap(); + return NewMul; + } + + // -X * Y --> -(X * Y) + // X * -Y --> -(X * Y) + if (match(&I, m_c_Mul(m_OneUse(m_Neg(m_Value(X))), m_Value(Y)))) + return BinaryOperator::CreateNeg(Builder.CreateMul(X, Y)); + + // (X / Y) * Y = X - (X % Y) + // (X / Y) * -Y = (X % Y) - X + { + Value *Y = Op1; + BinaryOperator *Div = dyn_cast<BinaryOperator>(Op0); + if (!Div || (Div->getOpcode() != Instruction::UDiv && + Div->getOpcode() != Instruction::SDiv)) { + Y = Op0; + Div = dyn_cast<BinaryOperator>(Op1); + } + Value *Neg = dyn_castNegVal(Y); + if (Div && Div->hasOneUse() && + (Div->getOperand(1) == Y || Div->getOperand(1) == Neg) && + (Div->getOpcode() == Instruction::UDiv || + Div->getOpcode() == Instruction::SDiv)) { + Value *X = Div->getOperand(0), *DivOp1 = Div->getOperand(1); + + // If the division is exact, X % Y is zero, so we end up with X or -X. + if (Div->isExact()) { + if (DivOp1 == Y) + return replaceInstUsesWith(I, X); + return BinaryOperator::CreateNeg(X); + } + + auto RemOpc = Div->getOpcode() == Instruction::UDiv ? 
Instruction::URem + : Instruction::SRem; + Value *Rem = Builder.CreateBinOp(RemOpc, X, DivOp1); + if (DivOp1 == Y) + return BinaryOperator::CreateSub(X, Rem); + return BinaryOperator::CreateSub(Rem, X); + } + } + + /// i1 mul -> i1 and. + if (I.getType()->isIntOrIntVectorTy(1)) + return BinaryOperator::CreateAnd(Op0, Op1); + + // X*(1 << Y) --> X << Y + // (1 << Y)*X --> X << Y + { + Value *Y; + BinaryOperator *BO = nullptr; + bool ShlNSW = false; + if (match(Op0, m_Shl(m_One(), m_Value(Y)))) { + BO = BinaryOperator::CreateShl(Op1, Y); + ShlNSW = cast<ShlOperator>(Op0)->hasNoSignedWrap(); + } else if (match(Op1, m_Shl(m_One(), m_Value(Y)))) { + BO = BinaryOperator::CreateShl(Op0, Y); + ShlNSW = cast<ShlOperator>(Op1)->hasNoSignedWrap(); + } + if (BO) { + if (I.hasNoUnsignedWrap()) + BO->setHasNoUnsignedWrap(); + if (I.hasNoSignedWrap() && ShlNSW) + BO->setHasNoSignedWrap(); + return BO; + } + } + + // (bool X) * Y --> X ? Y : 0 + // Y * (bool X) --> X ? Y : 0 + if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) + return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0)); + if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) + return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0)); + + // (lshr X, 31) * Y --> (ashr X, 31) & Y + // Y * (lshr X, 31) --> (ashr X, 31) & Y + // TODO: We are not checking one-use because the elimination of the multiply + // is better for analysis? + // TODO: Should we canonicalize to '(X < 0) ? Y : 0' instead? That would be + // more similar to what we're doing above. + const APInt *C; + if (match(Op0, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1) + return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op1); + if (match(Op1, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1) + return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op0); + + if (Instruction *Ext = narrowMathIfNoOverflow(I)) + return Ext; + + bool Changed = false; + if (!I.hasNoSignedWrap() && willNotOverflowSignedMul(Op0, Op1, I)) { + Changed = true; + I.setHasNoSignedWrap(true); + } + + if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) { + Changed = true; + I.setHasNoUnsignedWrap(true); + } + + return Changed ? 
&I : nullptr; +} + +Instruction *InstCombiner::visitFMul(BinaryOperator &I) { + if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1), + I.getFastMathFlags(), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (SimplifyAssociativeOrCommutative(I)) + return &I; + + if (Instruction *X = foldVectorBinop(I)) + return X; + + if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I)) + return FoldedMul; + + if (Value *FoldedMul = foldMulSelectToNegate(I, Builder)) + return replaceInstUsesWith(I, FoldedMul); + + // X * -1.0 --> -X + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (match(Op1, m_SpecificFP(-1.0))) + return BinaryOperator::CreateFNegFMF(Op0, &I); + + // -X * -Y --> X * Y + Value *X, *Y; + if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y)))) + return BinaryOperator::CreateFMulFMF(X, Y, &I); + + // -X * C --> X * -C + Constant *C; + if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_Constant(C))) + return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I); + + // fabs(X) * fabs(X) -> X * X + if (Op0 == Op1 && match(Op0, m_Intrinsic<Intrinsic::fabs>(m_Value(X)))) + return BinaryOperator::CreateFMulFMF(X, X, &I); + + // (select A, B, C) * (select A, D, E) --> select A, (B*D), (C*E) + if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1)) + return replaceInstUsesWith(I, V); + + if (I.hasAllowReassoc()) { + // Reassociate constant RHS with another constant to form constant + // expression. + if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP()) { + Constant *C1; + if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) { + // (C1 / X) * C --> (C * C1) / X + Constant *CC1 = ConstantExpr::getFMul(C, C1); + if (CC1->isNormalFP()) + return BinaryOperator::CreateFDivFMF(CC1, X, &I); + } + if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) { + // (X / C1) * C --> X * (C / C1) + Constant *CDivC1 = ConstantExpr::getFDiv(C, C1); + if (CDivC1->isNormalFP()) + return BinaryOperator::CreateFMulFMF(X, CDivC1, &I); + + // If the constant was a denormal, try reassociating differently. + // (X / C1) * C --> X / (C1 / C) + Constant *C1DivC = ConstantExpr::getFDiv(C1, C); + if (Op0->hasOneUse() && C1DivC->isNormalFP()) + return BinaryOperator::CreateFDivFMF(X, C1DivC, &I); + } + + // We do not need to match 'fadd C, X' and 'fsub X, C' because they are + // canonicalized to 'fadd X, C'. Distributing the multiply may allow + // further folds and (X * C) + C2 is 'fma'. + if (match(Op0, m_OneUse(m_FAdd(m_Value(X), m_Constant(C1))))) { + // (X + C1) * C --> (X * C) + (C * C1) + Constant *CC1 = ConstantExpr::getFMul(C, C1); + Value *XC = Builder.CreateFMulFMF(X, C, &I); + return BinaryOperator::CreateFAddFMF(XC, CC1, &I); + } + if (match(Op0, m_OneUse(m_FSub(m_Constant(C1), m_Value(X))))) { + // (C1 - X) * C --> (C * C1) - (X * C) + Constant *CC1 = ConstantExpr::getFMul(C, C1); + Value *XC = Builder.CreateFMulFMF(X, C, &I); + return BinaryOperator::CreateFSubFMF(CC1, XC, &I); + } + } + + Value *Z; + if (match(&I, m_c_FMul(m_OneUse(m_FDiv(m_Value(X), m_Value(Y))), + m_Value(Z)))) { + // Sink division: (X / Y) * Z --> (X * Z) / Y + Value *NewFMul = Builder.CreateFMulFMF(X, Z, &I); + return BinaryOperator::CreateFDivFMF(NewFMul, Y, &I); + } + + // sqrt(X) * sqrt(Y) -> sqrt(X * Y) + // nnan disallows the possibility of returning a number if both operands are + // negative (in that case, we should return NaN). 
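+    // For example, sqrt(-1.0) * sqrt(-4.0) is NaN * NaN = NaN, while
+    // sqrt(-1.0 * -4.0) is sqrt(4.0) = 2.0.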
+ if (I.hasNoNaNs() && + match(Op0, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(X)))) && + match(Op1, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) { + Value *XY = Builder.CreateFMulFMF(X, Y, &I); + Value *Sqrt = Builder.CreateUnaryIntrinsic(Intrinsic::sqrt, XY, &I); + return replaceInstUsesWith(I, Sqrt); + } + + // Like the similar transform in instsimplify, this requires 'nsz' because + // sqrt(-0.0) = -0.0, and -0.0 * -0.0 does not simplify to -0.0. + if (I.hasNoNaNs() && I.hasNoSignedZeros() && Op0 == Op1 && + Op0->hasNUses(2)) { + // Peek through fdiv to find squaring of square root: + // (X / sqrt(Y)) * (X / sqrt(Y)) --> (X * X) / Y + if (match(Op0, m_FDiv(m_Value(X), + m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) { + Value *XX = Builder.CreateFMulFMF(X, X, &I); + return BinaryOperator::CreateFDivFMF(XX, Y, &I); + } + // (sqrt(Y) / X) * (sqrt(Y) / X) --> Y / (X * X) + if (match(Op0, m_FDiv(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y)), + m_Value(X)))) { + Value *XX = Builder.CreateFMulFMF(X, X, &I); + return BinaryOperator::CreateFDivFMF(Y, XX, &I); + } + } + + // exp(X) * exp(Y) -> exp(X + Y) + // Match as long as at least one of exp has only one use. + if (match(Op0, m_Intrinsic<Intrinsic::exp>(m_Value(X))) && + match(Op1, m_Intrinsic<Intrinsic::exp>(m_Value(Y))) && + (Op0->hasOneUse() || Op1->hasOneUse())) { + Value *XY = Builder.CreateFAddFMF(X, Y, &I); + Value *Exp = Builder.CreateUnaryIntrinsic(Intrinsic::exp, XY, &I); + return replaceInstUsesWith(I, Exp); + } + + // exp2(X) * exp2(Y) -> exp2(X + Y) + // Match as long as at least one of exp2 has only one use. + if (match(Op0, m_Intrinsic<Intrinsic::exp2>(m_Value(X))) && + match(Op1, m_Intrinsic<Intrinsic::exp2>(m_Value(Y))) && + (Op0->hasOneUse() || Op1->hasOneUse())) { + Value *XY = Builder.CreateFAddFMF(X, Y, &I); + Value *Exp2 = Builder.CreateUnaryIntrinsic(Intrinsic::exp2, XY, &I); + return replaceInstUsesWith(I, Exp2); + } + + // (X*Y) * X => (X*X) * Y where Y != X + // The purpose is two-fold: + // 1) to form a power expression (of X). + // 2) potentially shorten the critical path: After transformation, the + // latency of the instruction Y is amortized by the expression of X*X, + // and therefore Y is in a "less critical" position compared to what it + // was before the transformation. + if (match(Op0, m_OneUse(m_c_FMul(m_Specific(Op1), m_Value(Y)))) && + Op1 != Y) { + Value *XX = Builder.CreateFMulFMF(Op1, Op1, &I); + return BinaryOperator::CreateFMulFMF(XX, Y, &I); + } + if (match(Op1, m_OneUse(m_c_FMul(m_Specific(Op0), m_Value(Y)))) && + Op0 != Y) { + Value *XX = Builder.CreateFMulFMF(Op0, Op0, &I); + return BinaryOperator::CreateFMulFMF(XX, Y, &I); + } + } + + // log2(X * 0.5) * Y = log2(X) * Y - Y + if (I.isFast()) { + IntrinsicInst *Log2 = nullptr; + if (match(Op0, m_OneUse(m_Intrinsic<Intrinsic::log2>( + m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) { + Log2 = cast<IntrinsicInst>(Op0); + Y = Op1; + } + if (match(Op1, m_OneUse(m_Intrinsic<Intrinsic::log2>( + m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) { + Log2 = cast<IntrinsicInst>(Op1); + Y = Op0; + } + if (Log2) { + Log2->setArgOperand(0, X); + Log2->copyFastMathFlags(&I); + Value *LogXTimesY = Builder.CreateFMulFMF(Log2, Y, &I); + return BinaryOperator::CreateFSubFMF(LogXTimesY, Y, &I); + } + } + + return nullptr; +} + +/// Fold a divide or remainder with a select instruction divisor when one of the +/// select operands is zero. In that case, we can use the other select operand +/// because div/rem by zero is undefined. 
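+/// For example:
+///   %d = select i1 %c, i32 0, i32 %y
+///   %r = udiv i32 %x, %d
+/// is only well defined when %c is false, so the divisor can be replaced by %y.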
+bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) {
+  SelectInst *SI = dyn_cast<SelectInst>(I.getOperand(1));
+  if (!SI)
+    return false;
+
+  int NonNullOperand;
+  if (match(SI->getTrueValue(), m_Zero()))
+    // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
+    NonNullOperand = 2;
+  else if (match(SI->getFalseValue(), m_Zero()))
+    // div/rem X, (Cond ? Y : 0) -> div/rem X, Y
+    NonNullOperand = 1;
+  else
+    return false;
+
+  // Change the div/rem to use 'Y' instead of the select.
+  I.setOperand(1, SI->getOperand(NonNullOperand));
+
+  // Okay, we know we can replace the operand of the div/rem with 'Y' with no
+  // problem. However, the select, or the condition of the select, may have
+  // multiple uses. Based on our knowledge that the operand must be non-zero,
+  // propagate the known value for the select into other uses of it, and
+  // propagate a known value of the condition into its other users.
+
+  // If the select and its condition each have only a single use, don't bother
+  // with this; exit early.
+  Value *SelectCond = SI->getCondition();
+  if (SI->use_empty() && SelectCond->hasOneUse())
+    return true;
+
+  // Scan the current block backward, looking for other uses of SI.
+  BasicBlock::iterator BBI = I.getIterator(), BBFront = I.getParent()->begin();
+  Type *CondTy = SelectCond->getType();
+  while (BBI != BBFront) {
+    --BBI;
+    // If we find an instruction that is not guaranteed to transfer execution
+    // to its successor, then information from below it cannot be propagated
+    // above it.
+    if (!isGuaranteedToTransferExecutionToSuccessor(&*BBI))
+      break;
+
+    // Replace uses of the select or its condition with the known values.
+    for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end();
+         I != E; ++I) {
+      if (*I == SI) {
+        *I = SI->getOperand(NonNullOperand);
+        Worklist.Add(&*BBI);
+      } else if (*I == SelectCond) {
+        *I = NonNullOperand == 1 ? ConstantInt::getTrue(CondTy)
+                                 : ConstantInt::getFalse(CondTy);
+        Worklist.Add(&*BBI);
+      }
+    }
+
+    // Once we have passed the instruction itself, quit looking for it.
+    if (&*BBI == SI)
+      SI = nullptr;
+    if (&*BBI == SelectCond)
+      SelectCond = nullptr;
+
+    // If we ran out of things to eliminate, break out of the loop.
+    if (!SelectCond && !SI)
+      break;
+  }
+  return true;
+}
+
+/// True if the multiply cannot be expressed in an int this size.
+static bool multiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,
+                              bool IsSigned) {
+  bool Overflow;
+  Product = IsSigned ? C1.smul_ov(C2, Overflow) : C1.umul_ov(C2, Overflow);
+  return Overflow;
+}
+
+/// True if C1 is a multiple of C2. Quotient contains C1/C2.
+static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
+                       bool IsSigned) {
+  assert(C1.getBitWidth() == C2.getBitWidth() && "Constant widths not equal");
+
+  // Bail if we will divide by zero.
+  if (C2.isNullValue())
+    return false;
+
+  // Bail if we would divide INT_MIN by -1.
+  if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue())
+    return false;
+
+  APInt Remainder(C1.getBitWidth(), /*val=*/0ULL, IsSigned);
+  if (IsSigned)
+    APInt::sdivrem(C1, C2, Quotient, Remainder);
+  else
+    APInt::udivrem(C1, C2, Quotient, Remainder);
+
+  return Remainder.isMinValue();
+}
+
+/// This function implements the transforms common to both integer division
+/// instructions (udiv and sdiv). It is called by the visitors to those integer
+/// division instructions.
+/// Common integer divide transforms +Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + bool IsSigned = I.getOpcode() == Instruction::SDiv; + Type *Ty = I.getType(); + + // The RHS is known non-zero. + if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) { + I.setOperand(1, V); + return &I; + } + + // Handle cases involving: [su]div X, (select Cond, Y, Z) + // This does not apply for fdiv. + if (simplifyDivRemOfSelectWithZeroOp(I)) + return &I; + + const APInt *C2; + if (match(Op1, m_APInt(C2))) { + Value *X; + const APInt *C1; + + // (X / C1) / C2 -> X / (C1*C2) + if ((IsSigned && match(Op0, m_SDiv(m_Value(X), m_APInt(C1)))) || + (!IsSigned && match(Op0, m_UDiv(m_Value(X), m_APInt(C1))))) { + APInt Product(C1->getBitWidth(), /*val=*/0ULL, IsSigned); + if (!multiplyOverflows(*C1, *C2, Product, IsSigned)) + return BinaryOperator::Create(I.getOpcode(), X, + ConstantInt::get(Ty, Product)); + } + + if ((IsSigned && match(Op0, m_NSWMul(m_Value(X), m_APInt(C1)))) || + (!IsSigned && match(Op0, m_NUWMul(m_Value(X), m_APInt(C1))))) { + APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned); + + // (X * C1) / C2 -> X / (C2 / C1) if C2 is a multiple of C1. + if (isMultiple(*C2, *C1, Quotient, IsSigned)) { + auto *NewDiv = BinaryOperator::Create(I.getOpcode(), X, + ConstantInt::get(Ty, Quotient)); + NewDiv->setIsExact(I.isExact()); + return NewDiv; + } + + // (X * C1) / C2 -> X * (C1 / C2) if C1 is a multiple of C2. + if (isMultiple(*C1, *C2, Quotient, IsSigned)) { + auto *Mul = BinaryOperator::Create(Instruction::Mul, X, + ConstantInt::get(Ty, Quotient)); + auto *OBO = cast<OverflowingBinaryOperator>(Op0); + Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap()); + Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap()); + return Mul; + } + } + + if ((IsSigned && match(Op0, m_NSWShl(m_Value(X), m_APInt(C1))) && + *C1 != C1->getBitWidth() - 1) || + (!IsSigned && match(Op0, m_NUWShl(m_Value(X), m_APInt(C1))))) { + APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned); + APInt C1Shifted = APInt::getOneBitSet( + C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue())); + + // (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of 1 << C1. + if (isMultiple(*C2, C1Shifted, Quotient, IsSigned)) { + auto *BO = BinaryOperator::Create(I.getOpcode(), X, + ConstantInt::get(Ty, Quotient)); + BO->setIsExact(I.isExact()); + return BO; + } + + // (X << C1) / C2 -> X * ((1 << C1) / C2) if 1 << C1 is a multiple of C2. + if (isMultiple(C1Shifted, *C2, Quotient, IsSigned)) { + auto *Mul = BinaryOperator::Create(Instruction::Mul, X, + ConstantInt::get(Ty, Quotient)); + auto *OBO = cast<OverflowingBinaryOperator>(Op0); + Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap()); + Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap()); + return Mul; + } + } + + if (!C2->isNullValue()) // avoid X udiv 0 + if (Instruction *FoldedDiv = foldBinOpIntoSelectOrPhi(I)) + return FoldedDiv; + } + + if (match(Op0, m_One())) { + assert(!Ty->isIntOrIntVectorTy(1) && "i1 divide not removed?"); + if (IsSigned) { + // If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the + // result is one, if Op1 is -1 then the result is minus one, otherwise + // it's zero. + Value *Inc = Builder.CreateAdd(Op1, Op0); + Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(Ty, 3)); + return SelectInst::Create(Cmp, Op1, ConstantInt::get(Ty, 0)); + } else { + // If Op1 is 0 then it's undefined behaviour. 
If Op1 is 1 then the + // result is one, otherwise it's zero. + return new ZExtInst(Builder.CreateICmpEQ(Op1, Op0), Ty); + } + } + + // See if we can fold away this div instruction. + if (SimplifyDemandedInstructionBits(I)) + return &I; + + // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y + Value *X, *Z; + if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) // (X - Z) / Y; Y = Op1 + if ((IsSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) || + (!IsSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1))))) + return BinaryOperator::Create(I.getOpcode(), X, Op1); + + // (X << Y) / X -> 1 << Y + Value *Y; + if (IsSigned && match(Op0, m_NSWShl(m_Specific(Op1), m_Value(Y)))) + return BinaryOperator::CreateNSWShl(ConstantInt::get(Ty, 1), Y); + if (!IsSigned && match(Op0, m_NUWShl(m_Specific(Op1), m_Value(Y)))) + return BinaryOperator::CreateNUWShl(ConstantInt::get(Ty, 1), Y); + + // X / (X * Y) -> 1 / Y if the multiplication does not overflow. + if (match(Op1, m_c_Mul(m_Specific(Op0), m_Value(Y)))) { + bool HasNSW = cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap(); + bool HasNUW = cast<OverflowingBinaryOperator>(Op1)->hasNoUnsignedWrap(); + if ((IsSigned && HasNSW) || (!IsSigned && HasNUW)) { + I.setOperand(0, ConstantInt::get(Ty, 1)); + I.setOperand(1, Y); + return &I; + } + } + + return nullptr; +} + +static const unsigned MaxDepth = 6; + +namespace { + +using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1, + const BinaryOperator &I, + InstCombiner &IC); + +/// Used to maintain state for visitUDivOperand(). +struct UDivFoldAction { + /// Informs visitUDiv() how to fold this operand. This can be zero if this + /// action joins two actions together. + FoldUDivOperandCb FoldAction; + + /// Which operand to fold. + Value *OperandToFold; + + union { + /// The instruction returned when FoldAction is invoked. + Instruction *FoldResult; + + /// Stores the LHS action index if this action joins two actions together. 
+ size_t SelectLHSIdx; + }; + + UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand) + : FoldAction(FA), OperandToFold(InputOperand), FoldResult(nullptr) {} + UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS) + : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {} +}; + +} // end anonymous namespace + +// X udiv 2^C -> X >> C +static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1, + const BinaryOperator &I, InstCombiner &IC) { + Constant *C1 = getLogBase2(Op0->getType(), cast<Constant>(Op1)); + if (!C1) + llvm_unreachable("Failed to constant fold udiv -> logbase2"); + BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, C1); + if (I.isExact()) + LShr->setIsExact(); + return LShr; +} + +// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2) +// X udiv (zext (C1 << N)), where C1 is "1<<C2" --> X >> (N+C2) +static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I, + InstCombiner &IC) { + Value *ShiftLeft; + if (!match(Op1, m_ZExt(m_Value(ShiftLeft)))) + ShiftLeft = Op1; + + Constant *CI; + Value *N; + if (!match(ShiftLeft, m_Shl(m_Constant(CI), m_Value(N)))) + llvm_unreachable("match should never fail here!"); + Constant *Log2Base = getLogBase2(N->getType(), CI); + if (!Log2Base) + llvm_unreachable("getLogBase2 should never fail here!"); + N = IC.Builder.CreateAdd(N, Log2Base); + if (Op1 != ShiftLeft) + N = IC.Builder.CreateZExt(N, Op1->getType()); + BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N); + if (I.isExact()) + LShr->setIsExact(); + return LShr; +} + +// Recursively visits the possible right hand operands of a udiv +// instruction, seeing through select instructions, to determine if we can +// replace the udiv with something simpler. If we find that an operand is not +// able to simplify the udiv, we abort the entire transformation. +static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I, + SmallVectorImpl<UDivFoldAction> &Actions, + unsigned Depth = 0) { + // Check to see if this is an unsigned division with an exact power of 2, + // if so, convert to a right shift. + if (match(Op1, m_Power2())) { + Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op1)); + return Actions.size(); + } + + // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2) + if (match(Op1, m_Shl(m_Power2(), m_Value())) || + match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) { + Actions.push_back(UDivFoldAction(foldUDivShl, Op1)); + return Actions.size(); + } + + // The remaining tests are all recursive, so bail out if we hit the limit. + if (Depth++ == MaxDepth) + return 0; + + if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) + if (size_t LHSIdx = + visitUDivOperand(Op0, SI->getOperand(1), I, Actions, Depth)) + if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions, Depth)) { + Actions.push_back(UDivFoldAction(nullptr, Op1, LHSIdx - 1)); + return Actions.size(); + } + + return 0; +} + +/// If we have zero-extended operands of an unsigned div or rem, we may be able +/// to narrow the operation (sink the zext below the math). 
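+///
+/// For example:
+///   udiv (zext i8 %x to i32), (zext i8 %y to i32)
+/// becomes
+///   zext (udiv i8 %x, %y) to i32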
+static Instruction *narrowUDivURem(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Instruction::BinaryOps Opcode = I.getOpcode(); + Value *N = I.getOperand(0); + Value *D = I.getOperand(1); + Type *Ty = I.getType(); + Value *X, *Y; + if (match(N, m_ZExt(m_Value(X))) && match(D, m_ZExt(m_Value(Y))) && + X->getType() == Y->getType() && (N->hasOneUse() || D->hasOneUse())) { + // udiv (zext X), (zext Y) --> zext (udiv X, Y) + // urem (zext X), (zext Y) --> zext (urem X, Y) + Value *NarrowOp = Builder.CreateBinOp(Opcode, X, Y); + return new ZExtInst(NarrowOp, Ty); + } + + Constant *C; + if ((match(N, m_OneUse(m_ZExt(m_Value(X)))) && match(D, m_Constant(C))) || + (match(D, m_OneUse(m_ZExt(m_Value(X)))) && match(N, m_Constant(C)))) { + // If the constant is the same in the smaller type, use the narrow version. + Constant *TruncC = ConstantExpr::getTrunc(C, X->getType()); + if (ConstantExpr::getZExt(TruncC, Ty) != C) + return nullptr; + + // udiv (zext X), C --> zext (udiv X, C') + // urem (zext X), C --> zext (urem X, C') + // udiv C, (zext X) --> zext (udiv C', X) + // urem C, (zext X) --> zext (urem C', X) + Value *NarrowOp = isa<Constant>(D) ? Builder.CreateBinOp(Opcode, X, TruncC) + : Builder.CreateBinOp(Opcode, TruncC, X); + return new ZExtInst(NarrowOp, Ty); + } + + return nullptr; +} + +Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { + if (Value *V = SimplifyUDivInst(I.getOperand(0), I.getOperand(1), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldVectorBinop(I)) + return X; + + // Handle the integer div common cases + if (Instruction *Common = commonIDivTransforms(I)) + return Common; + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + Value *X; + const APInt *C1, *C2; + if (match(Op0, m_LShr(m_Value(X), m_APInt(C1))) && match(Op1, m_APInt(C2))) { + // (X lshr C1) udiv C2 --> X udiv (C2 << C1) + bool Overflow; + APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow); + if (!Overflow) { + bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value())); + BinaryOperator *BO = BinaryOperator::CreateUDiv( + X, ConstantInt::get(X->getType(), C2ShlC1)); + if (IsExact) + BO->setIsExact(); + return BO; + } + } + + // Op0 / C where C is large (negative) --> zext (Op0 >= C) + // TODO: Could use isKnownNegative() to handle non-constant values. + Type *Ty = I.getType(); + if (match(Op1, m_Negative())) { + Value *Cmp = Builder.CreateICmpUGE(Op0, Op1); + return CastInst::CreateZExtOrBitCast(Cmp, Ty); + } + // Op0 / (sext i1 X) --> zext (Op0 == -1) (if X is 0, the div is undefined) + if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) { + Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty)); + return CastInst::CreateZExtOrBitCast(Cmp, Ty); + } + + if (Instruction *NarrowDiv = narrowUDivURem(I, Builder)) + return NarrowDiv; + + // If the udiv operands are non-overflowing multiplies with a common operand, + // then eliminate the common factor: + // (A * B) / (A * X) --> B / X (and commuted variants) + // TODO: The code would be reduced if we had m_c_NUWMul pattern matching. + // TODO: If -reassociation handled this generally, we could remove this. 
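+ // The no-wrap flags are what make this valid: e.g. in i8, (16 * 17) wraps to
+ // 16, so ((16 * 17) /u (16 * 1)) evaluates to 16 /u 16 == 1, while the folded
+ // form 17 /u 1 would give 17.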
+ Value *A, *B; + if (match(Op0, m_NUWMul(m_Value(A), m_Value(B)))) { + if (match(Op1, m_NUWMul(m_Specific(A), m_Value(X))) || + match(Op1, m_NUWMul(m_Value(X), m_Specific(A)))) + return BinaryOperator::CreateUDiv(B, X); + if (match(Op1, m_NUWMul(m_Specific(B), m_Value(X))) || + match(Op1, m_NUWMul(m_Value(X), m_Specific(B)))) + return BinaryOperator::CreateUDiv(A, X); + } + + // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...)))) + SmallVector<UDivFoldAction, 6> UDivActions; + if (visitUDivOperand(Op0, Op1, I, UDivActions)) + for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) { + FoldUDivOperandCb Action = UDivActions[i].FoldAction; + Value *ActionOp1 = UDivActions[i].OperandToFold; + Instruction *Inst; + if (Action) + Inst = Action(Op0, ActionOp1, I, *this); + else { + // This action joins two actions together. The RHS of this action is + // simply the last action we processed, we saved the LHS action index in + // the joining action. + size_t SelectRHSIdx = i - 1; + Value *SelectRHS = UDivActions[SelectRHSIdx].FoldResult; + size_t SelectLHSIdx = UDivActions[i].SelectLHSIdx; + Value *SelectLHS = UDivActions[SelectLHSIdx].FoldResult; + Inst = SelectInst::Create(cast<SelectInst>(ActionOp1)->getCondition(), + SelectLHS, SelectRHS); + } + + // If this is the last action to process, return it to the InstCombiner. + // Otherwise, we insert it before the UDiv and record it so that we may + // use it as part of a joining action (i.e., a SelectInst). + if (e - i != 1) { + Inst->insertBefore(&I); + UDivActions[i].FoldResult = Inst; + } else + return Inst; + } + + return nullptr; +} + +Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { + if (Value *V = SimplifySDivInst(I.getOperand(0), I.getOperand(1), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldVectorBinop(I)) + return X; + + // Handle the integer div common cases + if (Instruction *Common = commonIDivTransforms(I)) + return Common; + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + Value *X; + // sdiv Op0, -1 --> -Op0 + // sdiv Op0, (sext i1 X) --> -Op0 (because if X is 0, the op is undefined) + if (match(Op1, m_AllOnes()) || + (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))) + return BinaryOperator::CreateNeg(Op0); + + // X / INT_MIN --> X == INT_MIN + if (match(Op1, m_SignMask())) + return new ZExtInst(Builder.CreateICmpEQ(Op0, Op1), I.getType()); + + const APInt *Op1C; + if (match(Op1, m_APInt(Op1C))) { + // sdiv exact X, C --> ashr exact X, log2(C) + if (I.isExact() && Op1C->isNonNegative() && Op1C->isPowerOf2()) { + Value *ShAmt = ConstantInt::get(Op1->getType(), Op1C->exactLogBase2()); + return BinaryOperator::CreateExactAShr(Op0, ShAmt, I.getName()); + } + + // If the dividend is sign-extended and the constant divisor is small enough + // to fit in the source type, shrink the division to the narrower type: + // (sext X) sdiv C --> sext (X sdiv C) + Value *Op0Src; + if (match(Op0, m_OneUse(m_SExt(m_Value(Op0Src)))) && + Op0Src->getType()->getScalarSizeInBits() >= Op1C->getMinSignedBits()) { + + // In the general case, we need to make sure that the dividend is not the + // minimum signed value because dividing that by -1 is UB. But here, we + // know that the -1 divisor case is already handled above. 
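+ // e.g. (sext i16 %x to i32) sdiv 1000 --> sext (sdiv i16 %x, 1000) to i32,
+ // since 1000 needs only 11 signed bits and therefore fits in the i16 source.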
+ + Constant *NarrowDivisor = + ConstantExpr::getTrunc(cast<Constant>(Op1), Op0Src->getType()); + Value *NarrowOp = Builder.CreateSDiv(Op0Src, NarrowDivisor); + return new SExtInst(NarrowOp, Op0->getType()); + } + + // -X / C --> X / -C (if the negation doesn't overflow). + // TODO: This could be enhanced to handle arbitrary vector constants by + // checking if all elements are not the min-signed-val. + if (!Op1C->isMinSignedValue() && + match(Op0, m_NSWSub(m_Zero(), m_Value(X)))) { + Constant *NegC = ConstantInt::get(I.getType(), -(*Op1C)); + Instruction *BO = BinaryOperator::CreateSDiv(X, NegC); + BO->setIsExact(I.isExact()); + return BO; + } + } + + // -X / Y --> -(X / Y) + Value *Y; + if (match(&I, m_SDiv(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y)))) + return BinaryOperator::CreateNSWNeg( + Builder.CreateSDiv(X, Y, I.getName(), I.isExact())); + + // If the sign bits of both operands are zero (i.e. we can prove they are + // unsigned inputs), turn this into a udiv. + APInt Mask(APInt::getSignMask(I.getType()->getScalarSizeInBits())); + if (MaskedValueIsZero(Op0, Mask, 0, &I)) { + if (MaskedValueIsZero(Op1, Mask, 0, &I)) { + // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set + auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName()); + BO->setIsExact(I.isExact()); + return BO; + } + + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) { + // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y) + // Safe because the only negative value (1 << Y) can take on is + // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have + // the sign bit set. + auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName()); + BO->setIsExact(I.isExact()); + return BO; + } + } + + return nullptr; +} + +/// Remove negation and try to convert division into multiplication. +static Instruction *foldFDivConstantDivisor(BinaryOperator &I) { + Constant *C; + if (!match(I.getOperand(1), m_Constant(C))) + return nullptr; + + // -X / C --> X / -C + Value *X; + if (match(I.getOperand(0), m_FNeg(m_Value(X)))) + return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I); + + // If the constant divisor has an exact inverse, this is always safe. If not, + // then we can still create a reciprocal if fast-math-flags allow it and the + // constant is a regular number (not zero, infinite, or denormal). + if (!(C->hasExactInverseFP() || (I.hasAllowReciprocal() && C->isNormalFP()))) + return nullptr; + + // Disallow denormal constants because we don't know what would happen + // on all targets. + // TODO: Use Intrinsic::canonicalize or let function attributes tell us that + // denorms are flushed? + auto *RecipC = ConstantExpr::getFDiv(ConstantFP::get(I.getType(), 1.0), C); + if (!RecipC->isNormalFP()) + return nullptr; + + // X / C --> X * (1 / C) + return BinaryOperator::CreateFMulFMF(I.getOperand(0), RecipC, &I); +} + +/// Remove negation and try to reassociate constant math. +static Instruction *foldFDivConstantDividend(BinaryOperator &I) { + Constant *C; + if (!match(I.getOperand(0), m_Constant(C))) + return nullptr; + + // C / -X --> -C / X + Value *X; + if (match(I.getOperand(1), m_FNeg(m_Value(X)))) + return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I); + + if (!I.hasAllowReassoc() || !I.hasAllowReciprocal()) + return nullptr; + + // Try to reassociate C / X expressions where X includes another constant. 
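+ // e.g. with 'reassoc arcp': 10.0 / (%x * 5.0) --> 2.0 / %x, and
+ // 10.0 / (%x / 5.0) --> 50.0 / %x.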
+ Constant *C2, *NewC = nullptr; + if (match(I.getOperand(1), m_FMul(m_Value(X), m_Constant(C2)))) { + // C / (X * C2) --> (C / C2) / X + NewC = ConstantExpr::getFDiv(C, C2); + } else if (match(I.getOperand(1), m_FDiv(m_Value(X), m_Constant(C2)))) { + // C / (X / C2) --> (C * C2) / X + NewC = ConstantExpr::getFMul(C, C2); + } + // Disallow denormal constants because we don't know what would happen + // on all targets. + // TODO: Use Intrinsic::canonicalize or let function attributes tell us that + // denorms are flushed? + if (!NewC || !NewC->isNormalFP()) + return nullptr; + + return BinaryOperator::CreateFDivFMF(NewC, X, &I); +} + +Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { + if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1), + I.getFastMathFlags(), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldVectorBinop(I)) + return X; + + if (Instruction *R = foldFDivConstantDivisor(I)) + return R; + + if (Instruction *R = foldFDivConstantDividend(I)) + return R; + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (isa<Constant>(Op0)) + if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) + if (Instruction *R = FoldOpIntoSelect(I, SI)) + return R; + + if (isa<Constant>(Op1)) + if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) + if (Instruction *R = FoldOpIntoSelect(I, SI)) + return R; + + if (I.hasAllowReassoc() && I.hasAllowReciprocal()) { + Value *X, *Y; + if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) && + (!isa<Constant>(Y) || !isa<Constant>(Op1))) { + // (X / Y) / Z => X / (Y * Z) + Value *YZ = Builder.CreateFMulFMF(Y, Op1, &I); + return BinaryOperator::CreateFDivFMF(X, YZ, &I); + } + if (match(Op1, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) && + (!isa<Constant>(Y) || !isa<Constant>(Op0))) { + // Z / (X / Y) => (Y * Z) / X + Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I); + return BinaryOperator::CreateFDivFMF(YZ, X, &I); + } + } + + if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) { + // sin(X) / cos(X) -> tan(X) + // cos(X) / sin(X) -> 1/tan(X) (cotangent) + Value *X; + bool IsTan = match(Op0, m_Intrinsic<Intrinsic::sin>(m_Value(X))) && + match(Op1, m_Intrinsic<Intrinsic::cos>(m_Specific(X))); + bool IsCot = + !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) && + match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X))); + + if ((IsTan || IsCot) && + hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) { + IRBuilder<> B(&I); + IRBuilder<>::FastMathFlagGuard FMFGuard(B); + B.setFastMathFlags(I.getFastMathFlags()); + AttributeList Attrs = + cast<CallBase>(Op0)->getCalledFunction()->getAttributes(); + Value *Res = emitUnaryFloatFnCall(X, &TLI, LibFunc_tan, LibFunc_tanf, + LibFunc_tanl, B, Attrs); + if (IsCot) + Res = B.CreateFDiv(ConstantFP::get(I.getType(), 1.0), Res); + return replaceInstUsesWith(I, Res); + } + } + + // -X / -Y -> X / Y + Value *X, *Y; + if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y)))) { + I.setOperand(0, X); + I.setOperand(1, Y); + return &I; + } + + // X / (X * Y) --> 1.0 / Y + // Reassociate to (X / X -> 1.0) is legal when NaNs are not allowed. + // We can ignore the possibility that X is infinity because INF/INF is NaN. 
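+ // Likewise X == 0.0 would give 0.0 / 0.0 == NaN, so the nnan flag lets us
+ // ignore that case as well. e.g. with 'nnan reassoc': %x / (%x * %y) --> 1.0 / %y.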
+ if (I.hasNoNaNs() && I.hasAllowReassoc() && + match(Op1, m_c_FMul(m_Specific(Op0), m_Value(Y)))) { + I.setOperand(0, ConstantFP::get(I.getType(), 1.0)); + I.setOperand(1, Y); + return &I; + } + + // X / fabs(X) -> copysign(1.0, X) + // fabs(X) / X -> copysign(1.0, X) + if (I.hasNoNaNs() && I.hasNoInfs() && + (match(&I, + m_FDiv(m_Value(X), m_Intrinsic<Intrinsic::fabs>(m_Deferred(X)))) || + match(&I, m_FDiv(m_Intrinsic<Intrinsic::fabs>(m_Value(X)), + m_Deferred(X))))) { + Value *V = Builder.CreateBinaryIntrinsic( + Intrinsic::copysign, ConstantFP::get(I.getType(), 1.0), X, &I); + return replaceInstUsesWith(I, V); + } + return nullptr; +} + +/// This function implements the transforms common to both integer remainder +/// instructions (urem and srem). It is called by the visitors to those integer +/// remainder instructions. +/// Common integer remainder transforms +Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + // The RHS is known non-zero. + if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) { + I.setOperand(1, V); + return &I; + } + + // Handle cases involving: rem X, (select Cond, Y, Z) + if (simplifyDivRemOfSelectWithZeroOp(I)) + return &I; + + if (isa<Constant>(Op1)) { + if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) { + if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) { + if (Instruction *R = FoldOpIntoSelect(I, SI)) + return R; + } else if (auto *PN = dyn_cast<PHINode>(Op0I)) { + const APInt *Op1Int; + if (match(Op1, m_APInt(Op1Int)) && !Op1Int->isMinValue() && + (I.getOpcode() == Instruction::URem || + !Op1Int->isMinSignedValue())) { + // foldOpIntoPhi will speculate instructions to the end of the PHI's + // predecessor blocks, so do this only if we know the srem or urem + // will not fault. + if (Instruction *NV = foldOpIntoPhi(I, PN)) + return NV; + } + } + + // See if we can fold away this rem instruction. + if (SimplifyDemandedInstructionBits(I)) + return &I; + } + } + + return nullptr; +} + +Instruction *InstCombiner::visitURem(BinaryOperator &I) { + if (Value *V = SimplifyURemInst(I.getOperand(0), I.getOperand(1), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldVectorBinop(I)) + return X; + + if (Instruction *common = commonIRemTransforms(I)) + return common; + + if (Instruction *NarrowRem = narrowUDivURem(I, Builder)) + return NarrowRem; + + // X urem Y -> X and Y-1, where Y is a power of 2, + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + Type *Ty = I.getType(); + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) { + // This may increase instruction count, we don't enforce that Y is a + // constant. + Constant *N1 = Constant::getAllOnesValue(Ty); + Value *Add = Builder.CreateAdd(Op1, N1); + return BinaryOperator::CreateAnd(Op0, Add); + } + + // 1 urem X -> zext(X != 1) + if (match(Op0, m_One())) + return CastInst::CreateZExtOrBitCast(Builder.CreateICmpNE(Op1, Op0), Ty); + + // X urem C -> X < C ? X : X - C, where C >= signbit. + if (match(Op1, m_Negative())) { + Value *Cmp = Builder.CreateICmpULT(Op0, Op1); + Value *Sub = Builder.CreateSub(Op0, Op1); + return SelectInst::Create(Cmp, Op0, Sub); + } + + // If the divisor is a sext of a boolean, then the divisor must be max + // unsigned value (-1). Therefore, the remainder is Op0 unless Op0 is also + // max unsigned value. In that case, the remainder is 0: + // urem Op0, (sext i1 X) --> (Op0 == -1) ? 
0 : Op0 + Value *X; + if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) { + Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty)); + return SelectInst::Create(Cmp, ConstantInt::getNullValue(Ty), Op0); + } + + return nullptr; +} + +Instruction *InstCombiner::visitSRem(BinaryOperator &I) { + if (Value *V = SimplifySRemInst(I.getOperand(0), I.getOperand(1), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldVectorBinop(I)) + return X; + + // Handle the integer rem common cases + if (Instruction *Common = commonIRemTransforms(I)) + return Common; + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + { + const APInt *Y; + // X % -Y -> X % Y + if (match(Op1, m_Negative(Y)) && !Y->isMinSignedValue()) { + Worklist.AddValue(I.getOperand(1)); + I.setOperand(1, ConstantInt::get(I.getType(), -*Y)); + return &I; + } + } + + // -X srem Y --> -(X srem Y) + Value *X, *Y; + if (match(&I, m_SRem(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y)))) + return BinaryOperator::CreateNSWNeg(Builder.CreateSRem(X, Y)); + + // If the sign bits of both operands are zero (i.e. we can prove they are + // unsigned inputs), turn this into a urem. + APInt Mask(APInt::getSignMask(I.getType()->getScalarSizeInBits())); + if (MaskedValueIsZero(Op1, Mask, 0, &I) && + MaskedValueIsZero(Op0, Mask, 0, &I)) { + // X srem Y -> X urem Y, iff X and Y don't have sign bit set + return BinaryOperator::CreateURem(Op0, Op1, I.getName()); + } + + // If it's a constant vector, flip any negative values positive. + if (isa<ConstantVector>(Op1) || isa<ConstantDataVector>(Op1)) { + Constant *C = cast<Constant>(Op1); + unsigned VWidth = C->getType()->getVectorNumElements(); + + bool hasNegative = false; + bool hasMissing = false; + for (unsigned i = 0; i != VWidth; ++i) { + Constant *Elt = C->getAggregateElement(i); + if (!Elt) { + hasMissing = true; + break; + } + + if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elt)) + if (RHS->isNegative()) + hasNegative = true; + } + + if (hasNegative && !hasMissing) { + SmallVector<Constant *, 16> Elts(VWidth); + for (unsigned i = 0; i != VWidth; ++i) { + Elts[i] = C->getAggregateElement(i); // Handle undef, etc. + if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elts[i])) { + if (RHS->isNegative()) + Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS)); + } + } + + Constant *NewRHSV = ConstantVector::get(Elts); + if (NewRHSV != C) { // Don't loop on -MININT + Worklist.AddValue(I.getOperand(1)); + I.setOperand(1, NewRHSV); + return &I; + } + } + } + + return nullptr; +} + +Instruction *InstCombiner::visitFRem(BinaryOperator &I) { + if (Value *V = SimplifyFRemInst(I.getOperand(0), I.getOperand(1), + I.getFastMathFlags(), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldVectorBinop(I)) + return X; + + return nullptr; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp new file mode 100644 index 000000000000..e0376b7582f3 --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -0,0 +1,1266 @@ +//===- InstCombinePHI.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the visitPHINode function. +// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/PatternMatch.h" +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "instcombine" + +static cl::opt<unsigned> +MaxNumPhis("instcombine-max-num-phis", cl::init(512), + cl::desc("Maximum number phis to handle in intptr/ptrint folding")); + +/// The PHI arguments will be folded into a single operation with a PHI node +/// as input. The debug location of the single operation will be the merged +/// locations of the original PHI node arguments. +void InstCombiner::PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN) { + auto *FirstInst = cast<Instruction>(PN.getIncomingValue(0)); + Inst->setDebugLoc(FirstInst->getDebugLoc()); + // We do not expect a CallInst here, otherwise, N-way merging of DebugLoc + // will be inefficient. + assert(!isa<CallInst>(Inst)); + + for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { + auto *I = cast<Instruction>(PN.getIncomingValue(i)); + Inst->applyMergedLocation(Inst->getDebugLoc(), I->getDebugLoc()); + } +} + +// Replace Integer typed PHI PN if the PHI's value is used as a pointer value. +// If there is an existing pointer typed PHI that produces the same value as PN, +// replace PN and the IntToPtr operation with it. Otherwise, synthesize a new +// PHI node: +// +// Case-1: +// bb1: +// int_init = PtrToInt(ptr_init) +// br label %bb2 +// bb2: +// int_val = PHI([int_init, %bb1], [int_val_inc, %bb2] +// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2] +// ptr_val2 = IntToPtr(int_val) +// ... +// use(ptr_val2) +// ptr_val_inc = ... +// inc_val_inc = PtrToInt(ptr_val_inc) +// +// ==> +// bb1: +// br label %bb2 +// bb2: +// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2] +// ... +// use(ptr_val) +// ptr_val_inc = ... +// +// Case-2: +// bb1: +// int_ptr = BitCast(ptr_ptr) +// int_init = Load(int_ptr) +// br label %bb2 +// bb2: +// int_val = PHI([int_init, %bb1], [int_val_inc, %bb2] +// ptr_val2 = IntToPtr(int_val) +// ... +// use(ptr_val2) +// ptr_val_inc = ... +// inc_val_inc = PtrToInt(ptr_val_inc) +// ==> +// bb1: +// ptr_init = Load(ptr_ptr) +// br label %bb2 +// bb2: +// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2] +// ... +// use(ptr_val) +// ptr_val_inc = ... +// ... 
+// +Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) { + if (!PN.getType()->isIntegerTy()) + return nullptr; + if (!PN.hasOneUse()) + return nullptr; + + auto *IntToPtr = dyn_cast<IntToPtrInst>(PN.user_back()); + if (!IntToPtr) + return nullptr; + + // Check if the pointer is actually used as pointer: + auto HasPointerUse = [](Instruction *IIP) { + for (User *U : IIP->users()) { + Value *Ptr = nullptr; + if (LoadInst *LoadI = dyn_cast<LoadInst>(U)) { + Ptr = LoadI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + Ptr = SI->getPointerOperand(); + } else if (GetElementPtrInst *GI = dyn_cast<GetElementPtrInst>(U)) { + Ptr = GI->getPointerOperand(); + } + + if (Ptr && Ptr == IIP) + return true; + } + return false; + }; + + if (!HasPointerUse(IntToPtr)) + return nullptr; + + if (DL.getPointerSizeInBits(IntToPtr->getAddressSpace()) != + DL.getTypeSizeInBits(IntToPtr->getOperand(0)->getType())) + return nullptr; + + SmallVector<Value *, 4> AvailablePtrVals; + for (unsigned i = 0; i != PN.getNumIncomingValues(); ++i) { + Value *Arg = PN.getIncomingValue(i); + + // First look backward: + if (auto *PI = dyn_cast<PtrToIntInst>(Arg)) { + AvailablePtrVals.emplace_back(PI->getOperand(0)); + continue; + } + + // Next look forward: + Value *ArgIntToPtr = nullptr; + for (User *U : Arg->users()) { + if (isa<IntToPtrInst>(U) && U->getType() == IntToPtr->getType() && + (DT.dominates(cast<Instruction>(U), PN.getIncomingBlock(i)) || + cast<Instruction>(U)->getParent() == PN.getIncomingBlock(i))) { + ArgIntToPtr = U; + break; + } + } + + if (ArgIntToPtr) { + AvailablePtrVals.emplace_back(ArgIntToPtr); + continue; + } + + // If Arg is defined by a PHI, allow it. This will also create + // more opportunities iteratively. + if (isa<PHINode>(Arg)) { + AvailablePtrVals.emplace_back(Arg); + continue; + } + + // For a single use integer load: + auto *LoadI = dyn_cast<LoadInst>(Arg); + if (!LoadI) + return nullptr; + + if (!LoadI->hasOneUse()) + return nullptr; + + // Push the integer typed Load instruction into the available + // value set, and fix it up later when the pointer typed PHI + // is synthesized. + AvailablePtrVals.emplace_back(LoadI); + } + + // Now search for a matching PHI + auto *BB = PN.getParent(); + assert(AvailablePtrVals.size() == PN.getNumIncomingValues() && + "Not enough available ptr typed incoming values"); + PHINode *MatchingPtrPHI = nullptr; + unsigned NumPhis = 0; + for (auto II = BB->begin(), EI = BasicBlock::iterator(BB->getFirstNonPHI()); + II != EI; II++, NumPhis++) { + // FIXME: consider handling this in AggressiveInstCombine + if (NumPhis > MaxNumPhis) + return nullptr; + PHINode *PtrPHI = dyn_cast<PHINode>(II); + if (!PtrPHI || PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType()) + continue; + MatchingPtrPHI = PtrPHI; + for (unsigned i = 0; i != PtrPHI->getNumIncomingValues(); ++i) { + if (AvailablePtrVals[i] != + PtrPHI->getIncomingValueForBlock(PN.getIncomingBlock(i))) { + MatchingPtrPHI = nullptr; + break; + } + } + + if (MatchingPtrPHI) + break; + } + + if (MatchingPtrPHI) { + assert(MatchingPtrPHI->getType() == IntToPtr->getType() && + "Phi's Type does not match with IntToPtr"); + // The PtrToCast + IntToPtr will be simplified later + return CastInst::CreateBitOrPointerCast(MatchingPtrPHI, + IntToPtr->getOperand(0)->getType()); + } + + // If it requires a conversion for every PHI operand, do not do it. 
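+ // i.e. give up unless at least one incoming value is already a pointer of the
+ // right type that was not itself produced by an inttoptr.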
+ if (all_of(AvailablePtrVals, [&](Value *V) { + return (V->getType() != IntToPtr->getType()) || isa<IntToPtrInst>(V); + })) + return nullptr; + + // If any of the operand that requires casting is a terminator + // instruction, do not do it. + if (any_of(AvailablePtrVals, [&](Value *V) { + if (V->getType() == IntToPtr->getType()) + return false; + + auto *Inst = dyn_cast<Instruction>(V); + return Inst && Inst->isTerminator(); + })) + return nullptr; + + PHINode *NewPtrPHI = PHINode::Create( + IntToPtr->getType(), PN.getNumIncomingValues(), PN.getName() + ".ptr"); + + InsertNewInstBefore(NewPtrPHI, PN); + SmallDenseMap<Value *, Instruction *> Casts; + for (unsigned i = 0; i != PN.getNumIncomingValues(); ++i) { + auto *IncomingBB = PN.getIncomingBlock(i); + auto *IncomingVal = AvailablePtrVals[i]; + + if (IncomingVal->getType() == IntToPtr->getType()) { + NewPtrPHI->addIncoming(IncomingVal, IncomingBB); + continue; + } + +#ifndef NDEBUG + LoadInst *LoadI = dyn_cast<LoadInst>(IncomingVal); + assert((isa<PHINode>(IncomingVal) || + IncomingVal->getType()->isPointerTy() || + (LoadI && LoadI->hasOneUse())) && + "Can not replace LoadInst with multiple uses"); +#endif + // Need to insert a BitCast. + // For an integer Load instruction with a single use, the load + IntToPtr + // cast will be simplified into a pointer load: + // %v = load i64, i64* %a.ip, align 8 + // %v.cast = inttoptr i64 %v to float ** + // ==> + // %v.ptrp = bitcast i64 * %a.ip to float ** + // %v.cast = load float *, float ** %v.ptrp, align 8 + Instruction *&CI = Casts[IncomingVal]; + if (!CI) { + CI = CastInst::CreateBitOrPointerCast(IncomingVal, IntToPtr->getType(), + IncomingVal->getName() + ".ptr"); + if (auto *IncomingI = dyn_cast<Instruction>(IncomingVal)) { + BasicBlock::iterator InsertPos(IncomingI); + InsertPos++; + if (isa<PHINode>(IncomingI)) + InsertPos = IncomingI->getParent()->getFirstInsertionPt(); + InsertNewInstBefore(CI, *InsertPos); + } else { + auto *InsertBB = &IncomingBB->getParent()->getEntryBlock(); + InsertNewInstBefore(CI, *InsertBB->getFirstInsertionPt()); + } + } + NewPtrPHI->addIncoming(CI, IncomingBB); + } + + // The PtrToCast + IntToPtr will be simplified later + return CastInst::CreateBitOrPointerCast(NewPtrPHI, + IntToPtr->getOperand(0)->getType()); +} + +/// If we have something like phi [add (a,b), add(a,c)] and if a/b/c and the +/// adds all have a single use, turn this into a phi and a single binop. +Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { + Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0)); + assert(isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)); + unsigned Opc = FirstInst->getOpcode(); + Value *LHSVal = FirstInst->getOperand(0); + Value *RHSVal = FirstInst->getOperand(1); + + Type *LHSType = LHSVal->getType(); + Type *RHSType = RHSVal->getType(); + + // Scan to see if all operands are the same opcode, and all have one use. + for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { + Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i)); + if (!I || I->getOpcode() != Opc || !I->hasOneUse() || + // Verify type of the LHS matches so we don't fold cmp's of different + // types. + I->getOperand(0)->getType() != LHSType || + I->getOperand(1)->getType() != RHSType) + return nullptr; + + // If they are CmpInst instructions, check their predicates + if (CmpInst *CI = dyn_cast<CmpInst>(I)) + if (CI->getPredicate() != cast<CmpInst>(FirstInst)->getPredicate()) + return nullptr; + + // Keep track of which operand needs a phi node. 
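+ // e.g. for phi [ (add %a, %b), (add %a, %c) ] the LHS %a is common to all
+ // incoming values, so only the RHS needs a new phi: add %a, phi [ %b, %c ].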
+ if (I->getOperand(0) != LHSVal) LHSVal = nullptr; + if (I->getOperand(1) != RHSVal) RHSVal = nullptr; + } + + // If both LHS and RHS would need a PHI, don't do this transformation, + // because it would increase the number of PHIs entering the block, + // which leads to higher register pressure. This is especially + // bad when the PHIs are in the header of a loop. + if (!LHSVal && !RHSVal) + return nullptr; + + // Otherwise, this is safe to transform! + + Value *InLHS = FirstInst->getOperand(0); + Value *InRHS = FirstInst->getOperand(1); + PHINode *NewLHS = nullptr, *NewRHS = nullptr; + if (!LHSVal) { + NewLHS = PHINode::Create(LHSType, PN.getNumIncomingValues(), + FirstInst->getOperand(0)->getName() + ".pn"); + NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0)); + InsertNewInstBefore(NewLHS, PN); + LHSVal = NewLHS; + } + + if (!RHSVal) { + NewRHS = PHINode::Create(RHSType, PN.getNumIncomingValues(), + FirstInst->getOperand(1)->getName() + ".pn"); + NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0)); + InsertNewInstBefore(NewRHS, PN); + RHSVal = NewRHS; + } + + // Add all operands to the new PHIs. + if (NewLHS || NewRHS) { + for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { + Instruction *InInst = cast<Instruction>(PN.getIncomingValue(i)); + if (NewLHS) { + Value *NewInLHS = InInst->getOperand(0); + NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i)); + } + if (NewRHS) { + Value *NewInRHS = InInst->getOperand(1); + NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i)); + } + } + } + + if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) { + CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), + LHSVal, RHSVal); + PHIArgMergedDebugLoc(NewCI, PN); + return NewCI; + } + + BinaryOperator *BinOp = cast<BinaryOperator>(FirstInst); + BinaryOperator *NewBinOp = + BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal); + + NewBinOp->copyIRFlags(PN.getIncomingValue(0)); + + for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) + NewBinOp->andIRFlags(PN.getIncomingValue(i)); + + PHIArgMergedDebugLoc(NewBinOp, PN); + return NewBinOp; +} + +Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { + GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0)); + + SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(), + FirstInst->op_end()); + // This is true if all GEP bases are allocas and if all indices into them are + // constants. + bool AllBasePointersAreAllocas = true; + + // We don't want to replace this phi if the replacement would require + // more than one phi, which leads to higher register pressure. This is + // especially bad when the PHIs are in the header of a loop. + bool NeededPhi = false; + + bool AllInBounds = true; + + // Scan to see if all operands are the same opcode, and all have one use. + for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { + GetElementPtrInst *GEP= dyn_cast<GetElementPtrInst>(PN.getIncomingValue(i)); + if (!GEP || !GEP->hasOneUse() || GEP->getType() != FirstInst->getType() || + GEP->getNumOperands() != FirstInst->getNumOperands()) + return nullptr; + + AllInBounds &= GEP->isInBounds(); + + // Keep track of whether or not all GEPs are of alloca pointers. + if (AllBasePointersAreAllocas && + (!isa<AllocaInst>(GEP->getOperand(0)) || + !GEP->hasAllConstantIndices())) + AllBasePointersAreAllocas = false; + + // Compare the operand lists. 
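+ // e.g. phi [ (gep %base, 0, %i), (gep %base, 0, %j) ] can become
+ // gep %base, 0, (phi [ %i, %j ]), provided at most one operand differs.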
+ for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) { + if (FirstInst->getOperand(op) == GEP->getOperand(op)) + continue; + + // Don't merge two GEPs when two operands differ (introducing phi nodes) + // if one of the PHIs has a constant for the index. The index may be + // substantially cheaper to compute for the constants, so making it a + // variable index could pessimize the path. This also handles the case + // for struct indices, which must always be constant. + if (isa<ConstantInt>(FirstInst->getOperand(op)) || + isa<ConstantInt>(GEP->getOperand(op))) + return nullptr; + + if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType()) + return nullptr; + + // If we already needed a PHI for an earlier operand, and another operand + // also requires a PHI, we'd be introducing more PHIs than we're + // eliminating, which increases register pressure on entry to the PHI's + // block. + if (NeededPhi) + return nullptr; + + FixedOperands[op] = nullptr; // Needs a PHI. + NeededPhi = true; + } + } + + // If all of the base pointers of the PHI'd GEPs are from allocas, don't + // bother doing this transformation. At best, this will just save a bit of + // offset calculation, but all the predecessors will have to materialize the + // stack address into a register anyway. We'd actually rather *clone* the + // load up into the predecessors so that we have a load of a gep of an alloca, + // which can usually all be folded into the load. + if (AllBasePointersAreAllocas) + return nullptr; + + // Otherwise, this is safe to transform. Insert PHI nodes for each operand + // that is variable. + SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size()); + + bool HasAnyPHIs = false; + for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) { + if (FixedOperands[i]) continue; // operand doesn't need a phi. + Value *FirstOp = FirstInst->getOperand(i); + PHINode *NewPN = PHINode::Create(FirstOp->getType(), e, + FirstOp->getName()+".pn"); + InsertNewInstBefore(NewPN, PN); + + NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0)); + OperandPhis[i] = NewPN; + FixedOperands[i] = NewPN; + HasAnyPHIs = true; + } + + + // Add all operands to the new PHIs. + if (HasAnyPHIs) { + for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { + GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i)); + BasicBlock *InBB = PN.getIncomingBlock(i); + + for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op) + if (PHINode *OpPhi = OperandPhis[op]) + OpPhi->addIncoming(InGEP->getOperand(op), InBB); + } + } + + Value *Base = FixedOperands[0]; + GetElementPtrInst *NewGEP = + GetElementPtrInst::Create(FirstInst->getSourceElementType(), Base, + makeArrayRef(FixedOperands).slice(1)); + if (AllInBounds) NewGEP->setIsInBounds(); + PHIArgMergedDebugLoc(NewGEP, PN); + return NewGEP; +} + + +/// Return true if we know that it is safe to sink the load out of the block +/// that defines it. This means that it must be obvious the value of the load is +/// not changed from the point of the load to the end of the block it is in. +/// +/// Finally, it is safe, but not profitable, to sink a load targeting a +/// non-address-taken alloca. Doing so will cause us to not promote the alloca +/// to a register. +static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { + BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end(); + + for (++BBI; BBI != E; ++BBI) + if (BBI->mayWriteToMemory()) + return false; + + // Check for non-address taken alloca. 
If not address-taken already, it isn't + // profitable to do this xform. + if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) { + bool isAddressTaken = false; + for (User *U : AI->users()) { + if (isa<LoadInst>(U)) continue; + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + // If storing TO the alloca, then the address isn't taken. + if (SI->getOperand(1) == AI) continue; + } + isAddressTaken = true; + break; + } + + if (!isAddressTaken && AI->isStaticAlloca()) + return false; + } + + // If this load is a load from a GEP with a constant offset from an alloca, + // then we don't want to sink it. In its present form, it will be + // load [constant stack offset]. Sinking it will cause us to have to + // materialize the stack addresses in each predecessor in a register only to + // do a shared load from register in the successor. + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(L->getOperand(0))) + if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0))) + if (AI->isStaticAlloca() && GEP->hasAllConstantIndices()) + return false; + + return true; +} + +Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { + LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0)); + + // FIXME: This is overconservative; this transform is allowed in some cases + // for atomic operations. + if (FirstLI->isAtomic()) + return nullptr; + + // When processing loads, we need to propagate two bits of information to the + // sunk load: whether it is volatile, and what its alignment is. We currently + // don't sink loads when some have their alignment specified and some don't. + // visitLoadInst will propagate an alignment onto the load when TD is around, + // and if TD isn't around, we can't handle the mixed case. + bool isVolatile = FirstLI->isVolatile(); + MaybeAlign LoadAlignment(FirstLI->getAlignment()); + unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace(); + + // We can't sink the load if the loaded value could be modified between the + // load and the PHI. + if (FirstLI->getParent() != PN.getIncomingBlock(0) || + !isSafeAndProfitableToSinkLoad(FirstLI)) + return nullptr; + + // If the PHI is of volatile loads and the load block has multiple + // successors, sinking it would remove a load of the volatile value from + // the path through the other successor. + if (isVolatile && + FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1) + return nullptr; + + // Check to see if all arguments are the same operation. + for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { + LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i)); + if (!LI || !LI->hasOneUse()) + return nullptr; + + // We can't sink the load if the loaded value could be modified between + // the load and the PHI. + if (LI->isVolatile() != isVolatile || + LI->getParent() != PN.getIncomingBlock(i) || + LI->getPointerAddressSpace() != LoadAddrSpace || + !isSafeAndProfitableToSinkLoad(LI)) + return nullptr; + + // If some of the loads have an alignment specified but not all of them, + // we can't do the transformation. + if ((LoadAlignment.hasValue()) != (LI->getAlignment() != 0)) + return nullptr; + + LoadAlignment = std::min(LoadAlignment, MaybeAlign(LI->getAlignment())); + + // If the PHI is of volatile loads and the load block has multiple + // successors, sinking it would remove a load of the volatile value from + // the path through the other successor. 
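+ // e.g. if the incoming block ends in a conditional branch, the path through
+ // the other successor would no longer perform the volatile load at all.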
+ if (isVolatile && + LI->getParent()->getTerminator()->getNumSuccessors() != 1) + return nullptr; + } + + // Okay, they are all the same operation. Create a new PHI node of the + // correct type, and PHI together all of the LHS's of the instructions. + PHINode *NewPN = PHINode::Create(FirstLI->getOperand(0)->getType(), + PN.getNumIncomingValues(), + PN.getName()+".in"); + + Value *InVal = FirstLI->getOperand(0); + NewPN->addIncoming(InVal, PN.getIncomingBlock(0)); + LoadInst *NewLI = + new LoadInst(FirstLI->getType(), NewPN, "", isVolatile, LoadAlignment); + + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, + LLVMContext::MD_range, + LLVMContext::MD_invariant_load, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_nonnull, + LLVMContext::MD_align, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null, + LLVMContext::MD_access_group, + }; + + for (unsigned ID : KnownIDs) + NewLI->setMetadata(ID, FirstLI->getMetadata(ID)); + + // Add all operands to the new PHI and combine TBAA metadata. + for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { + LoadInst *LI = cast<LoadInst>(PN.getIncomingValue(i)); + combineMetadata(NewLI, LI, KnownIDs, true); + Value *NewInVal = LI->getOperand(0); + if (NewInVal != InVal) + InVal = nullptr; + NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i)); + } + + if (InVal) { + // The new PHI unions all of the same values together. This is really + // common, so we handle it intelligently here for compile-time speed. + NewLI->setOperand(0, InVal); + delete NewPN; + } else { + InsertNewInstBefore(NewPN, PN); + } + + // If this was a volatile load that we are merging, make sure to loop through + // and mark all the input loads as non-volatile. If we don't do this, we will + // insert a new volatile load and the old ones will not be deletable. + if (isVolatile) + for (Value *IncValue : PN.incoming_values()) + cast<LoadInst>(IncValue)->setVolatile(false); + + PHIArgMergedDebugLoc(NewLI, PN); + return NewLI; +} + +/// TODO: This function could handle other cast types, but then it might +/// require special-casing a cast from the 'i1' type. See the comment in +/// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types. +Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) { + // We cannot create a new instruction after the PHI if the terminator is an + // EHPad because there is no valid insertion point. + if (Instruction *TI = Phi.getParent()->getTerminator()) + if (TI->isEHPad()) + return nullptr; + + // Early exit for the common case of a phi with two operands. These are + // handled elsewhere. See the comment below where we check the count of zexts + // and constants for more details. + unsigned NumIncomingValues = Phi.getNumIncomingValues(); + if (NumIncomingValues < 3) + return nullptr; + + // Find the narrower type specified by the first zext. + Type *NarrowType = nullptr; + for (Value *V : Phi.incoming_values()) { + if (auto *Zext = dyn_cast<ZExtInst>(V)) { + NarrowType = Zext->getSrcTy(); + break; + } + } + if (!NarrowType) + return nullptr; + + // Walk the phi operands checking that we only have zexts or constants that + // we can shrink for free. Store the new operands for the new phi. + SmallVector<Value *, 4> NewIncoming; + unsigned NumZexts = 0; + unsigned NumConsts = 0; + for (Value *V : Phi.incoming_values()) { + if (auto *Zext = dyn_cast<ZExtInst>(V)) { + // All zexts must be identical and have one use. 
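+ // e.g. phi [ (zext i8 %a to i32), (zext i8 %b to i32), i32 7 ] becomes
+ // zext (phi [ %a, %b, i8 7 ]) to i32 once every operand has been shrunk.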
+ if (Zext->getSrcTy() != NarrowType || !Zext->hasOneUse()) + return nullptr; + NewIncoming.push_back(Zext->getOperand(0)); + NumZexts++; + } else if (auto *C = dyn_cast<Constant>(V)) { + // Make sure that constants can fit in the new type. + Constant *Trunc = ConstantExpr::getTrunc(C, NarrowType); + if (ConstantExpr::getZExt(Trunc, C->getType()) != C) + return nullptr; + NewIncoming.push_back(Trunc); + NumConsts++; + } else { + // If it's not a cast or a constant, bail out. + return nullptr; + } + } + + // The more common cases of a phi with no constant operands or just one + // variable operand are handled by FoldPHIArgOpIntoPHI() and foldOpIntoPhi() + // respectively. foldOpIntoPhi() wants to do the opposite transform that is + // performed here. It tries to replicate a cast in the phi operand's basic + // block to expose other folding opportunities. Thus, InstCombine will + // infinite loop without this check. + if (NumConsts == 0 || NumZexts < 2) + return nullptr; + + // All incoming values are zexts or constants that are safe to truncate. + // Create a new phi node of the narrow type, phi together all of the new + // operands, and zext the result back to the original type. + PHINode *NewPhi = PHINode::Create(NarrowType, NumIncomingValues, + Phi.getName() + ".shrunk"); + for (unsigned i = 0; i != NumIncomingValues; ++i) + NewPhi->addIncoming(NewIncoming[i], Phi.getIncomingBlock(i)); + + InsertNewInstBefore(NewPhi, Phi); + return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType()); +} + +/// If all operands to a PHI node are the same "unary" operator and they all are +/// only used by the PHI, PHI together their inputs, and do the operation once, +/// to the result of the PHI. +Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { + // We cannot create a new instruction after the PHI if the terminator is an + // EHPad because there is no valid insertion point. + if (Instruction *TI = PN.getParent()->getTerminator()) + if (TI->isEHPad()) + return nullptr; + + Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0)); + + if (isa<GetElementPtrInst>(FirstInst)) + return FoldPHIArgGEPIntoPHI(PN); + if (isa<LoadInst>(FirstInst)) + return FoldPHIArgLoadIntoPHI(PN); + + // Scan the instruction, looking for input operations that can be folded away. + // If all input operands to the phi are the same instruction (e.g. a cast from + // the same type or "+42") we can pull the operation through the PHI, reducing + // code size and simplifying code. + Constant *ConstantOp = nullptr; + Type *CastSrcTy = nullptr; + + if (isa<CastInst>(FirstInst)) { + CastSrcTy = FirstInst->getOperand(0)->getType(); + + // Be careful about transforming integer PHIs. We don't want to pessimize + // the code by turning an i32 into an i1293. + if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) { + if (!shouldChangeType(PN.getType(), CastSrcTy)) + return nullptr; + } + } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) { + // Can fold binop, compare or shift here if the RHS is a constant, + // otherwise call FoldPHIArgBinOpIntoPHI. + ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1)); + if (!ConstantOp) + return FoldPHIArgBinOpIntoPHI(PN); + } else { + return nullptr; // Cannot fold this operation. + } + + // Check to see if all arguments are the same operation. 
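+ // e.g. phi [ (trunc i64 %a to i32), (trunc i64 %b to i32) ] becomes
+ // trunc (phi [ %a, %b ]) to i32, and phi [ (add %a, 4), (add %b, 4) ]
+ // becomes add (phi [ %a, %b ]), 4.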
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { + Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i)); + if (!I || !I->hasOneUse() || !I->isSameOperationAs(FirstInst)) + return nullptr; + if (CastSrcTy) { + if (I->getOperand(0)->getType() != CastSrcTy) + return nullptr; // Cast operation must match. + } else if (I->getOperand(1) != ConstantOp) { + return nullptr; + } + } + + // Okay, they are all the same operation. Create a new PHI node of the + // correct type, and PHI together all of the LHS's of the instructions. + PHINode *NewPN = PHINode::Create(FirstInst->getOperand(0)->getType(), + PN.getNumIncomingValues(), + PN.getName()+".in"); + + Value *InVal = FirstInst->getOperand(0); + NewPN->addIncoming(InVal, PN.getIncomingBlock(0)); + + // Add all operands to the new PHI. + for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { + Value *NewInVal = cast<Instruction>(PN.getIncomingValue(i))->getOperand(0); + if (NewInVal != InVal) + InVal = nullptr; + NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i)); + } + + Value *PhiVal; + if (InVal) { + // The new PHI unions all of the same values together. This is really + // common, so we handle it intelligently here for compile-time speed. + PhiVal = InVal; + delete NewPN; + } else { + InsertNewInstBefore(NewPN, PN); + PhiVal = NewPN; + } + + // Insert and return the new operation. + if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst)) { + CastInst *NewCI = CastInst::Create(FirstCI->getOpcode(), PhiVal, + PN.getType()); + PHIArgMergedDebugLoc(NewCI, PN); + return NewCI; + } + + if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) { + BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp); + BinOp->copyIRFlags(PN.getIncomingValue(0)); + + for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) + BinOp->andIRFlags(PN.getIncomingValue(i)); + + PHIArgMergedDebugLoc(BinOp, PN); + return BinOp; + } + + CmpInst *CIOp = cast<CmpInst>(FirstInst); + CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), + PhiVal, ConstantOp); + PHIArgMergedDebugLoc(NewCI, PN); + return NewCI; +} + +/// Return true if this PHI node is only used by a PHI node cycle that is dead. +static bool DeadPHICycle(PHINode *PN, + SmallPtrSetImpl<PHINode*> &PotentiallyDeadPHIs) { + if (PN->use_empty()) return true; + if (!PN->hasOneUse()) return false; + + // Remember this node, and if we find the cycle, return. + if (!PotentiallyDeadPHIs.insert(PN).second) + return true; + + // Don't scan crazily complex things. + if (PotentiallyDeadPHIs.size() == 16) + return false; + + if (PHINode *PU = dyn_cast<PHINode>(PN->user_back())) + return DeadPHICycle(PU, PotentiallyDeadPHIs); + + return false; +} + +/// Return true if this phi node is always equal to NonPhiInVal. +/// This happens with mutually cyclic phi nodes like: +/// z = some value; x = phi (y, z); y = phi (x, z) +static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal, + SmallPtrSetImpl<PHINode*> &ValueEqualPHIs) { + // See if we already saw this PHI node. + if (!ValueEqualPHIs.insert(PN).second) + return true; + + // Don't scan crazily complex things. + if (ValueEqualPHIs.size() == 16) + return false; + + // Scan the operands to see if they are either phi nodes or are equal to + // the value. 
+ for (Value *Op : PN->incoming_values()) {
+ if (PHINode *OpPN = dyn_cast<PHINode>(Op)) {
+ if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs))
+ return false;
+ } else if (Op != NonPhiInVal)
+ return false;
+ }
+
+ return true;
+}
+
+/// Return an existing non-zero constant if this phi node has one, otherwise
+/// return constant 1.
+static ConstantInt *GetAnyNonZeroConstInt(PHINode &PN) {
+ assert(isa<IntegerType>(PN.getType()) && "Expect only integer type phi");
+ for (Value *V : PN.operands())
+ if (auto *ConstVA = dyn_cast<ConstantInt>(V))
+ if (!ConstVA->isZero())
+ return ConstVA;
+ return ConstantInt::get(cast<IntegerType>(PN.getType()), 1);
+}
+
+namespace {
+struct PHIUsageRecord {
+ unsigned PHIId; // The ID # of the PHI (something deterministic to sort on)
+ unsigned Shift; // The amount shifted.
+ Instruction *Inst; // The trunc instruction.
+
+ PHIUsageRecord(unsigned pn, unsigned Sh, Instruction *User)
+ : PHIId(pn), Shift(Sh), Inst(User) {}
+
+ bool operator<(const PHIUsageRecord &RHS) const {
+ if (PHIId < RHS.PHIId) return true;
+ if (PHIId > RHS.PHIId) return false;
+ if (Shift < RHS.Shift) return true;
+ if (Shift > RHS.Shift) return false;
+ return Inst->getType()->getPrimitiveSizeInBits() <
+ RHS.Inst->getType()->getPrimitiveSizeInBits();
+ }
+};
+
+struct LoweredPHIRecord {
+ PHINode *PN; // The PHI that was lowered.
+ unsigned Shift; // The amount shifted.
+ unsigned Width; // The width extracted.
+
+ LoweredPHIRecord(PHINode *pn, unsigned Sh, Type *Ty)
+ : PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {}
+
+ // Ctor form used by DenseMap.
+ LoweredPHIRecord(PHINode *pn, unsigned Sh)
+ : PN(pn), Shift(Sh), Width(0) {}
+};
+}
+
+namespace llvm {
+ template<>
+ struct DenseMapInfo<LoweredPHIRecord> {
+ static inline LoweredPHIRecord getEmptyKey() {
+ return LoweredPHIRecord(nullptr, 0);
+ }
+ static inline LoweredPHIRecord getTombstoneKey() {
+ return LoweredPHIRecord(nullptr, 1);
+ }
+ static unsigned getHashValue(const LoweredPHIRecord &Val) {
+ return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^
+ (Val.Width>>3);
+ }
+ static bool isEqual(const LoweredPHIRecord &LHS,
+ const LoweredPHIRecord &RHS) {
+ return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift &&
+ LHS.Width == RHS.Width;
+ }
+ };
+}
+
+
+/// This is an integer PHI and we know that it has an illegal type: see if it is
+/// only used by trunc or trunc(lshr) operations. If so, we split the PHI into
+/// the various pieces being extracted. This sort of thing is introduced when
+/// SROA promotes an aggregate to large integer values.
+///
+/// TODO: The user of the trunc may be a bitcast to float/double/vector or an
+/// inttoptr. We should produce new PHIs in the right type.
+///
+Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
+ // PHIUsers - Keep track of all of the truncated values extracted from a set
+ // of PHIs, along with their offset. These are the things we want to rewrite.
+ SmallVector<PHIUsageRecord, 16> PHIUsers;
+
+ // PHIs are often mutually cyclic, so we keep track of a whole set of PHI
+ // nodes which are extracted from. PHIsInspected is a set we use to avoid
+ // revisiting PHIs, and PHIsToSlice is an ordered list of PHIs that we need to
+ // check the uses of (to ensure they are all extracts). 
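+ // For example, an i128 phi (illegal for the target) whose only uses are
+ // (trunc ... to i64) and (trunc (lshr ..., 64) to i64) ends up sliced into
+ // two i64 phis, one per extracted piece.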
+ SmallVector<PHINode*, 8> PHIsToSlice; + SmallPtrSet<PHINode*, 8> PHIsInspected; + + PHIsToSlice.push_back(&FirstPhi); + PHIsInspected.insert(&FirstPhi); + + for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) { + PHINode *PN = PHIsToSlice[PHIId]; + + // Scan the input list of the PHI. If any input is an invoke, and if the + // input is defined in the predecessor, then we won't be split the critical + // edge which is required to insert a truncate. Because of this, we have to + // bail out. + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + InvokeInst *II = dyn_cast<InvokeInst>(PN->getIncomingValue(i)); + if (!II) continue; + if (II->getParent() != PN->getIncomingBlock(i)) + continue; + + // If we have a phi, and if it's directly in the predecessor, then we have + // a critical edge where we need to put the truncate. Since we can't + // split the edge in instcombine, we have to bail out. + return nullptr; + } + + for (User *U : PN->users()) { + Instruction *UserI = cast<Instruction>(U); + + // If the user is a PHI, inspect its uses recursively. + if (PHINode *UserPN = dyn_cast<PHINode>(UserI)) { + if (PHIsInspected.insert(UserPN).second) + PHIsToSlice.push_back(UserPN); + continue; + } + + // Truncates are always ok. + if (isa<TruncInst>(UserI)) { + PHIUsers.push_back(PHIUsageRecord(PHIId, 0, UserI)); + continue; + } + + // Otherwise it must be a lshr which can only be used by one trunc. + if (UserI->getOpcode() != Instruction::LShr || + !UserI->hasOneUse() || !isa<TruncInst>(UserI->user_back()) || + !isa<ConstantInt>(UserI->getOperand(1))) + return nullptr; + + // Bail on out of range shifts. + unsigned SizeInBits = UserI->getType()->getScalarSizeInBits(); + if (cast<ConstantInt>(UserI->getOperand(1))->getValue().uge(SizeInBits)) + return nullptr; + + unsigned Shift = cast<ConstantInt>(UserI->getOperand(1))->getZExtValue(); + PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, UserI->user_back())); + } + } + + // If we have no users, they must be all self uses, just nuke the PHI. + if (PHIUsers.empty()) + return replaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType())); + + // If this phi node is transformable, create new PHIs for all the pieces + // extracted out of it. First, sort the users by their offset and size. + array_pod_sort(PHIUsers.begin(), PHIUsers.end()); + + LLVM_DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n'; + for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i) dbgs() + << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';); + + // PredValues - This is a temporary used when rewriting PHI nodes. It is + // hoisted out here to avoid construction/destruction thrashing. + DenseMap<BasicBlock*, Value*> PredValues; + + // ExtractedVals - Each new PHI we introduce is saved here so we don't + // introduce redundant PHIs. + DenseMap<LoweredPHIRecord, PHINode*> ExtractedVals; + + for (unsigned UserI = 0, UserE = PHIUsers.size(); UserI != UserE; ++UserI) { + unsigned PHIId = PHIUsers[UserI].PHIId; + PHINode *PN = PHIsToSlice[PHIId]; + unsigned Offset = PHIUsers[UserI].Shift; + Type *Ty = PHIUsers[UserI].Inst->getType(); + + PHINode *EltPHI; + + // If we've already lowered a user like this, reuse the previously lowered + // value. + if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == nullptr) { + + // Otherwise, Create the new PHI node for this user. 
+      EltPHI = PHINode::Create(Ty, PN->getNumIncomingValues(),
+                               PN->getName()+".off"+Twine(Offset), PN);
+      assert(EltPHI->getType() != PN->getType() &&
+             "Truncate didn't shrink phi?");
+
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        BasicBlock *Pred = PN->getIncomingBlock(i);
+        Value *&PredVal = PredValues[Pred];
+
+        // If we already have a value for this predecessor, reuse it.
+        if (PredVal) {
+          EltPHI->addIncoming(PredVal, Pred);
+          continue;
+        }
+
+        // Handle the PHI self-reuse case.
+        Value *InVal = PN->getIncomingValue(i);
+        if (InVal == PN) {
+          PredVal = EltPHI;
+          EltPHI->addIncoming(PredVal, Pred);
+          continue;
+        }
+
+        if (PHINode *InPHI = dyn_cast<PHINode>(InVal)) {
+          // If the incoming value is a PHI, and if it is one of the PHIs we
+          // have already rewritten, just use the lowered value.
+          if (Value *Res = ExtractedVals[LoweredPHIRecord(InPHI, Offset, Ty)]) {
+            PredVal = Res;
+            EltPHI->addIncoming(PredVal, Pred);
+            continue;
+          }
+        }
+
+        // Otherwise, do an extract in the predecessor.
+        Builder.SetInsertPoint(Pred->getTerminator());
+        Value *Res = InVal;
+        if (Offset)
+          Res = Builder.CreateLShr(Res, ConstantInt::get(InVal->getType(),
+                                                         Offset), "extract");
+        Res = Builder.CreateTrunc(Res, Ty, "extract.t");
+        PredVal = Res;
+        EltPHI->addIncoming(Res, Pred);
+
+        // If the incoming value was a PHI, and if it was one of the PHIs we
+        // are rewriting, we will ultimately delete the code we inserted. This
+        // means we need to revisit that PHI to make sure we extract out the
+        // needed piece.
+        if (PHINode *OldInVal = dyn_cast<PHINode>(PN->getIncomingValue(i)))
+          if (PHIsInspected.count(OldInVal)) {
+            unsigned RefPHIId =
+                find(PHIsToSlice, OldInVal) - PHIsToSlice.begin();
+            PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset,
+                                              cast<Instruction>(Res)));
+            ++UserE;
+          }
+      }
+      PredValues.clear();
+
+      LLVM_DEBUG(dbgs() << "  Made element PHI for offset " << Offset << ": "
+                        << *EltPHI << '\n');
+      ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
+    }
+
+    // Replace the use of this piece with the PHI node.
+    replaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI);
+  }
+
+  // Replace all the remaining uses of the PHI nodes (self uses and the lshrs)
+  // with undefs.
+  Value *Undef = UndefValue::get(FirstPhi.getType());
+  for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
+    replaceInstUsesWith(*PHIsToSlice[i], Undef);
+  return replaceInstUsesWith(FirstPhi, Undef);
+}
+
+// PHINode simplification
+//
+Instruction *InstCombiner::visitPHINode(PHINode &PN) {
+  if (Value *V = SimplifyInstruction(&PN, SQ.getWithInstruction(&PN)))
+    return replaceInstUsesWith(PN, V);
+
+  if (Instruction *Result = FoldPHIArgZextsIntoPHI(PN))
+    return Result;
+
+  // If all PHI operands are the same operation, pull them through the PHI,
+  // reducing code size.
+  if (isa<Instruction>(PN.getIncomingValue(0)) &&
+      isa<Instruction>(PN.getIncomingValue(1)) &&
+      cast<Instruction>(PN.getIncomingValue(0))->getOpcode() ==
+          cast<Instruction>(PN.getIncomingValue(1))->getOpcode() &&
+      // FIXME: The hasOneUse check will fail for PHIs that use the incoming
+      // value more than once (e.g. from several predecessors), even though
+      // the PHI is still the only user.
+      PN.getIncomingValue(0)->hasOneUse())
+    if (Instruction *Result = FoldPHIArgOpIntoPHI(PN))
+      return Result;
+
+  // If this is a trivial cycle in the PHI node graph, remove it. Basically, if
+  // this PHI only has a single use (a PHI), and if that PHI only has one use
+  // (a PHI)... break the cycle.
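+  // Illustration only (hypothetical IR): a dead PHI cycle can look like
+  //   %a = phi i32 [ 0, %entry ], [ %b, %latch ]   ; only used by %b
+  //   %b = phi i32 [ %a, %header ]                 ; only used by %a
+  // Neither value escapes the cycle, so both PHIs can be replaced with undef.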
+ if (PN.hasOneUse()) { + if (Instruction *Result = FoldIntegerTypedPHI(PN)) + return Result; + + Instruction *PHIUser = cast<Instruction>(PN.user_back()); + if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) { + SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs; + PotentiallyDeadPHIs.insert(&PN); + if (DeadPHICycle(PU, PotentiallyDeadPHIs)) + return replaceInstUsesWith(PN, UndefValue::get(PN.getType())); + } + + // If this phi has a single use, and if that use just computes a value for + // the next iteration of a loop, delete the phi. This occurs with unused + // induction variables, e.g. "for (int j = 0; ; ++j);". Detecting this + // common case here is good because the only other things that catch this + // are induction variable analysis (sometimes) and ADCE, which is only run + // late. + if (PHIUser->hasOneUse() && + (isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) && + PHIUser->user_back() == &PN) { + return replaceInstUsesWith(PN, UndefValue::get(PN.getType())); + } + // When a PHI is used only to be compared with zero, it is safe to replace + // an incoming value proved as known nonzero with any non-zero constant. + // For example, in the code below, the incoming value %v can be replaced + // with any non-zero constant based on the fact that the PHI is only used to + // be compared with zero and %v is a known non-zero value: + // %v = select %cond, 1, 2 + // %p = phi [%v, BB] ... + // icmp eq, %p, 0 + auto *CmpInst = dyn_cast<ICmpInst>(PHIUser); + // FIXME: To be simple, handle only integer type for now. + if (CmpInst && isa<IntegerType>(PN.getType()) && CmpInst->isEquality() && + match(CmpInst->getOperand(1), m_Zero())) { + ConstantInt *NonZeroConst = nullptr; + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { + Instruction *CtxI = PN.getIncomingBlock(i)->getTerminator(); + Value *VA = PN.getIncomingValue(i); + if (isKnownNonZero(VA, DL, 0, &AC, CtxI, &DT)) { + if (!NonZeroConst) + NonZeroConst = GetAnyNonZeroConstInt(PN); + PN.setIncomingValue(i, NonZeroConst); + } + } + } + } + + // We sometimes end up with phi cycles that non-obviously end up being the + // same value, for example: + // z = some value; x = phi (y, z); y = phi (x, z) + // where the phi nodes don't necessarily need to be in the same block. Do a + // quick check to see if the PHI node only contains a single non-phi value, if + // so, scan to see if the phi cycle is actually equal to that value. + { + unsigned InValNo = 0, NumIncomingVals = PN.getNumIncomingValues(); + // Scan for the first non-phi operand. + while (InValNo != NumIncomingVals && + isa<PHINode>(PN.getIncomingValue(InValNo))) + ++InValNo; + + if (InValNo != NumIncomingVals) { + Value *NonPhiInVal = PN.getIncomingValue(InValNo); + + // Scan the rest of the operands to see if there are any conflicts, if so + // there is no need to recursively scan other phis. + for (++InValNo; InValNo != NumIncomingVals; ++InValNo) { + Value *OpVal = PN.getIncomingValue(InValNo); + if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal)) + break; + } + + // If we scanned over all operands, then we have one unique value plus + // phi values. Scan PHI nodes to see if they all merge in each other or + // the value. + if (InValNo == NumIncomingVals) { + SmallPtrSet<PHINode*, 16> ValueEqualPHIs; + if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs)) + return replaceInstUsesWith(PN, NonPhiInVal); + } + } + } + + // If there are multiple PHIs, sort their operands so that they all list + // the blocks in the same order. 
This will help identical PHIs be eliminated + // by other passes. Other passes shouldn't depend on this for correctness + // however. + PHINode *FirstPN = cast<PHINode>(PN.getParent()->begin()); + if (&PN != FirstPN) + for (unsigned i = 0, e = FirstPN->getNumIncomingValues(); i != e; ++i) { + BasicBlock *BBA = PN.getIncomingBlock(i); + BasicBlock *BBB = FirstPN->getIncomingBlock(i); + if (BBA != BBB) { + Value *VA = PN.getIncomingValue(i); + unsigned j = PN.getBasicBlockIndex(BBB); + Value *VB = PN.getIncomingValue(j); + PN.setIncomingBlock(i, BBB); + PN.setIncomingValue(i, VB); + PN.setIncomingBlock(j, BBA); + PN.setIncomingValue(j, VA); + // NOTE: Instcombine normally would want us to "return &PN" if we + // modified any of the operands of an instruction. However, since we + // aren't adding or removing uses (just rearranging them) we don't do + // this in this case. + } + } + + // If this is an integer PHI and we know that it has an illegal type, see if + // it is only used by trunc or trunc(lshr) operations. If so, we split the + // PHI into the various pieces being extracted. This sort of thing is + // introduced when SROA promotes an aggregate to a single large integer type. + if (PN.getType()->isIntegerTy() && + !DL.isLegalInteger(PN.getType()->getPrimitiveSizeInBits())) + if (Instruction *Res = SliceUpIllegalIntegerPHI(PN)) + return Res; + + return nullptr; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp new file mode 100644 index 000000000000..9fc871e49b30 --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -0,0 +1,2666 @@ +//===- InstCombineSelect.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the visitSelect function. 
+// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CmpInstAnalysis.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include <cassert> +#include <utility> + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "instcombine" + +static Value *createMinMax(InstCombiner::BuilderTy &Builder, + SelectPatternFlavor SPF, Value *A, Value *B) { + CmpInst::Predicate Pred = getMinMaxPred(SPF); + assert(CmpInst::isIntPredicate(Pred) && "Expected integer predicate"); + return Builder.CreateSelect(Builder.CreateICmp(Pred, A, B), A, B); +} + +/// Replace a select operand based on an equality comparison with the identity +/// constant of a binop. +static Instruction *foldSelectBinOpIdentity(SelectInst &Sel, + const TargetLibraryInfo &TLI) { + // The select condition must be an equality compare with a constant operand. + Value *X; + Constant *C; + CmpInst::Predicate Pred; + if (!match(Sel.getCondition(), m_Cmp(Pred, m_Value(X), m_Constant(C)))) + return nullptr; + + bool IsEq; + if (ICmpInst::isEquality(Pred)) + IsEq = Pred == ICmpInst::ICMP_EQ; + else if (Pred == FCmpInst::FCMP_OEQ) + IsEq = true; + else if (Pred == FCmpInst::FCMP_UNE) + IsEq = false; + else + return nullptr; + + // A select operand must be a binop. + BinaryOperator *BO; + if (!match(Sel.getOperand(IsEq ? 1 : 2), m_BinOp(BO))) + return nullptr; + + // The compare constant must be the identity constant for that binop. + // If this a floating-point compare with 0.0, any zero constant will do. + Type *Ty = BO->getType(); + Constant *IdC = ConstantExpr::getBinOpIdentity(BO->getOpcode(), Ty, true); + if (IdC != C) { + if (!IdC || !CmpInst::isFPPredicate(Pred)) + return nullptr; + if (!match(IdC, m_AnyZeroFP()) || !match(C, m_AnyZeroFP())) + return nullptr; + } + + // Last, match the compare variable operand with a binop operand. + Value *Y; + if (!BO->isCommutative() && !match(BO, m_BinOp(m_Value(Y), m_Specific(X)))) + return nullptr; + if (!match(BO, m_c_BinOp(m_Value(Y), m_Specific(X)))) + return nullptr; + + // +0.0 compares equal to -0.0, and so it does not behave as required for this + // transform. Bail out if we can not exclude that possibility. + if (isa<FPMathOperator>(BO)) + if (!BO->hasNoSignedZeros() && !CannotBeNegativeZero(Y, &TLI)) + return nullptr; + + // BO = binop Y, X + // S = { select (cmp eq X, C), BO, ? } or { select (cmp ne X, C), ?, BO } + // => + // S = { select (cmp eq X, C), Y, ? } or { select (cmp ne X, C), ?, Y } + Sel.setOperand(IsEq ? 
1 : 2, Y); + return &Sel; +} + +/// This folds: +/// select (icmp eq (and X, C1)), TC, FC +/// iff C1 is a power 2 and the difference between TC and FC is a power-of-2. +/// To something like: +/// (shr (and (X, C1)), (log2(C1) - log2(TC-FC))) + FC +/// Or: +/// (shl (and (X, C1)), (log2(TC-FC) - log2(C1))) + FC +/// With some variations depending if FC is larger than TC, or the shift +/// isn't needed, or the bit widths don't match. +static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp, + InstCombiner::BuilderTy &Builder) { + const APInt *SelTC, *SelFC; + if (!match(Sel.getTrueValue(), m_APInt(SelTC)) || + !match(Sel.getFalseValue(), m_APInt(SelFC))) + return nullptr; + + // If this is a vector select, we need a vector compare. + Type *SelType = Sel.getType(); + if (SelType->isVectorTy() != Cmp->getType()->isVectorTy()) + return nullptr; + + Value *V; + APInt AndMask; + bool CreateAnd = false; + ICmpInst::Predicate Pred = Cmp->getPredicate(); + if (ICmpInst::isEquality(Pred)) { + if (!match(Cmp->getOperand(1), m_Zero())) + return nullptr; + + V = Cmp->getOperand(0); + const APInt *AndRHS; + if (!match(V, m_And(m_Value(), m_Power2(AndRHS)))) + return nullptr; + + AndMask = *AndRHS; + } else if (decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1), + Pred, V, AndMask)) { + assert(ICmpInst::isEquality(Pred) && "Not equality test?"); + if (!AndMask.isPowerOf2()) + return nullptr; + + CreateAnd = true; + } else { + return nullptr; + } + + // In general, when both constants are non-zero, we would need an offset to + // replace the select. This would require more instructions than we started + // with. But there's one special-case that we handle here because it can + // simplify/reduce the instructions. + APInt TC = *SelTC; + APInt FC = *SelFC; + if (!TC.isNullValue() && !FC.isNullValue()) { + // If the select constants differ by exactly one bit and that's the same + // bit that is masked and checked by the select condition, the select can + // be replaced by bitwise logic to set/clear one bit of the constant result. + if (TC.getBitWidth() != AndMask.getBitWidth() || (TC ^ FC) != AndMask) + return nullptr; + if (CreateAnd) { + // If we have to create an 'and', then we must kill the cmp to not + // increase the instruction count. + if (!Cmp->hasOneUse()) + return nullptr; + V = Builder.CreateAnd(V, ConstantInt::get(SelType, AndMask)); + } + bool ExtraBitInTC = TC.ugt(FC); + if (Pred == ICmpInst::ICMP_EQ) { + // If the masked bit in V is clear, clear or set the bit in the result: + // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) ^ TC + // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) | TC + Constant *C = ConstantInt::get(SelType, TC); + return ExtraBitInTC ? Builder.CreateXor(V, C) : Builder.CreateOr(V, C); + } + if (Pred == ICmpInst::ICMP_NE) { + // If the masked bit in V is set, set or clear the bit in the result: + // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) | FC + // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) ^ FC + Constant *C = ConstantInt::get(SelType, FC); + return ExtraBitInTC ? Builder.CreateOr(V, C) : Builder.CreateXor(V, C); + } + llvm_unreachable("Only expecting equality predicates"); + } + + // Make sure one of the select arms is a power-of-2. + if (!TC.isPowerOf2() && !FC.isPowerOf2()) + return nullptr; + + // Determine which shift is needed to transform result of the 'and' into the + // desired result. + const APInt &ValC = !TC.isNullValue() ? 
TC : FC; + unsigned ValZeros = ValC.logBase2(); + unsigned AndZeros = AndMask.logBase2(); + + // Insert the 'and' instruction on the input to the truncate. + if (CreateAnd) + V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), AndMask)); + + // If types don't match, we can still convert the select by introducing a zext + // or a trunc of the 'and'. + if (ValZeros > AndZeros) { + V = Builder.CreateZExtOrTrunc(V, SelType); + V = Builder.CreateShl(V, ValZeros - AndZeros); + } else if (ValZeros < AndZeros) { + V = Builder.CreateLShr(V, AndZeros - ValZeros); + V = Builder.CreateZExtOrTrunc(V, SelType); + } else { + V = Builder.CreateZExtOrTrunc(V, SelType); + } + + // Okay, now we know that everything is set up, we just don't know whether we + // have a icmp_ne or icmp_eq and whether the true or false val is the zero. + bool ShouldNotVal = !TC.isNullValue(); + ShouldNotVal ^= Pred == ICmpInst::ICMP_NE; + if (ShouldNotVal) + V = Builder.CreateXor(V, ValC); + + return V; +} + +/// We want to turn code that looks like this: +/// %C = or %A, %B +/// %D = select %cond, %C, %A +/// into: +/// %C = select %cond, %B, 0 +/// %D = or %A, %C +/// +/// Assuming that the specified instruction is an operand to the select, return +/// a bitmask indicating which operands of this instruction are foldable if they +/// equal the other incoming value of the select. +static unsigned getSelectFoldableOperands(BinaryOperator *I) { + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return 3; // Can fold through either operand. + case Instruction::Sub: // Can only fold on the amount subtracted. + case Instruction::Shl: // Can only fold on the shift amount. + case Instruction::LShr: + case Instruction::AShr: + return 1; + default: + return 0; // Cannot fold + } +} + +/// For the same transformation as the previous function, return the identity +/// constant that goes into the select. +static APInt getSelectFoldableConstant(BinaryOperator *I) { + switch (I->getOpcode()) { + default: llvm_unreachable("This cannot happen!"); + case Instruction::Add: + case Instruction::Sub: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + return APInt::getNullValue(I->getType()->getScalarSizeInBits()); + case Instruction::And: + return APInt::getAllOnesValue(I->getType()->getScalarSizeInBits()); + case Instruction::Mul: + return APInt(I->getType()->getScalarSizeInBits(), 1); + } +} + +/// We have (select c, TI, FI), and we know that TI and FI have the same opcode. +Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI, + Instruction *FI) { + // Don't break up min/max patterns. The hasOneUse checks below prevent that + // for most cases, but vector min/max with bitcasts can be transformed. If the + // one-use restrictions are eased for other patterns, we still don't want to + // obfuscate min/max. + if ((match(&SI, m_SMin(m_Value(), m_Value())) || + match(&SI, m_SMax(m_Value(), m_Value())) || + match(&SI, m_UMin(m_Value(), m_Value())) || + match(&SI, m_UMax(m_Value(), m_Value())))) + return nullptr; + + // If this is a cast from the same type, merge. + Value *Cond = SI.getCondition(); + Type *CondTy = Cond->getType(); + if (TI->getNumOperands() == 1 && TI->isCast()) { + Type *FIOpndTy = FI->getOperand(0)->getType(); + if (TI->getOperand(0)->getType() != FIOpndTy) + return nullptr; + + // The select condition may be a vector. 
We may only change the operand + // type if the vector width remains the same (and matches the condition). + if (CondTy->isVectorTy()) { + if (!FIOpndTy->isVectorTy()) + return nullptr; + if (CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements()) + return nullptr; + + // TODO: If the backend knew how to deal with casts better, we could + // remove this limitation. For now, there's too much potential to create + // worse codegen by promoting the select ahead of size-altering casts + // (PR28160). + // + // Note that ValueTracking's matchSelectPattern() looks through casts + // without checking 'hasOneUse' when it matches min/max patterns, so this + // transform may end up happening anyway. + if (TI->getOpcode() != Instruction::BitCast && + (!TI->hasOneUse() || !FI->hasOneUse())) + return nullptr; + } else if (!TI->hasOneUse() || !FI->hasOneUse()) { + // TODO: The one-use restrictions for a scalar select could be eased if + // the fold of a select in visitLoadInst() was enhanced to match a pattern + // that includes a cast. + return nullptr; + } + + // Fold this by inserting a select from the input values. + Value *NewSI = + Builder.CreateSelect(Cond, TI->getOperand(0), FI->getOperand(0), + SI.getName() + ".v", &SI); + return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI, + TI->getType()); + } + + // Cond ? -X : -Y --> -(Cond ? X : Y) + Value *X, *Y; + if (match(TI, m_FNeg(m_Value(X))) && match(FI, m_FNeg(m_Value(Y))) && + (TI->hasOneUse() || FI->hasOneUse())) { + Value *NewSel = Builder.CreateSelect(Cond, X, Y, SI.getName() + ".v", &SI); + // TODO: Remove the hack for the binop form when the unary op is optimized + // properly with all IR passes. + if (TI->getOpcode() != Instruction::FNeg) + return BinaryOperator::CreateFNegFMF(NewSel, cast<BinaryOperator>(TI)); + return UnaryOperator::CreateFNeg(NewSel); + } + + // Only handle binary operators (including two-operand getelementptr) with + // one-use here. As with the cast case above, it may be possible to relax the + // one-use constraint, but that needs be examined carefully since it may not + // reduce the total number of instructions. + if (TI->getNumOperands() != 2 || FI->getNumOperands() != 2 || + (!isa<BinaryOperator>(TI) && !isa<GetElementPtrInst>(TI)) || + !TI->hasOneUse() || !FI->hasOneUse()) + return nullptr; + + // Figure out if the operations have any operands in common. + Value *MatchOp, *OtherOpT, *OtherOpF; + bool MatchIsOpZero; + if (TI->getOperand(0) == FI->getOperand(0)) { + MatchOp = TI->getOperand(0); + OtherOpT = TI->getOperand(1); + OtherOpF = FI->getOperand(1); + MatchIsOpZero = true; + } else if (TI->getOperand(1) == FI->getOperand(1)) { + MatchOp = TI->getOperand(1); + OtherOpT = TI->getOperand(0); + OtherOpF = FI->getOperand(0); + MatchIsOpZero = false; + } else if (!TI->isCommutative()) { + return nullptr; + } else if (TI->getOperand(0) == FI->getOperand(1)) { + MatchOp = TI->getOperand(0); + OtherOpT = TI->getOperand(1); + OtherOpF = FI->getOperand(0); + MatchIsOpZero = true; + } else if (TI->getOperand(1) == FI->getOperand(0)) { + MatchOp = TI->getOperand(1); + OtherOpT = TI->getOperand(0); + OtherOpF = FI->getOperand(1); + MatchIsOpZero = true; + } else { + return nullptr; + } + + // If the select condition is a vector, the operands of the original select's + // operands also must be vectors. This may not be the case for getelementptr + // for example. 
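+  // For reference, an illustrative sketch (hypothetical IR) of the fold the
+  // rest of this function performs, here with %a as the shared operand:
+  //   %t = add i32 %a, %x
+  //   %f = add i32 %a, %y
+  //   %r = select i1 %c, i32 %t, i32 %f
+  // -->
+  //   %s = select i1 %c, i32 %x, i32 %y
+  //   %r = add i32 %a, %s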
+ if (CondTy->isVectorTy() && (!OtherOpT->getType()->isVectorTy() || + !OtherOpF->getType()->isVectorTy())) + return nullptr; + + // If we reach here, they do have operations in common. + Value *NewSI = Builder.CreateSelect(Cond, OtherOpT, OtherOpF, + SI.getName() + ".v", &SI); + Value *Op0 = MatchIsOpZero ? MatchOp : NewSI; + Value *Op1 = MatchIsOpZero ? NewSI : MatchOp; + if (auto *BO = dyn_cast<BinaryOperator>(TI)) { + BinaryOperator *NewBO = BinaryOperator::Create(BO->getOpcode(), Op0, Op1); + NewBO->copyIRFlags(TI); + NewBO->andIRFlags(FI); + return NewBO; + } + if (auto *TGEP = dyn_cast<GetElementPtrInst>(TI)) { + auto *FGEP = cast<GetElementPtrInst>(FI); + Type *ElementType = TGEP->getResultElementType(); + return TGEP->isInBounds() && FGEP->isInBounds() + ? GetElementPtrInst::CreateInBounds(ElementType, Op0, {Op1}) + : GetElementPtrInst::Create(ElementType, Op0, {Op1}); + } + llvm_unreachable("Expected BinaryOperator or GEP"); + return nullptr; +} + +static bool isSelect01(const APInt &C1I, const APInt &C2I) { + if (!C1I.isNullValue() && !C2I.isNullValue()) // One side must be zero. + return false; + return C1I.isOneValue() || C1I.isAllOnesValue() || + C2I.isOneValue() || C2I.isAllOnesValue(); +} + +/// Try to fold the select into one of the operands to allow further +/// optimization. +Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, + Value *FalseVal) { + // See the comment above GetSelectFoldableOperands for a description of the + // transformation we are doing here. + if (auto *TVI = dyn_cast<BinaryOperator>(TrueVal)) { + if (TVI->hasOneUse() && !isa<Constant>(FalseVal)) { + if (unsigned SFO = getSelectFoldableOperands(TVI)) { + unsigned OpToFold = 0; + if ((SFO & 1) && FalseVal == TVI->getOperand(0)) { + OpToFold = 1; + } else if ((SFO & 2) && FalseVal == TVI->getOperand(1)) { + OpToFold = 2; + } + + if (OpToFold) { + APInt CI = getSelectFoldableConstant(TVI); + Value *OOp = TVI->getOperand(2-OpToFold); + // Avoid creating select between 2 constants unless it's selecting + // between 0, 1 and -1. + const APInt *OOpC; + bool OOpIsAPInt = match(OOp, m_APInt(OOpC)); + if (!isa<Constant>(OOp) || (OOpIsAPInt && isSelect01(CI, *OOpC))) { + Value *C = ConstantInt::get(OOp->getType(), CI); + Value *NewSel = Builder.CreateSelect(SI.getCondition(), OOp, C); + NewSel->takeName(TVI); + BinaryOperator *BO = BinaryOperator::Create(TVI->getOpcode(), + FalseVal, NewSel); + BO->copyIRFlags(TVI); + return BO; + } + } + } + } + } + + if (auto *FVI = dyn_cast<BinaryOperator>(FalseVal)) { + if (FVI->hasOneUse() && !isa<Constant>(TrueVal)) { + if (unsigned SFO = getSelectFoldableOperands(FVI)) { + unsigned OpToFold = 0; + if ((SFO & 1) && TrueVal == FVI->getOperand(0)) { + OpToFold = 1; + } else if ((SFO & 2) && TrueVal == FVI->getOperand(1)) { + OpToFold = 2; + } + + if (OpToFold) { + APInt CI = getSelectFoldableConstant(FVI); + Value *OOp = FVI->getOperand(2-OpToFold); + // Avoid creating select between 2 constants unless it's selecting + // between 0, 1 and -1. 
+ const APInt *OOpC; + bool OOpIsAPInt = match(OOp, m_APInt(OOpC)); + if (!isa<Constant>(OOp) || (OOpIsAPInt && isSelect01(CI, *OOpC))) { + Value *C = ConstantInt::get(OOp->getType(), CI); + Value *NewSel = Builder.CreateSelect(SI.getCondition(), C, OOp); + NewSel->takeName(FVI); + BinaryOperator *BO = BinaryOperator::Create(FVI->getOpcode(), + TrueVal, NewSel); + BO->copyIRFlags(FVI); + return BO; + } + } + } + } + } + + return nullptr; +} + +/// We want to turn: +/// (select (icmp eq (and X, Y), 0), (and (lshr X, Z), 1), 1) +/// into: +/// zext (icmp ne i32 (and X, (or Y, (shl 1, Z))), 0) +/// Note: +/// Z may be 0 if lshr is missing. +/// Worst-case scenario is that we will replace 5 instructions with 5 different +/// instructions, but we got rid of select. +static Instruction *foldSelectICmpAndAnd(Type *SelType, const ICmpInst *Cmp, + Value *TVal, Value *FVal, + InstCombiner::BuilderTy &Builder) { + if (!(Cmp->hasOneUse() && Cmp->getOperand(0)->hasOneUse() && + Cmp->getPredicate() == ICmpInst::ICMP_EQ && + match(Cmp->getOperand(1), m_Zero()) && match(FVal, m_One()))) + return nullptr; + + // The TrueVal has general form of: and %B, 1 + Value *B; + if (!match(TVal, m_OneUse(m_And(m_Value(B), m_One())))) + return nullptr; + + // Where %B may be optionally shifted: lshr %X, %Z. + Value *X, *Z; + const bool HasShift = match(B, m_OneUse(m_LShr(m_Value(X), m_Value(Z)))); + if (!HasShift) + X = B; + + Value *Y; + if (!match(Cmp->getOperand(0), m_c_And(m_Specific(X), m_Value(Y)))) + return nullptr; + + // ((X & Y) == 0) ? ((X >> Z) & 1) : 1 --> (X & (Y | (1 << Z))) != 0 + // ((X & Y) == 0) ? (X & 1) : 1 --> (X & (Y | 1)) != 0 + Constant *One = ConstantInt::get(SelType, 1); + Value *MaskB = HasShift ? Builder.CreateShl(One, Z) : One; + Value *FullMask = Builder.CreateOr(Y, MaskB); + Value *MaskedX = Builder.CreateAnd(X, FullMask); + Value *ICmpNeZero = Builder.CreateIsNotNull(MaskedX); + return new ZExtInst(ICmpNeZero, SelType); +} + +/// We want to turn: +/// (select (icmp sgt x, C), lshr (X, Y), ashr (X, Y)); iff C s>= -1 +/// (select (icmp slt x, C), ashr (X, Y), lshr (X, Y)); iff C s>= 0 +/// into: +/// ashr (X, Y) +static Value *foldSelectICmpLshrAshr(const ICmpInst *IC, Value *TrueVal, + Value *FalseVal, + InstCombiner::BuilderTy &Builder) { + ICmpInst::Predicate Pred = IC->getPredicate(); + Value *CmpLHS = IC->getOperand(0); + Value *CmpRHS = IC->getOperand(1); + if (!CmpRHS->getType()->isIntOrIntVectorTy()) + return nullptr; + + Value *X, *Y; + unsigned Bitwidth = CmpRHS->getType()->getScalarSizeInBits(); + if ((Pred != ICmpInst::ICMP_SGT || + !match(CmpRHS, + m_SpecificInt_ICMP(ICmpInst::ICMP_SGE, APInt(Bitwidth, -1)))) && + (Pred != ICmpInst::ICMP_SLT || + !match(CmpRHS, + m_SpecificInt_ICMP(ICmpInst::ICMP_SGE, APInt(Bitwidth, 0))))) + return nullptr; + + // Canonicalize so that ashr is in FalseVal. + if (Pred == ICmpInst::ICMP_SLT) + std::swap(TrueVal, FalseVal); + + if (match(TrueVal, m_LShr(m_Value(X), m_Value(Y))) && + match(FalseVal, m_AShr(m_Specific(X), m_Specific(Y))) && + match(CmpLHS, m_Specific(X))) { + const auto *Ashr = cast<Instruction>(FalseVal); + // if lshr is not exact and ashr is, this new ashr must not be exact. 
+ bool IsExact = Ashr->isExact() && cast<Instruction>(TrueVal)->isExact(); + return Builder.CreateAShr(X, Y, IC->getName(), IsExact); + } + + return nullptr; +} + +/// We want to turn: +/// (select (icmp eq (and X, C1), 0), Y, (or Y, C2)) +/// into: +/// (or (shl (and X, C1), C3), Y) +/// iff: +/// C1 and C2 are both powers of 2 +/// where: +/// C3 = Log(C2) - Log(C1) +/// +/// This transform handles cases where: +/// 1. The icmp predicate is inverted +/// 2. The select operands are reversed +/// 3. The magnitude of C2 and C1 are flipped +static Value *foldSelectICmpAndOr(const ICmpInst *IC, Value *TrueVal, + Value *FalseVal, + InstCombiner::BuilderTy &Builder) { + // Only handle integer compares. Also, if this is a vector select, we need a + // vector compare. + if (!TrueVal->getType()->isIntOrIntVectorTy() || + TrueVal->getType()->isVectorTy() != IC->getType()->isVectorTy()) + return nullptr; + + Value *CmpLHS = IC->getOperand(0); + Value *CmpRHS = IC->getOperand(1); + + Value *V; + unsigned C1Log; + bool IsEqualZero; + bool NeedAnd = false; + if (IC->isEquality()) { + if (!match(CmpRHS, m_Zero())) + return nullptr; + + const APInt *C1; + if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1)))) + return nullptr; + + V = CmpLHS; + C1Log = C1->logBase2(); + IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_EQ; + } else if (IC->getPredicate() == ICmpInst::ICMP_SLT || + IC->getPredicate() == ICmpInst::ICMP_SGT) { + // We also need to recognize (icmp slt (trunc (X)), 0) and + // (icmp sgt (trunc (X)), -1). + IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_SGT; + if ((IsEqualZero && !match(CmpRHS, m_AllOnes())) || + (!IsEqualZero && !match(CmpRHS, m_Zero()))) + return nullptr; + + if (!match(CmpLHS, m_OneUse(m_Trunc(m_Value(V))))) + return nullptr; + + C1Log = CmpLHS->getType()->getScalarSizeInBits() - 1; + NeedAnd = true; + } else { + return nullptr; + } + + const APInt *C2; + bool OrOnTrueVal = false; + bool OrOnFalseVal = match(FalseVal, m_Or(m_Specific(TrueVal), m_Power2(C2))); + if (!OrOnFalseVal) + OrOnTrueVal = match(TrueVal, m_Or(m_Specific(FalseVal), m_Power2(C2))); + + if (!OrOnFalseVal && !OrOnTrueVal) + return nullptr; + + Value *Y = OrOnFalseVal ? TrueVal : FalseVal; + + unsigned C2Log = C2->logBase2(); + + bool NeedXor = (!IsEqualZero && OrOnFalseVal) || (IsEqualZero && OrOnTrueVal); + bool NeedShift = C1Log != C2Log; + bool NeedZExtTrunc = Y->getType()->getScalarSizeInBits() != + V->getType()->getScalarSizeInBits(); + + // Make sure we don't create more instructions than we save. + Value *Or = OrOnFalseVal ? FalseVal : TrueVal; + if ((NeedShift + NeedXor + NeedZExtTrunc) > + (IC->hasOneUse() + Or->hasOneUse())) + return nullptr; + + if (NeedAnd) { + // Insert the AND instruction on the input to the truncate. + APInt C1 = APInt::getOneBitSet(V->getType()->getScalarSizeInBits(), C1Log); + V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), C1)); + } + + if (C2Log > C1Log) { + V = Builder.CreateZExtOrTrunc(V, Y->getType()); + V = Builder.CreateShl(V, C2Log - C1Log); + } else if (C1Log > C2Log) { + V = Builder.CreateLShr(V, C1Log - C2Log); + V = Builder.CreateZExtOrTrunc(V, Y->getType()); + } else + V = Builder.CreateZExtOrTrunc(V, Y->getType()); + + if (NeedXor) + V = Builder.CreateXor(V, *C2); + + return Builder.CreateOr(V, Y); +} + +/// Transform patterns such as (a > b) ? a - b : 0 into usub.sat(a, b). +/// There are 8 commuted/swapped variants of this pattern. +/// TODO: Also support a - UMIN(a,b) patterns. 
+static Value *canonicalizeSaturatedSubtract(const ICmpInst *ICI, + const Value *TrueVal, + const Value *FalseVal, + InstCombiner::BuilderTy &Builder) { + ICmpInst::Predicate Pred = ICI->getPredicate(); + if (!ICmpInst::isUnsigned(Pred)) + return nullptr; + + // (b > a) ? 0 : a - b -> (b <= a) ? a - b : 0 + if (match(TrueVal, m_Zero())) { + Pred = ICmpInst::getInversePredicate(Pred); + std::swap(TrueVal, FalseVal); + } + if (!match(FalseVal, m_Zero())) + return nullptr; + + Value *A = ICI->getOperand(0); + Value *B = ICI->getOperand(1); + if (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_ULT) { + // (b < a) ? a - b : 0 -> (a > b) ? a - b : 0 + std::swap(A, B); + Pred = ICmpInst::getSwappedPredicate(Pred); + } + + assert((Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_UGT) && + "Unexpected isUnsigned predicate!"); + + // Account for swapped form of subtraction: ((a > b) ? b - a : 0). + bool IsNegative = false; + if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A)))) + IsNegative = true; + else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B)))) + return nullptr; + + // If sub is used anywhere else, we wouldn't be able to eliminate it + // afterwards. + if (!TrueVal->hasOneUse()) + return nullptr; + + // (a > b) ? a - b : 0 -> usub.sat(a, b) + // (a > b) ? b - a : 0 -> -usub.sat(a, b) + Value *Result = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, A, B); + if (IsNegative) + Result = Builder.CreateNeg(Result); + return Result; +} + +static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal, + InstCombiner::BuilderTy &Builder) { + if (!Cmp->hasOneUse()) + return nullptr; + + // Match unsigned saturated add with constant. + Value *Cmp0 = Cmp->getOperand(0); + Value *Cmp1 = Cmp->getOperand(1); + ICmpInst::Predicate Pred = Cmp->getPredicate(); + Value *X; + const APInt *C, *CmpC; + if (Pred == ICmpInst::ICMP_ULT && + match(TVal, m_Add(m_Value(X), m_APInt(C))) && X == Cmp0 && + match(FVal, m_AllOnes()) && match(Cmp1, m_APInt(CmpC)) && *CmpC == ~*C) { + // (X u< ~C) ? (X + C) : -1 --> uadd.sat(X, C) + return Builder.CreateBinaryIntrinsic( + Intrinsic::uadd_sat, X, ConstantInt::get(X->getType(), *C)); + } + + // Match unsigned saturated add of 2 variables with an unnecessary 'not'. + // There are 8 commuted variants. + // Canonicalize -1 (saturated result) to true value of the select. Just + // swapping the compare operands is legal, because the selected value is the + // same in case of equality, so we can interchange u< and u<=. + if (match(FVal, m_AllOnes())) { + std::swap(TVal, FVal); + std::swap(Cmp0, Cmp1); + } + if (!match(TVal, m_AllOnes())) + return nullptr; + + // Canonicalize predicate to 'ULT'. + if (Pred == ICmpInst::ICMP_UGT) { + Pred = ICmpInst::ICMP_ULT; + std::swap(Cmp0, Cmp1); + } + if (Pred != ICmpInst::ICMP_ULT) + return nullptr; + + // Match unsigned saturated add of 2 variables with an unnecessary 'not'. + Value *Y; + if (match(Cmp0, m_Not(m_Value(X))) && + match(FVal, m_c_Add(m_Specific(X), m_Value(Y))) && Y == Cmp1) { + // (~X u< Y) ? -1 : (X + Y) --> uadd.sat(X, Y) + // (~X u< Y) ? -1 : (Y + X) --> uadd.sat(X, Y) + return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, X, Y); + } + // The 'not' op may be included in the sum but not the compare. + X = Cmp0; + Y = Cmp1; + if (match(FVal, m_c_Add(m_Not(m_Specific(X)), m_Specific(Y)))) { + // (X u< Y) ? -1 : (~X + Y) --> uadd.sat(~X, Y) + // (X u< Y) ? 
-1 : (Y + ~X) --> uadd.sat(Y, ~X) + BinaryOperator *BO = cast<BinaryOperator>(FVal); + return Builder.CreateBinaryIntrinsic( + Intrinsic::uadd_sat, BO->getOperand(0), BO->getOperand(1)); + } + + return nullptr; +} + +/// Fold the following code sequence: +/// \code +/// int a = ctlz(x & -x); +// x ? 31 - a : a; +/// \code +/// +/// into: +/// cttz(x) +static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal, + Value *FalseVal, + InstCombiner::BuilderTy &Builder) { + unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits(); + if (!ICI->isEquality() || !match(ICI->getOperand(1), m_Zero())) + return nullptr; + + if (ICI->getPredicate() == ICmpInst::ICMP_NE) + std::swap(TrueVal, FalseVal); + + if (!match(FalseVal, + m_Xor(m_Deferred(TrueVal), m_SpecificInt(BitWidth - 1)))) + return nullptr; + + if (!match(TrueVal, m_Intrinsic<Intrinsic::ctlz>())) + return nullptr; + + Value *X = ICI->getOperand(0); + auto *II = cast<IntrinsicInst>(TrueVal); + if (!match(II->getOperand(0), m_c_And(m_Specific(X), m_Neg(m_Specific(X))))) + return nullptr; + + Function *F = Intrinsic::getDeclaration(II->getModule(), Intrinsic::cttz, + II->getType()); + return CallInst::Create(F, {X, II->getArgOperand(1)}); +} + +/// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single +/// call to cttz/ctlz with flag 'is_zero_undef' cleared. +/// +/// For example, we can fold the following code sequence: +/// \code +/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true) +/// %1 = icmp ne i32 %x, 0 +/// %2 = select i1 %1, i32 %0, i32 32 +/// \code +/// +/// into: +/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false) +static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, + InstCombiner::BuilderTy &Builder) { + ICmpInst::Predicate Pred = ICI->getPredicate(); + Value *CmpLHS = ICI->getOperand(0); + Value *CmpRHS = ICI->getOperand(1); + + // Check if the condition value compares a value for equality against zero. + if (!ICI->isEquality() || !match(CmpRHS, m_Zero())) + return nullptr; + + Value *Count = FalseVal; + Value *ValueOnZero = TrueVal; + if (Pred == ICmpInst::ICMP_NE) + std::swap(Count, ValueOnZero); + + // Skip zero extend/truncate. + Value *V = nullptr; + if (match(Count, m_ZExt(m_Value(V))) || + match(Count, m_Trunc(m_Value(V)))) + Count = V; + + // Check that 'Count' is a call to intrinsic cttz/ctlz. Also check that the + // input to the cttz/ctlz is used as LHS for the compare instruction. + if (!match(Count, m_Intrinsic<Intrinsic::cttz>(m_Specific(CmpLHS))) && + !match(Count, m_Intrinsic<Intrinsic::ctlz>(m_Specific(CmpLHS)))) + return nullptr; + + IntrinsicInst *II = cast<IntrinsicInst>(Count); + + // Check if the value propagated on zero is a constant number equal to the + // sizeof in bits of 'Count'. + unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits(); + if (match(ValueOnZero, m_SpecificInt(SizeOfInBits))) { + // Explicitly clear the 'undef_on_zero' flag. + IntrinsicInst *NewI = cast<IntrinsicInst>(II->clone()); + NewI->setArgOperand(1, ConstantInt::getFalse(NewI->getContext())); + Builder.Insert(NewI); + return Builder.CreateZExtOrTrunc(NewI, ValueOnZero->getType()); + } + + // If the ValueOnZero is not the bitwidth, we can at least make use of the + // fact that the cttz/ctlz result will not be used if the input is zero, so + // it's okay to relax it to undef for that case. 
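+  // Illustration only (hypothetical IR): in
+  //   %z = call i32 @llvm.cttz.i32(i32 %x, i1 false)
+  //   %c = icmp eq i32 %x, 0
+  //   %r = select i1 %c, i32 -1, i32 %z
+  // the cttz result is unused when %x is zero, so its flag may be set to true.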
+ if (II->hasOneUse() && !match(II->getArgOperand(1), m_One())) + II->setArgOperand(1, ConstantInt::getTrue(II->getContext())); + + return nullptr; +} + +/// Return true if we find and adjust an icmp+select pattern where the compare +/// is with a constant that can be incremented or decremented to match the +/// minimum or maximum idiom. +static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) { + ICmpInst::Predicate Pred = Cmp.getPredicate(); + Value *CmpLHS = Cmp.getOperand(0); + Value *CmpRHS = Cmp.getOperand(1); + Value *TrueVal = Sel.getTrueValue(); + Value *FalseVal = Sel.getFalseValue(); + + // We may move or edit the compare, so make sure the select is the only user. + const APInt *CmpC; + if (!Cmp.hasOneUse() || !match(CmpRHS, m_APInt(CmpC))) + return false; + + // These transforms only work for selects of integers or vector selects of + // integer vectors. + Type *SelTy = Sel.getType(); + auto *SelEltTy = dyn_cast<IntegerType>(SelTy->getScalarType()); + if (!SelEltTy || SelTy->isVectorTy() != Cmp.getType()->isVectorTy()) + return false; + + Constant *AdjustedRHS; + if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_SGT) + AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC + 1); + else if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT) + AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC - 1); + else + return false; + + // X > C ? X : C+1 --> X < C+1 ? C+1 : X + // X < C ? X : C-1 --> X > C-1 ? C-1 : X + if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) || + (CmpLHS == FalseVal && AdjustedRHS == TrueVal)) { + ; // Nothing to do here. Values match without any sign/zero extension. + } + // Types do not match. Instead of calculating this with mixed types, promote + // all to the larger type. This enables scalar evolution to analyze this + // expression. + else if (CmpRHS->getType()->getScalarSizeInBits() < SelEltTy->getBitWidth()) { + Constant *SextRHS = ConstantExpr::getSExt(AdjustedRHS, SelTy); + + // X = sext x; x >s c ? X : C+1 --> X = sext x; X <s C+1 ? C+1 : X + // X = sext x; x <s c ? X : C-1 --> X = sext x; X >s C-1 ? C-1 : X + // X = sext x; x >u c ? X : C+1 --> X = sext x; X <u C+1 ? C+1 : X + // X = sext x; x <u c ? X : C-1 --> X = sext x; X >u C-1 ? C-1 : X + if (match(TrueVal, m_SExt(m_Specific(CmpLHS))) && SextRHS == FalseVal) { + CmpLHS = TrueVal; + AdjustedRHS = SextRHS; + } else if (match(FalseVal, m_SExt(m_Specific(CmpLHS))) && + SextRHS == TrueVal) { + CmpLHS = FalseVal; + AdjustedRHS = SextRHS; + } else if (Cmp.isUnsigned()) { + Constant *ZextRHS = ConstantExpr::getZExt(AdjustedRHS, SelTy); + // X = zext x; x >u c ? X : C+1 --> X = zext x; X <u C+1 ? C+1 : X + // X = zext x; x <u c ? X : C-1 --> X = zext x; X >u C-1 ? C-1 : X + // zext + signed compare cannot be changed: + // 0xff <s 0x00, but 0x00ff >s 0x0000 + if (match(TrueVal, m_ZExt(m_Specific(CmpLHS))) && ZextRHS == FalseVal) { + CmpLHS = TrueVal; + AdjustedRHS = ZextRHS; + } else if (match(FalseVal, m_ZExt(m_Specific(CmpLHS))) && + ZextRHS == TrueVal) { + CmpLHS = FalseVal; + AdjustedRHS = ZextRHS; + } else { + return false; + } + } else { + return false; + } + } else { + return false; + } + + Pred = ICmpInst::getSwappedPredicate(Pred); + CmpRHS = AdjustedRHS; + std::swap(FalseVal, TrueVal); + Cmp.setPredicate(Pred); + Cmp.setOperand(0, CmpLHS); + Cmp.setOperand(1, CmpRHS); + Sel.setOperand(1, TrueVal); + Sel.setOperand(2, FalseVal); + Sel.swapProfMetadata(); + + // Move the compare instruction right before the select instruction. 
Otherwise + // the sext/zext value may be defined after the compare instruction uses it. + Cmp.moveBefore(&Sel); + + return true; +} + +/// If this is an integer min/max (icmp + select) with a constant operand, +/// create the canonical icmp for the min/max operation and canonicalize the +/// constant to the 'false' operand of the select: +/// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2 +/// Note: if C1 != C2, this will change the icmp constant to the existing +/// constant operand of the select. +static Instruction * +canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp, + InstCombiner::BuilderTy &Builder) { + if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1))) + return nullptr; + + // Canonicalize the compare predicate based on whether we have min or max. + Value *LHS, *RHS; + SelectPatternResult SPR = matchSelectPattern(&Sel, LHS, RHS); + if (!SelectPatternResult::isMinOrMax(SPR.Flavor)) + return nullptr; + + // Is this already canonical? + ICmpInst::Predicate CanonicalPred = getMinMaxPred(SPR.Flavor); + if (Cmp.getOperand(0) == LHS && Cmp.getOperand(1) == RHS && + Cmp.getPredicate() == CanonicalPred) + return nullptr; + + // Create the canonical compare and plug it into the select. + Sel.setCondition(Builder.CreateICmp(CanonicalPred, LHS, RHS)); + + // If the select operands did not change, we're done. + if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS) + return &Sel; + + // If we are swapping the select operands, swap the metadata too. + assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS && + "Unexpected results from matchSelectPattern"); + Sel.swapValues(); + Sel.swapProfMetadata(); + return &Sel; +} + +/// There are many select variants for each of ABS/NABS. +/// In matchSelectPattern(), there are different compare constants, compare +/// predicates/operands and select operands. +/// In isKnownNegation(), there are different formats of negated operands. +/// Canonicalize all these variants to 1 pattern. +/// This makes CSE more likely. +static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp, + InstCombiner::BuilderTy &Builder) { + if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1))) + return nullptr; + + // Choose a sign-bit check for the compare (likely simpler for codegen). + // ABS: (X <s 0) ? -X : X + // NABS: (X <s 0) ? X : -X + Value *LHS, *RHS; + SelectPatternFlavor SPF = matchSelectPattern(&Sel, LHS, RHS).Flavor; + if (SPF != SelectPatternFlavor::SPF_ABS && + SPF != SelectPatternFlavor::SPF_NABS) + return nullptr; + + Value *TVal = Sel.getTrueValue(); + Value *FVal = Sel.getFalseValue(); + assert(isKnownNegation(TVal, FVal) && + "Unexpected result from matchSelectPattern"); + + // The compare may use the negated abs()/nabs() operand, or it may use + // negation in non-canonical form such as: sub A, B. + bool CmpUsesNegatedOp = match(Cmp.getOperand(0), m_Neg(m_Specific(TVal))) || + match(Cmp.getOperand(0), m_Neg(m_Specific(FVal))); + + bool CmpCanonicalized = !CmpUsesNegatedOp && + match(Cmp.getOperand(1), m_ZeroInt()) && + Cmp.getPredicate() == ICmpInst::ICMP_SLT; + bool RHSCanonicalized = match(RHS, m_Neg(m_Specific(LHS))); + + // Is this already canonical? + if (CmpCanonicalized && RHSCanonicalized) + return nullptr; + + // If RHS is used by other instructions except compare and select, don't + // canonicalize it to not increase the instruction count. + if (!(RHS->hasOneUse() || (RHS->hasNUses(2) && CmpUsesNegatedOp))) + return nullptr; + + // Create the canonical compare: icmp slt LHS 0. 
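+  // Illustration only (hypothetical IR): for ABS, the fully canonical form
+  // built here is
+  //   %c = icmp slt i32 %x, 0
+  //   %n = sub i32 0, %x
+  //   %r = select i1 %c, i32 %n, i32 %x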
+ if (!CmpCanonicalized) { + Cmp.setPredicate(ICmpInst::ICMP_SLT); + Cmp.setOperand(1, ConstantInt::getNullValue(Cmp.getOperand(0)->getType())); + if (CmpUsesNegatedOp) + Cmp.setOperand(0, LHS); + } + + // Create the canonical RHS: RHS = sub (0, LHS). + if (!RHSCanonicalized) { + assert(RHS->hasOneUse() && "RHS use number is not right"); + RHS = Builder.CreateNeg(LHS); + if (TVal == LHS) { + Sel.setFalseValue(RHS); + FVal = RHS; + } else { + Sel.setTrueValue(RHS); + TVal = RHS; + } + } + + // If the select operands do not change, we're done. + if (SPF == SelectPatternFlavor::SPF_NABS) { + if (TVal == LHS) + return &Sel; + assert(FVal == LHS && "Unexpected results from matchSelectPattern"); + } else { + if (FVal == LHS) + return &Sel; + assert(TVal == LHS && "Unexpected results from matchSelectPattern"); + } + + // We are swapping the select operands, so swap the metadata too. + Sel.swapValues(); + Sel.swapProfMetadata(); + return &Sel; +} + +static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *ReplaceOp, + const SimplifyQuery &Q) { + // If this is a binary operator, try to simplify it with the replaced op + // because we know Op and ReplaceOp are equivalant. + // For example: V = X + 1, Op = X, ReplaceOp = 42 + // Simplifies as: add(42, 1) --> 43 + if (auto *BO = dyn_cast<BinaryOperator>(V)) { + if (BO->getOperand(0) == Op) + return SimplifyBinOp(BO->getOpcode(), ReplaceOp, BO->getOperand(1), Q); + if (BO->getOperand(1) == Op) + return SimplifyBinOp(BO->getOpcode(), BO->getOperand(0), ReplaceOp, Q); + } + + return nullptr; +} + +/// If we have a select with an equality comparison, then we know the value in +/// one of the arms of the select. See if substituting this value into an arm +/// and simplifying the result yields the same value as the other arm. +/// +/// To make this transform safe, we must drop poison-generating flags +/// (nsw, etc) if we simplified to a binop because the select may be guarding +/// that poison from propagating. If the existing binop already had no +/// poison-generating flags, then this transform can be done by instsimplify. +/// +/// Consider: +/// %cmp = icmp eq i32 %x, 2147483647 +/// %add = add nsw i32 %x, 1 +/// %sel = select i1 %cmp, i32 -2147483648, i32 %add +/// +/// We can't replace %sel with %add unless we strip away the flags. +/// TODO: Wrapping flags could be preserved in some cases with better analysis. +static Value *foldSelectValueEquivalence(SelectInst &Sel, ICmpInst &Cmp, + const SimplifyQuery &Q) { + if (!Cmp.isEquality()) + return nullptr; + + // Canonicalize the pattern to ICMP_EQ by swapping the select operands. + Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue(); + if (Cmp.getPredicate() == ICmpInst::ICMP_NE) + std::swap(TrueVal, FalseVal); + + // Try each equivalence substitution possibility. + // We have an 'EQ' comparison, so the select's false value will propagate. + // Example: + // (X == 42) ? 43 : (X + 1) --> (X == 42) ? (X + 1) : (X + 1) --> X + 1 + // (X == 42) ? (X + 1) : 43 --> (X == 42) ? 
(42 + 1) : 43 --> 43 + Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1); + if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q) == TrueVal || + simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q) == TrueVal || + simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q) == FalseVal || + simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q) == FalseVal) { + if (auto *FalseInst = dyn_cast<Instruction>(FalseVal)) + FalseInst->dropPoisonGeneratingFlags(); + return FalseVal; + } + return nullptr; +} + +// See if this is a pattern like: +// %old_cmp1 = icmp slt i32 %x, C2 +// %old_replacement = select i1 %old_cmp1, i32 %target_low, i32 %target_high +// %old_x_offseted = add i32 %x, C1 +// %old_cmp0 = icmp ult i32 %old_x_offseted, C0 +// %r = select i1 %old_cmp0, i32 %x, i32 %old_replacement +// This can be rewritten as more canonical pattern: +// %new_cmp1 = icmp slt i32 %x, -C1 +// %new_cmp2 = icmp sge i32 %x, C0-C1 +// %new_clamped_low = select i1 %new_cmp1, i32 %target_low, i32 %x +// %r = select i1 %new_cmp2, i32 %target_high, i32 %new_clamped_low +// Iff -C1 s<= C2 s<= C0-C1 +// Also ULT predicate can also be UGT iff C0 != -1 (+invert result) +// SLT predicate can also be SGT iff C2 != INT_MAX (+invert res.) +static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0, + InstCombiner::BuilderTy &Builder) { + Value *X = Sel0.getTrueValue(); + Value *Sel1 = Sel0.getFalseValue(); + + // First match the condition of the outermost select. + // Said condition must be one-use. + if (!Cmp0.hasOneUse()) + return nullptr; + Value *Cmp00 = Cmp0.getOperand(0); + Constant *C0; + if (!match(Cmp0.getOperand(1), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0)))) + return nullptr; + // Canonicalize Cmp0 into the form we expect. + // FIXME: we shouldn't care about lanes that are 'undef' in the end? + switch (Cmp0.getPredicate()) { + case ICmpInst::Predicate::ICMP_ULT: + break; // Great! + case ICmpInst::Predicate::ICMP_ULE: + // We'd have to increment C0 by one, and for that it must not have all-ones + // element, but then it would have been canonicalized to 'ult' before + // we get here. So we can't do anything useful with 'ule'. + return nullptr; + case ICmpInst::Predicate::ICMP_UGT: + // We want to canonicalize it to 'ult', so we'll need to increment C0, + // which again means it must not have any all-ones elements. + if (!match(C0, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getAllOnesValue( + C0->getType()->getScalarSizeInBits())))) + return nullptr; // Can't do, have all-ones element[s]. + C0 = AddOne(C0); + std::swap(X, Sel1); + break; + case ICmpInst::Predicate::ICMP_UGE: + // The only way we'd get this predicate if this `icmp` has extra uses, + // but then we won't be able to do this fold. + return nullptr; + default: + return nullptr; // Unknown predicate. + } + + // Now that we've canonicalized the ICmp, we know the X we expect; + // the select in other hand should be one-use. + if (!Sel1->hasOneUse()) + return nullptr; + + // We now can finish matching the condition of the outermost select: + // it should either be the X itself, or an addition of some constant to X. 
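+  // Illustration only (hypothetical IR, constants made up): with C1 = 16,
+  // C0 = 32 and C2 = -16 the matched pattern is
+  //   %cmp1  = icmp slt i32 %x, -16
+  //   %inner = select i1 %cmp1, i32 %lo, i32 %hi
+  //   %off   = add i32 %x, 16
+  //   %cmp0  = icmp ult i32 %off, 32
+  //   %r     = select i1 %cmp0, i32 %x, i32 %inner
+  // i.e. %x passes through on [-16, 16) and is clamped to %lo / %hi outside.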
+ Constant *C1; + if (Cmp00 == X) + C1 = ConstantInt::getNullValue(Sel0.getType()); + else if (!match(Cmp00, + m_Add(m_Specific(X), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C1))))) + return nullptr; + + Value *Cmp1; + ICmpInst::Predicate Pred1; + Constant *C2; + Value *ReplacementLow, *ReplacementHigh; + if (!match(Sel1, m_Select(m_Value(Cmp1), m_Value(ReplacementLow), + m_Value(ReplacementHigh))) || + !match(Cmp1, + m_ICmp(Pred1, m_Specific(X), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C2))))) + return nullptr; + + if (!Cmp1->hasOneUse() && (Cmp00 == X || !Cmp00->hasOneUse())) + return nullptr; // Not enough one-use instructions for the fold. + // FIXME: this restriction could be relaxed if Cmp1 can be reused as one of + // two comparisons we'll need to build. + + // Canonicalize Cmp1 into the form we expect. + // FIXME: we shouldn't care about lanes that are 'undef' in the end? + switch (Pred1) { + case ICmpInst::Predicate::ICMP_SLT: + break; + case ICmpInst::Predicate::ICMP_SLE: + // We'd have to increment C2 by one, and for that it must not have signed + // max element, but then it would have been canonicalized to 'slt' before + // we get here. So we can't do anything useful with 'sle'. + return nullptr; + case ICmpInst::Predicate::ICMP_SGT: + // We want to canonicalize it to 'slt', so we'll need to increment C2, + // which again means it must not have any signed max elements. + if (!match(C2, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getSignedMaxValue( + C2->getType()->getScalarSizeInBits())))) + return nullptr; // Can't do, have signed max element[s]. + C2 = AddOne(C2); + LLVM_FALLTHROUGH; + case ICmpInst::Predicate::ICMP_SGE: + // Also non-canonical, but here we don't need to change C2, + // so we don't have any restrictions on C2, so we can just handle it. + std::swap(ReplacementLow, ReplacementHigh); + break; + default: + return nullptr; // Unknown predicate. + } + + // The thresholds of this clamp-like pattern. + auto *ThresholdLowIncl = ConstantExpr::getNeg(C1); + auto *ThresholdHighExcl = ConstantExpr::getSub(C0, C1); + + // The fold has a precondition 1: C2 s>= ThresholdLow + auto *Precond1 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SGE, C2, + ThresholdLowIncl); + if (!match(Precond1, m_One())) + return nullptr; + // The fold has a precondition 2: C2 s<= ThresholdHigh + auto *Precond2 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SLE, C2, + ThresholdHighExcl); + if (!match(Precond2, m_One())) + return nullptr; + + // All good, finally emit the new pattern. + Value *ShouldReplaceLow = Builder.CreateICmpSLT(X, ThresholdLowIncl); + Value *ShouldReplaceHigh = Builder.CreateICmpSGE(X, ThresholdHighExcl); + Value *MaybeReplacedLow = + Builder.CreateSelect(ShouldReplaceLow, ReplacementLow, X); + Instruction *MaybeReplacedHigh = + SelectInst::Create(ShouldReplaceHigh, ReplacementHigh, MaybeReplacedLow); + + return MaybeReplacedHigh; +} + +// If we have +// %cmp = icmp [canonical predicate] i32 %x, C0 +// %r = select i1 %cmp, i32 %y, i32 C1 +// Where C0 != C1 and %x may be different from %y, see if the constant that we +// will have if we flip the strictness of the predicate (i.e. without changing +// the result) is identical to the C1 in select. If it matches we can change +// original comparison to one with swapped predicate, reuse the constant, +// and swap the hands of select. 
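+// For illustration (hypothetical IR):
+//   %cmp = icmp ult i32 %x, 8
+//   %r   = select i1 %cmp, i32 %y, i32 7
+// can become
+//   %cmp.inv = icmp ugt i32 %x, 7
+//   %r       = select i1 %cmp.inv, i32 7, i32 %y
+// so the compare reuses the constant 7 that the select already contains.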
+static Instruction * +tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, + InstCombiner::BuilderTy &Builder) { + ICmpInst::Predicate Pred; + Value *X; + Constant *C0; + if (!match(&Cmp, m_OneUse(m_ICmp( + Pred, m_Value(X), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0)))))) + return nullptr; + + // If comparison predicate is non-relational, we won't be able to do anything. + if (ICmpInst::isEquality(Pred)) + return nullptr; + + // If comparison predicate is non-canonical, then we certainly won't be able + // to make it canonical; canonicalizeCmpWithConstant() already tried. + if (!isCanonicalPredicate(Pred)) + return nullptr; + + // If the [input] type of comparison and select type are different, lets abort + // for now. We could try to compare constants with trunc/[zs]ext though. + if (C0->getType() != Sel.getType()) + return nullptr; + + // FIXME: are there any magic icmp predicate+constant pairs we must not touch? + + Value *SelVal0, *SelVal1; // We do not care which one is from where. + match(&Sel, m_Select(m_Value(), m_Value(SelVal0), m_Value(SelVal1))); + // At least one of these values we are selecting between must be a constant + // else we'll never succeed. + if (!match(SelVal0, m_AnyIntegralConstant()) && + !match(SelVal1, m_AnyIntegralConstant())) + return nullptr; + + // Does this constant C match any of the `select` values? + auto MatchesSelectValue = [SelVal0, SelVal1](Constant *C) { + return C->isElementWiseEqual(SelVal0) || C->isElementWiseEqual(SelVal1); + }; + + // If C0 *already* matches true/false value of select, we are done. + if (MatchesSelectValue(C0)) + return nullptr; + + // Check the constant we'd have with flipped-strictness predicate. + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, C0); + if (!FlippedStrictness) + return nullptr; + + // If said constant doesn't match either, then there is no hope, + if (!MatchesSelectValue(FlippedStrictness->second)) + return nullptr; + + // It matched! Lets insert the new comparison just before select. + InstCombiner::BuilderTy::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(&Sel); + + Pred = ICmpInst::getSwappedPredicate(Pred); // Yes, swapped. + Value *NewCmp = Builder.CreateICmp(Pred, X, FlippedStrictness->second, + Cmp.getName() + ".inv"); + Sel.setCondition(NewCmp); + Sel.swapValues(); + Sel.swapProfMetadata(); + + return &Sel; +} + +/// Visit a SelectInst that has an ICmpInst as its first operand. 
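+/// For illustration (hypothetical IR), one of the simple folds performed here
+/// substitutes the value known from an equality compare into the select:
+///   %c = icmp eq i32 %x, 4
+///   %r = select i1 %c, i32 %x, i32 %y   ; --> select i1 %c, i32 4, i32 %y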
+Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI, + ICmpInst *ICI) { + if (Value *V = foldSelectValueEquivalence(SI, *ICI, SQ)) + return replaceInstUsesWith(SI, V); + + if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, Builder)) + return NewSel; + + if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, Builder)) + return NewAbs; + + if (Instruction *NewAbs = canonicalizeClampLike(SI, *ICI, Builder)) + return NewAbs; + + if (Instruction *NewSel = + tryToReuseConstantFromSelectInComparison(SI, *ICI, Builder)) + return NewSel; + + bool Changed = adjustMinMax(SI, *ICI); + + if (Value *V = foldSelectICmpAnd(SI, ICI, Builder)) + return replaceInstUsesWith(SI, V); + + // NOTE: if we wanted to, this is where to detect integer MIN/MAX + Value *TrueVal = SI.getTrueValue(); + Value *FalseVal = SI.getFalseValue(); + ICmpInst::Predicate Pred = ICI->getPredicate(); + Value *CmpLHS = ICI->getOperand(0); + Value *CmpRHS = ICI->getOperand(1); + if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) { + if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) { + // Transform (X == C) ? X : Y -> (X == C) ? C : Y + SI.setOperand(1, CmpRHS); + Changed = true; + } else if (CmpLHS == FalseVal && Pred == ICmpInst::ICMP_NE) { + // Transform (X != C) ? Y : X -> (X != C) ? Y : C + SI.setOperand(2, CmpRHS); + Changed = true; + } + } + + // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring + // decomposeBitTestICmp() might help. + { + unsigned BitWidth = + DL.getTypeSizeInBits(TrueVal->getType()->getScalarType()); + APInt MinSignedValue = APInt::getSignedMinValue(BitWidth); + Value *X; + const APInt *Y, *C; + bool TrueWhenUnset; + bool IsBitTest = false; + if (ICmpInst::isEquality(Pred) && + match(CmpLHS, m_And(m_Value(X), m_Power2(Y))) && + match(CmpRHS, m_Zero())) { + IsBitTest = true; + TrueWhenUnset = Pred == ICmpInst::ICMP_EQ; + } else if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_Zero())) { + X = CmpLHS; + Y = &MinSignedValue; + IsBitTest = true; + TrueWhenUnset = false; + } else if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes())) { + X = CmpLHS; + Y = &MinSignedValue; + IsBitTest = true; + TrueWhenUnset = true; + } + if (IsBitTest) { + Value *V = nullptr; + // (X & Y) == 0 ? X : X ^ Y --> X & ~Y + if (TrueWhenUnset && TrueVal == X && + match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) + V = Builder.CreateAnd(X, ~(*Y)); + // (X & Y) != 0 ? X ^ Y : X --> X & ~Y + else if (!TrueWhenUnset && FalseVal == X && + match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) + V = Builder.CreateAnd(X, ~(*Y)); + // (X & Y) == 0 ? X ^ Y : X --> X | Y + else if (TrueWhenUnset && FalseVal == X && + match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) + V = Builder.CreateOr(X, *Y); + // (X & Y) != 0 ? 
X : X ^ Y --> X | Y + else if (!TrueWhenUnset && TrueVal == X && + match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) + V = Builder.CreateOr(X, *Y); + + if (V) + return replaceInstUsesWith(SI, V); + } + } + + if (Instruction *V = + foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder)) + return V; + + if (Instruction *V = foldSelectCtlzToCttz(ICI, TrueVal, FalseVal, Builder)) + return V; + + if (Value *V = foldSelectICmpAndOr(ICI, TrueVal, FalseVal, Builder)) + return replaceInstUsesWith(SI, V); + + if (Value *V = foldSelectICmpLshrAshr(ICI, TrueVal, FalseVal, Builder)) + return replaceInstUsesWith(SI, V); + + if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder)) + return replaceInstUsesWith(SI, V); + + if (Value *V = canonicalizeSaturatedSubtract(ICI, TrueVal, FalseVal, Builder)) + return replaceInstUsesWith(SI, V); + + if (Value *V = canonicalizeSaturatedAdd(ICI, TrueVal, FalseVal, Builder)) + return replaceInstUsesWith(SI, V); + + return Changed ? &SI : nullptr; +} + +/// SI is a select whose condition is a PHI node (but the two may be in +/// different blocks). See if the true/false values (V) are live in all of the +/// predecessor blocks of the PHI. For example, cases like this can't be mapped: +/// +/// X = phi [ C1, BB1], [C2, BB2] +/// Y = add +/// Z = select X, Y, 0 +/// +/// because Y is not live in BB1/BB2. +static bool canSelectOperandBeMappingIntoPredBlock(const Value *V, + const SelectInst &SI) { + // If the value is a non-instruction value like a constant or argument, it + // can always be mapped. + const Instruction *I = dyn_cast<Instruction>(V); + if (!I) return true; + + // If V is a PHI node defined in the same block as the condition PHI, we can + // map the arguments. + const PHINode *CondPHI = cast<PHINode>(SI.getCondition()); + + if (const PHINode *VP = dyn_cast<PHINode>(I)) + if (VP->getParent() == CondPHI->getParent()) + return true; + + // Otherwise, if the PHI and select are defined in the same block and if V is + // defined in a different block, then we can transform it. + if (SI.getParent() == CondPHI->getParent() && + I->getParent() != CondPHI->getParent()) + return true; + + // Otherwise we have a 'hard' case and we can't tell without doing more + // detailed dominator based analysis, punt. + return false; +} + +/// We have an SPF (e.g. a min or max) of an SPF of the form: +/// SPF2(SPF1(A, B), C) +Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner, + SelectPatternFlavor SPF1, + Value *A, Value *B, + Instruction &Outer, + SelectPatternFlavor SPF2, Value *C) { + if (Outer.getType() != Inner->getType()) + return nullptr; + + if (C == A || C == B) { + // MAX(MAX(A, B), B) -> MAX(A, B) + // MIN(MIN(a, b), a) -> MIN(a, b) + // TODO: This could be done in instsimplify. + if (SPF1 == SPF2 && SelectPatternResult::isMinOrMax(SPF1)) + return replaceInstUsesWith(Outer, Inner); + + // MAX(MIN(a, b), a) -> a + // MIN(MAX(a, b), a) -> a + // TODO: This could be done in instsimplify. + if ((SPF1 == SPF_SMIN && SPF2 == SPF_SMAX) || + (SPF1 == SPF_SMAX && SPF2 == SPF_SMIN) || + (SPF1 == SPF_UMIN && SPF2 == SPF_UMAX) || + (SPF1 == SPF_UMAX && SPF2 == SPF_UMIN)) + return replaceInstUsesWith(Outer, C); + } + + if (SPF1 == SPF2) { + const APInt *CB, *CC; + if (match(B, m_APInt(CB)) && match(C, m_APInt(CC))) { + // MIN(MIN(A, 23), 97) -> MIN(A, 23) + // MAX(MAX(A, 97), 23) -> MAX(A, 97) + // TODO: This could be done in instsimplify. 
+ if ((SPF1 == SPF_UMIN && CB->ule(*CC)) || + (SPF1 == SPF_SMIN && CB->sle(*CC)) || + (SPF1 == SPF_UMAX && CB->uge(*CC)) || + (SPF1 == SPF_SMAX && CB->sge(*CC))) + return replaceInstUsesWith(Outer, Inner); + + // MIN(MIN(A, 97), 23) -> MIN(A, 23) + // MAX(MAX(A, 23), 97) -> MAX(A, 97) + if ((SPF1 == SPF_UMIN && CB->ugt(*CC)) || + (SPF1 == SPF_SMIN && CB->sgt(*CC)) || + (SPF1 == SPF_UMAX && CB->ult(*CC)) || + (SPF1 == SPF_SMAX && CB->slt(*CC))) { + Outer.replaceUsesOfWith(Inner, A); + return &Outer; + } + } + } + + // max(max(A, B), min(A, B)) --> max(A, B) + // min(min(A, B), max(A, B)) --> min(A, B) + // TODO: This could be done in instsimplify. + if (SPF1 == SPF2 && + ((SPF1 == SPF_UMIN && match(C, m_c_UMax(m_Specific(A), m_Specific(B)))) || + (SPF1 == SPF_SMIN && match(C, m_c_SMax(m_Specific(A), m_Specific(B)))) || + (SPF1 == SPF_UMAX && match(C, m_c_UMin(m_Specific(A), m_Specific(B)))) || + (SPF1 == SPF_SMAX && match(C, m_c_SMin(m_Specific(A), m_Specific(B)))))) + return replaceInstUsesWith(Outer, Inner); + + // ABS(ABS(X)) -> ABS(X) + // NABS(NABS(X)) -> NABS(X) + // TODO: This could be done in instsimplify. + if (SPF1 == SPF2 && (SPF1 == SPF_ABS || SPF1 == SPF_NABS)) { + return replaceInstUsesWith(Outer, Inner); + } + + // ABS(NABS(X)) -> ABS(X) + // NABS(ABS(X)) -> NABS(X) + if ((SPF1 == SPF_ABS && SPF2 == SPF_NABS) || + (SPF1 == SPF_NABS && SPF2 == SPF_ABS)) { + SelectInst *SI = cast<SelectInst>(Inner); + Value *NewSI = + Builder.CreateSelect(SI->getCondition(), SI->getFalseValue(), + SI->getTrueValue(), SI->getName(), SI); + return replaceInstUsesWith(Outer, NewSI); + } + + auto IsFreeOrProfitableToInvert = + [&](Value *V, Value *&NotV, bool &ElidesXor) { + if (match(V, m_Not(m_Value(NotV)))) { + // If V has at most 2 uses then we can get rid of the xor operation + // entirely. + ElidesXor |= !V->hasNUsesOrMore(3); + return true; + } + + if (isFreeToInvert(V, !V->hasNUsesOrMore(3))) { + NotV = nullptr; + return true; + } + + return false; + }; + + Value *NotA, *NotB, *NotC; + bool ElidesXor = false; + + // MIN(MIN(~A, ~B), ~C) == ~MAX(MAX(A, B), C) + // MIN(MAX(~A, ~B), ~C) == ~MAX(MIN(A, B), C) + // MAX(MIN(~A, ~B), ~C) == ~MIN(MAX(A, B), C) + // MAX(MAX(~A, ~B), ~C) == ~MIN(MIN(A, B), C) + // + // This transform is performance neutral if we can elide at least one xor from + // the set of three operands, since we'll be tacking on an xor at the very + // end. + if (SelectPatternResult::isMinOrMax(SPF1) && + SelectPatternResult::isMinOrMax(SPF2) && + IsFreeOrProfitableToInvert(A, NotA, ElidesXor) && + IsFreeOrProfitableToInvert(B, NotB, ElidesXor) && + IsFreeOrProfitableToInvert(C, NotC, ElidesXor) && ElidesXor) { + if (!NotA) + NotA = Builder.CreateNot(A); + if (!NotB) + NotB = Builder.CreateNot(B); + if (!NotC) + NotC = Builder.CreateNot(C); + + Value *NewInner = createMinMax(Builder, getInverseMinMaxFlavor(SPF1), NotA, + NotB); + Value *NewOuter = Builder.CreateNot( + createMinMax(Builder, getInverseMinMaxFlavor(SPF2), NewInner, NotC)); + return replaceInstUsesWith(Outer, NewOuter); + } + + return nullptr; +} + +/// Turn select C, (X + Y), (X - Y) --> (X + (select C, Y, (-Y))). +/// This is even legal for FP. 
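+/// The subtracted operand does not have to match the added one; for example
+/// (illustrative), select C, (add X, Y), (sub X, Z) becomes
+/// (X + (select C, Y, (-Z))).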
+static Instruction *foldAddSubSelect(SelectInst &SI, + InstCombiner::BuilderTy &Builder) { + Value *CondVal = SI.getCondition(); + Value *TrueVal = SI.getTrueValue(); + Value *FalseVal = SI.getFalseValue(); + auto *TI = dyn_cast<Instruction>(TrueVal); + auto *FI = dyn_cast<Instruction>(FalseVal); + if (!TI || !FI || !TI->hasOneUse() || !FI->hasOneUse()) + return nullptr; + + Instruction *AddOp = nullptr, *SubOp = nullptr; + if ((TI->getOpcode() == Instruction::Sub && + FI->getOpcode() == Instruction::Add) || + (TI->getOpcode() == Instruction::FSub && + FI->getOpcode() == Instruction::FAdd)) { + AddOp = FI; + SubOp = TI; + } else if ((FI->getOpcode() == Instruction::Sub && + TI->getOpcode() == Instruction::Add) || + (FI->getOpcode() == Instruction::FSub && + TI->getOpcode() == Instruction::FAdd)) { + AddOp = TI; + SubOp = FI; + } + + if (AddOp) { + Value *OtherAddOp = nullptr; + if (SubOp->getOperand(0) == AddOp->getOperand(0)) { + OtherAddOp = AddOp->getOperand(1); + } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) { + OtherAddOp = AddOp->getOperand(0); + } + + if (OtherAddOp) { + // So at this point we know we have (Y -> OtherAddOp): + // select C, (add X, Y), (sub X, Z) + Value *NegVal; // Compute -Z + if (SI.getType()->isFPOrFPVectorTy()) { + NegVal = Builder.CreateFNeg(SubOp->getOperand(1)); + if (Instruction *NegInst = dyn_cast<Instruction>(NegVal)) { + FastMathFlags Flags = AddOp->getFastMathFlags(); + Flags &= SubOp->getFastMathFlags(); + NegInst->setFastMathFlags(Flags); + } + } else { + NegVal = Builder.CreateNeg(SubOp->getOperand(1)); + } + + Value *NewTrueOp = OtherAddOp; + Value *NewFalseOp = NegVal; + if (AddOp != TI) + std::swap(NewTrueOp, NewFalseOp); + Value *NewSel = Builder.CreateSelect(CondVal, NewTrueOp, NewFalseOp, + SI.getName() + ".p", &SI); + + if (SI.getType()->isFPOrFPVectorTy()) { + Instruction *RI = + BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel); + + FastMathFlags Flags = AddOp->getFastMathFlags(); + Flags &= SubOp->getFastMathFlags(); + RI->setFastMathFlags(Flags); + return RI; + } else + return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel); + } + } + return nullptr; +} + +Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) { + Constant *C; + if (!match(Sel.getTrueValue(), m_Constant(C)) && + !match(Sel.getFalseValue(), m_Constant(C))) + return nullptr; + + Instruction *ExtInst; + if (!match(Sel.getTrueValue(), m_Instruction(ExtInst)) && + !match(Sel.getFalseValue(), m_Instruction(ExtInst))) + return nullptr; + + auto ExtOpcode = ExtInst->getOpcode(); + if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt) + return nullptr; + + // If we are extending from a boolean type or if we can create a select that + // has the same size operands as its condition, try to narrow the select. + Value *X = ExtInst->getOperand(0); + Type *SmallType = X->getType(); + Value *Cond = Sel.getCondition(); + auto *Cmp = dyn_cast<CmpInst>(Cond); + if (!SmallType->isIntOrIntVectorTy(1) && + (!Cmp || Cmp->getOperand(0)->getType() != SmallType)) + return nullptr; + + // If the constant is the same after truncation to the smaller type and + // extension to the original type, we can narrow the select. 
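+  // For example (illustrative constants): with zext i8 --> i32 and C == 200,
+  // trunc followed by zext gives back 200, so the select can be narrowed to
+  // i8; with sext, trunc(200) reads as -56 and sign-extends to -56 != 200, so
+  // the fold does not apply.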
+ Type *SelType = Sel.getType(); + Constant *TruncC = ConstantExpr::getTrunc(C, SmallType); + Constant *ExtC = ConstantExpr::getCast(ExtOpcode, TruncC, SelType); + if (ExtC == C) { + Value *TruncCVal = cast<Value>(TruncC); + if (ExtInst == Sel.getFalseValue()) + std::swap(X, TruncCVal); + + // select Cond, (ext X), C --> ext(select Cond, X, C') + // select Cond, C, (ext X) --> ext(select Cond, C', X) + Value *NewSel = Builder.CreateSelect(Cond, X, TruncCVal, "narrow", &Sel); + return CastInst::Create(Instruction::CastOps(ExtOpcode), NewSel, SelType); + } + + // If one arm of the select is the extend of the condition, replace that arm + // with the extension of the appropriate known bool value. + if (Cond == X) { + if (ExtInst == Sel.getTrueValue()) { + // select X, (sext X), C --> select X, -1, C + // select X, (zext X), C --> select X, 1, C + Constant *One = ConstantInt::getTrue(SmallType); + Constant *AllOnesOrOne = ConstantExpr::getCast(ExtOpcode, One, SelType); + return SelectInst::Create(Cond, AllOnesOrOne, C, "", nullptr, &Sel); + } else { + // select X, C, (sext X) --> select X, C, 0 + // select X, C, (zext X) --> select X, C, 0 + Constant *Zero = ConstantInt::getNullValue(SelType); + return SelectInst::Create(Cond, C, Zero, "", nullptr, &Sel); + } + } + + return nullptr; +} + +/// Try to transform a vector select with a constant condition vector into a +/// shuffle for easier combining with other shuffles and insert/extract. +static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) { + Value *CondVal = SI.getCondition(); + Constant *CondC; + if (!CondVal->getType()->isVectorTy() || !match(CondVal, m_Constant(CondC))) + return nullptr; + + unsigned NumElts = CondVal->getType()->getVectorNumElements(); + SmallVector<Constant *, 16> Mask; + Mask.reserve(NumElts); + Type *Int32Ty = Type::getInt32Ty(CondVal->getContext()); + for (unsigned i = 0; i != NumElts; ++i) { + Constant *Elt = CondC->getAggregateElement(i); + if (!Elt) + return nullptr; + + if (Elt->isOneValue()) { + // If the select condition element is true, choose from the 1st vector. + Mask.push_back(ConstantInt::get(Int32Ty, i)); + } else if (Elt->isNullValue()) { + // If the select condition element is false, choose from the 2nd vector. + Mask.push_back(ConstantInt::get(Int32Ty, i + NumElts)); + } else if (isa<UndefValue>(Elt)) { + // Undef in a select condition (choose one of the operands) does not mean + // the same thing as undef in a shuffle mask (any value is acceptable), so + // give up. + return nullptr; + } else { + // Bail out on a constant expression. + return nullptr; + } + } + + return new ShuffleVectorInst(SI.getTrueValue(), SI.getFalseValue(), + ConstantVector::get(Mask)); +} + +/// If we have a select of vectors with a scalar condition, try to convert that +/// to a vector select by splatting the condition. A splat may get folded with +/// other operations in IR and having all operands of a select be vector types +/// is likely better for vector codegen. +static Instruction *canonicalizeScalarSelectOfVecs( + SelectInst &Sel, InstCombiner::BuilderTy &Builder) { + Type *Ty = Sel.getType(); + if (!Ty->isVectorTy()) + return nullptr; + + // We can replace a single-use extract with constant index. 
+  Value *Cond = Sel.getCondition();
+  if (!match(Cond, m_OneUse(m_ExtractElement(m_Value(), m_ConstantInt()))))
+    return nullptr;
+
+  // select (extelt V, Index), T, F --> select (splat V, Index), T, F
+  // Splatting the extracted condition reduces code (we could directly create a
+  // splat shuffle of the source vector to eliminate the intermediate step).
+  unsigned NumElts = Ty->getVectorNumElements();
+  Value *SplatCond = Builder.CreateVectorSplat(NumElts, Cond);
+  Sel.setCondition(SplatCond);
+  return &Sel;
+}
+
+/// Reuse bitcasted operands between a compare and select:
+/// select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
+/// bitcast (select (cmp (bitcast C), (bitcast D)), (bitcast C), (bitcast D))
+static Instruction *foldSelectCmpBitcasts(SelectInst &Sel,
+                                          InstCombiner::BuilderTy &Builder) {
+  Value *Cond = Sel.getCondition();
+  Value *TVal = Sel.getTrueValue();
+  Value *FVal = Sel.getFalseValue();
+
+  CmpInst::Predicate Pred;
+  Value *A, *B;
+  if (!match(Cond, m_Cmp(Pred, m_Value(A), m_Value(B))))
+    return nullptr;
+
+  // The select condition is a compare instruction. If the select's true/false
+  // values are already the same as the compare operands, there's nothing to do.
+  if (TVal == A || TVal == B || FVal == A || FVal == B)
+    return nullptr;
+
+  Value *C, *D;
+  if (!match(A, m_BitCast(m_Value(C))) || !match(B, m_BitCast(m_Value(D))))
+    return nullptr;
+
+  // select (cmp (bitcast C), (bitcast D)), (bitcast TSrc), (bitcast FSrc)
+  Value *TSrc, *FSrc;
+  if (!match(TVal, m_BitCast(m_Value(TSrc))) ||
+      !match(FVal, m_BitCast(m_Value(FSrc))))
+    return nullptr;
+
+  // If the select true/false values are *different bitcasts* of the same source
+  // operands, make the select operands the same as the compare operands and
+  // cast the result. This is the canonical select form for min/max.
+  Value *NewSel;
+  if (TSrc == C && FSrc == D) {
+    // select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
+    // bitcast (select (cmp A, B), A, B)
+    NewSel = Builder.CreateSelect(Cond, A, B, "", &Sel);
+  } else if (TSrc == D && FSrc == C) {
+    // select (cmp (bitcast C), (bitcast D)), (bitcast' D), (bitcast' C) -->
+    // bitcast (select (cmp A, B), B, A)
+    NewSel = Builder.CreateSelect(Cond, B, A, "", &Sel);
+  } else {
+    return nullptr;
+  }
+  return CastInst::CreateBitOrPointerCast(NewSel, Sel.getType());
+}
+
+/// Try to eliminate select instructions that test the returned flag of cmpxchg
+/// instructions.
+///
+/// If a select instruction tests the returned flag of a cmpxchg instruction and
+/// selects between the returned value of the cmpxchg instruction and its
+/// compare operand, the result of the select will always be equal to its false
+/// value. For example:
+///
+/// %0 = cmpxchg i64* %ptr, i64 %compare, i64 %new_value seq_cst seq_cst
+/// %1 = extractvalue { i64, i1 } %0, 1
+/// %2 = extractvalue { i64, i1 } %0, 0
+/// %3 = select i1 %1, i64 %compare, i64 %2
+/// ret i64 %3
+///
+/// The returned value of the cmpxchg instruction (%2) is the original value
+/// located at %ptr prior to any update. If the cmpxchg operation succeeds, %2
+/// must have been equal to %compare.
Thus, the result of the select is always +/// equal to %2, and the code can be simplified to: +/// +/// %0 = cmpxchg i64* %ptr, i64 %compare, i64 %new_value seq_cst seq_cst +/// %1 = extractvalue { i64, i1 } %0, 0 +/// ret i64 %1 +/// +static Instruction *foldSelectCmpXchg(SelectInst &SI) { + // A helper that determines if V is an extractvalue instruction whose + // aggregate operand is a cmpxchg instruction and whose single index is equal + // to I. If such conditions are true, the helper returns the cmpxchg + // instruction; otherwise, a nullptr is returned. + auto isExtractFromCmpXchg = [](Value *V, unsigned I) -> AtomicCmpXchgInst * { + auto *Extract = dyn_cast<ExtractValueInst>(V); + if (!Extract) + return nullptr; + if (Extract->getIndices()[0] != I) + return nullptr; + return dyn_cast<AtomicCmpXchgInst>(Extract->getAggregateOperand()); + }; + + // If the select has a single user, and this user is a select instruction that + // we can simplify, skip the cmpxchg simplification for now. + if (SI.hasOneUse()) + if (auto *Select = dyn_cast<SelectInst>(SI.user_back())) + if (Select->getCondition() == SI.getCondition()) + if (Select->getFalseValue() == SI.getTrueValue() || + Select->getTrueValue() == SI.getFalseValue()) + return nullptr; + + // Ensure the select condition is the returned flag of a cmpxchg instruction. + auto *CmpXchg = isExtractFromCmpXchg(SI.getCondition(), 1); + if (!CmpXchg) + return nullptr; + + // Check the true value case: The true value of the select is the returned + // value of the same cmpxchg used by the condition, and the false value is the + // cmpxchg instruction's compare operand. + if (auto *X = isExtractFromCmpXchg(SI.getTrueValue(), 0)) + if (X == CmpXchg && X->getCompareOperand() == SI.getFalseValue()) { + SI.setTrueValue(SI.getFalseValue()); + return &SI; + } + + // Check the false value case: The false value of the select is the returned + // value of the same cmpxchg used by the condition, and the true value is the + // cmpxchg instruction's compare operand. + if (auto *X = isExtractFromCmpXchg(SI.getFalseValue(), 0)) + if (X == CmpXchg && X->getCompareOperand() == SI.getTrueValue()) { + SI.setTrueValue(SI.getFalseValue()); + return &SI; + } + + return nullptr; +} + +static Instruction *moveAddAfterMinMax(SelectPatternFlavor SPF, Value *X, + Value *Y, + InstCombiner::BuilderTy &Builder) { + assert(SelectPatternResult::isMinOrMax(SPF) && "Expected min/max pattern"); + bool IsUnsigned = SPF == SelectPatternFlavor::SPF_UMIN || + SPF == SelectPatternFlavor::SPF_UMAX; + // TODO: If InstSimplify could fold all cases where C2 <= C1, we could change + // the constant value check to an assert. 
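+  // For example (illustrative constants):
+  //   umin (add nuw %a, 5), 10 --> add nuw (umin %a, 5), 5
+  // The nuw flag (or nsw in the signed case below) is what makes it safe to
+  // move the add past the min/max.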
+  Value *A;
+  const APInt *C1, *C2;
+  if (IsUnsigned && match(X, m_NUWAdd(m_Value(A), m_APInt(C1))) &&
+      match(Y, m_APInt(C2)) && C2->uge(*C1) && X->hasNUses(2)) {
+    // umin (add nuw A, C1), C2 --> add nuw (umin A, C2 - C1), C1
+    // umax (add nuw A, C1), C2 --> add nuw (umax A, C2 - C1), C1
+    Value *NewMinMax = createMinMax(Builder, SPF, A,
+                                    ConstantInt::get(X->getType(), *C2 - *C1));
+    return BinaryOperator::CreateNUW(BinaryOperator::Add, NewMinMax,
+                                     ConstantInt::get(X->getType(), *C1));
+  }
+
+  if (!IsUnsigned && match(X, m_NSWAdd(m_Value(A), m_APInt(C1))) &&
+      match(Y, m_APInt(C2)) && X->hasNUses(2)) {
+    bool Overflow;
+    APInt Diff = C2->ssub_ov(*C1, Overflow);
+    if (!Overflow) {
+      // smin (add nsw A, C1), C2 --> add nsw (smin A, C2 - C1), C1
+      // smax (add nsw A, C1), C2 --> add nsw (smax A, C2 - C1), C1
+      Value *NewMinMax = createMinMax(Builder, SPF, A,
+                                      ConstantInt::get(X->getType(), Diff));
+      return BinaryOperator::CreateNSW(BinaryOperator::Add, NewMinMax,
+                                       ConstantInt::get(X->getType(), *C1));
+    }
+  }
+
+  return nullptr;
+}
+
+/// Match a sadd_sat or ssub_sat which is using min/max to clamp the value.
+Instruction *InstCombiner::matchSAddSubSat(SelectInst &MinMax1) {
+  Type *Ty = MinMax1.getType();
+
+  // We are looking for a tree of:
+  //   max(INT_MIN, min(INT_MAX, add(sext(A), sext(B))))
+  // where the min and max could be reversed.
+  Instruction *MinMax2;
+  BinaryOperator *AddSub;
+  const APInt *MinValue, *MaxValue;
+  if (match(&MinMax1, m_SMin(m_Instruction(MinMax2), m_APInt(MaxValue)))) {
+    if (!match(MinMax2, m_SMax(m_BinOp(AddSub), m_APInt(MinValue))))
+      return nullptr;
+  } else if (match(&MinMax1,
+                   m_SMax(m_Instruction(MinMax2), m_APInt(MinValue)))) {
+    if (!match(MinMax2, m_SMin(m_BinOp(AddSub), m_APInt(MaxValue))))
+      return nullptr;
+  } else
+    return nullptr;
+
+  // Check that the constants clamp to a saturating range, and that the new
+  // type would be sensible to convert to.
+  if (!(*MaxValue + 1).isPowerOf2() || -*MinValue != *MaxValue + 1)
+    return nullptr;
+  // In what bitwidth can this be treated as saturating arithmetic?
+  unsigned NewBitWidth = (*MaxValue + 1).logBase2() + 1;
+  // FIXME: This isn't quite right for vectors, but using the scalar type is a
+  // good first approximation for what should be done there.
+  if (!shouldChangeType(Ty->getScalarType()->getIntegerBitWidth(), NewBitWidth))
+    return nullptr;
+
+  // Also make sure that the number of uses is as expected. The "3"s are for
+  // the two items of min/max (the compare and the select).
+  if (MinMax2->hasNUsesOrMore(3) || AddSub->hasNUsesOrMore(3))
+    return nullptr;
+
+  // Create the new type (which can be a vector type).
+  Type *NewTy = Ty->getWithNewBitWidth(NewBitWidth);
+  // Match the two extends from the add/sub.
+  Value *A, *B;
+  if (!match(AddSub, m_BinOp(m_SExt(m_Value(A)), m_SExt(m_Value(B)))))
+    return nullptr;
+  // And check the incoming values are of a type smaller than or equal to the
+  // size of the saturation. Otherwise the higher bits can cause different
+  // results.
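+  // For example (illustrative): MinValue == -32768 and MaxValue == 32767 give
+  // NewBitWidth == 16, so i8 or i16 inputs pass the check below but i32 inputs
+  // do not.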
+  if (A->getType()->getScalarSizeInBits() > NewBitWidth ||
+      B->getType()->getScalarSizeInBits() > NewBitWidth)
+    return nullptr;
+
+  Intrinsic::ID IntrinsicID;
+  if (AddSub->getOpcode() == Instruction::Add)
+    IntrinsicID = Intrinsic::sadd_sat;
+  else if (AddSub->getOpcode() == Instruction::Sub)
+    IntrinsicID = Intrinsic::ssub_sat;
+  else
+    return nullptr;
+
+  // Finally create and return the sat intrinsic, truncated to the new type
+  Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy);
+  Value *AT = Builder.CreateSExt(A, NewTy);
+  Value *BT = Builder.CreateSExt(B, NewTy);
+  Value *Sat = Builder.CreateCall(F, {AT, BT});
+  return CastInst::Create(Instruction::SExt, Sat, Ty);
+}
+
+/// Reduce a sequence of min/max with a common operand.
+static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS,
+                                        Value *RHS,
+                                        InstCombiner::BuilderTy &Builder) {
+  assert(SelectPatternResult::isMinOrMax(SPF) && "Expected a min/max");
+  // TODO: Allow FP min/max with nnan/nsz.
+  if (!LHS->getType()->isIntOrIntVectorTy())
+    return nullptr;
+
+  // Match 3 of the same min/max ops. Example: umin(umin(), umin()).
+  Value *A, *B, *C, *D;
+  SelectPatternResult L = matchSelectPattern(LHS, A, B);
+  SelectPatternResult R = matchSelectPattern(RHS, C, D);
+  if (SPF != L.Flavor || L.Flavor != R.Flavor)
+    return nullptr;
+
+  // Look for a common operand. The use checks are different than usual because
+  // a min/max pattern typically has 2 uses of each op: 1 by the cmp and 1 by
+  // the select.
+  Value *MinMaxOp = nullptr;
+  Value *ThirdOp = nullptr;
+  if (!LHS->hasNUsesOrMore(3) && RHS->hasNUsesOrMore(3)) {
+    // If the LHS is only used in this chain and the RHS is used outside of it,
+    // reuse the RHS min/max because that will eliminate the LHS.
+    if (D == A || C == A) {
+      // min(min(a, b), min(c, a)) --> min(min(c, a), b)
+      // min(min(a, b), min(a, d)) --> min(min(a, d), b)
+      MinMaxOp = RHS;
+      ThirdOp = B;
+    } else if (D == B || C == B) {
+      // min(min(a, b), min(c, b)) --> min(min(c, b), a)
+      // min(min(a, b), min(b, d)) --> min(min(b, d), a)
+      MinMaxOp = RHS;
+      ThirdOp = A;
+    }
+  } else if (!RHS->hasNUsesOrMore(3)) {
+    // Reuse the LHS. This will eliminate the RHS.
+    if (D == A || D == B) {
+      // min(min(a, b), min(c, a)) --> min(min(a, b), c)
+      // min(min(a, b), min(c, b)) --> min(min(a, b), c)
+      MinMaxOp = LHS;
+      ThirdOp = C;
+    } else if (C == A || C == B) {
+      // min(min(a, b), min(a, d)) --> min(min(a, b), d)
+      // min(min(a, b), min(b, d)) --> min(min(a, b), d)
+      MinMaxOp = LHS;
+      ThirdOp = D;
+    }
+  }
+  if (!MinMaxOp || !ThirdOp)
+    return nullptr;
+
+  CmpInst::Predicate P = getMinMaxPred(SPF);
+  Value *CmpABC = Builder.CreateICmp(P, MinMaxOp, ThirdOp);
+  return SelectInst::Create(CmpABC, MinMaxOp, ThirdOp);
+}
+
+/// Try to reduce a rotate pattern that includes a compare and select into a
+/// funnel shift intrinsic. Example:
+/// rotl32(a, b) --> (b == 0 ? a : ((a >> (32 - b)) | (a << b)))
+/// --> call llvm.fshl.i32(a, a, b)
+static Instruction *foldSelectRotate(SelectInst &Sel) {
+  // The false value of the select must be a rotate of the true value.
+  Value *Or0, *Or1;
+  if (!match(Sel.getFalseValue(), m_OneUse(m_Or(m_Value(Or0), m_Value(Or1)))))
+    return nullptr;
+
+  Value *TVal = Sel.getTrueValue();
+  Value *SA0, *SA1;
+  if (!match(Or0, m_OneUse(m_LogicalShift(m_Specific(TVal), m_Value(SA0)))) ||
+      !match(Or1, m_OneUse(m_LogicalShift(m_Specific(TVal), m_Value(SA1)))))
+    return nullptr;
+
+  auto ShiftOpcode0 = cast<BinaryOperator>(Or0)->getOpcode();
+  auto ShiftOpcode1 = cast<BinaryOperator>(Or1)->getOpcode();
+  if (ShiftOpcode0 == ShiftOpcode1)
+    return nullptr;
+
+  // We have one of these patterns so far:
+  // select ?, TVal, (or (lshr TVal, SA0), (shl TVal, SA1))
+  // select ?, TVal, (or (shl TVal, SA0), (lshr TVal, SA1))
+  // This must be a power-of-2 rotate for a bitmasking transform to be valid.
+  unsigned Width = Sel.getType()->getScalarSizeInBits();
+  if (!isPowerOf2_32(Width))
+    return nullptr;
+
+  // Check the shift amounts to see if they are an opposite pair.
+  Value *ShAmt;
+  if (match(SA1, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA0)))))
+    ShAmt = SA0;
+  else if (match(SA0, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA1)))))
+    ShAmt = SA1;
+  else
+    return nullptr;
+
+  // Finally, see if the select is filtering out a shift-by-zero.
+  Value *Cond = Sel.getCondition();
+  ICmpInst::Predicate Pred;
+  if (!match(Cond, m_OneUse(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()))) ||
+      Pred != ICmpInst::ICMP_EQ)
+    return nullptr;
+
+  // This is a rotate that avoids shift-by-bitwidth UB in a suboptimal way.
+  // Convert to funnel shift intrinsic.
+  bool IsFshl = (ShAmt == SA0 && ShiftOpcode0 == BinaryOperator::Shl) ||
+                (ShAmt == SA1 && ShiftOpcode1 == BinaryOperator::Shl);
+  Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
+  Function *F = Intrinsic::getDeclaration(Sel.getModule(), IID, Sel.getType());
+  return IntrinsicInst::Create(F, { TVal, TVal, ShAmt });
+}
+
+Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
+  Value *CondVal = SI.getCondition();
+  Value *TrueVal = SI.getTrueValue();
+  Value *FalseVal = SI.getFalseValue();
+  Type *SelType = SI.getType();
+
+  // FIXME: Remove this workaround when freeze-related patches are done.
+  // For a select with an undef operand that feeds into an equality comparison,
+  // don't simplify it, so loop unswitch can know that the equality comparison
+  // may have an undef operand. This is a workaround for PR31652, caused by a
+  // discrepancy about branching on undef between LoopUnswitch and GVN.
+  if (isa<UndefValue>(TrueVal) || isa<UndefValue>(FalseVal)) {
+    if (llvm::any_of(SI.users(), [&](User *U) {
+          ICmpInst *CI = dyn_cast<ICmpInst>(U);
+          if (CI && CI->isEquality())
+            return true;
+          return false;
+        })) {
+      return nullptr;
+    }
+  }
+
+  if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal,
+                                    SQ.getWithInstruction(&SI)))
+    return replaceInstUsesWith(SI, V);
+
+  if (Instruction *I = canonicalizeSelectToShuffle(SI))
+    return I;
+
+  if (Instruction *I = canonicalizeScalarSelectOfVecs(SI, Builder))
+    return I;
+
+  // Canonicalize a one-use integer compare with a non-canonical predicate by
+  // inverting the predicate and swapping the select operands. This matches a
+  // compare canonicalization for conditional branches.
+  // TODO: Should we do the same for FP compares?
+  CmpInst::Predicate Pred;
+  if (match(CondVal, m_OneUse(m_ICmp(Pred, m_Value(), m_Value()))) &&
+      !isCanonicalPredicate(Pred)) {
+    // Swap true/false values and condition.
+ CmpInst *Cond = cast<CmpInst>(CondVal); + Cond->setPredicate(CmpInst::getInversePredicate(Pred)); + SI.setOperand(1, FalseVal); + SI.setOperand(2, TrueVal); + SI.swapProfMetadata(); + Worklist.Add(Cond); + return &SI; + } + + if (SelType->isIntOrIntVectorTy(1) && + TrueVal->getType() == CondVal->getType()) { + if (match(TrueVal, m_One())) { + // Change: A = select B, true, C --> A = or B, C + return BinaryOperator::CreateOr(CondVal, FalseVal); + } + if (match(TrueVal, m_Zero())) { + // Change: A = select B, false, C --> A = and !B, C + Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName()); + return BinaryOperator::CreateAnd(NotCond, FalseVal); + } + if (match(FalseVal, m_Zero())) { + // Change: A = select B, C, false --> A = and B, C + return BinaryOperator::CreateAnd(CondVal, TrueVal); + } + if (match(FalseVal, m_One())) { + // Change: A = select B, C, true --> A = or !B, C + Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName()); + return BinaryOperator::CreateOr(NotCond, TrueVal); + } + + // select a, a, b -> a | b + // select a, b, a -> a & b + if (CondVal == TrueVal) + return BinaryOperator::CreateOr(CondVal, FalseVal); + if (CondVal == FalseVal) + return BinaryOperator::CreateAnd(CondVal, TrueVal); + + // select a, ~a, b -> (~a) & b + // select a, b, ~a -> (~a) | b + if (match(TrueVal, m_Not(m_Specific(CondVal)))) + return BinaryOperator::CreateAnd(TrueVal, FalseVal); + if (match(FalseVal, m_Not(m_Specific(CondVal)))) + return BinaryOperator::CreateOr(TrueVal, FalseVal); + } + + // Selecting between two integer or vector splat integer constants? + // + // Note that we don't handle a scalar select of vectors: + // select i1 %c, <2 x i8> <1, 1>, <2 x i8> <0, 0> + // because that may need 3 instructions to splat the condition value: + // extend, insertelement, shufflevector. + if (SelType->isIntOrIntVectorTy() && + CondVal->getType()->isVectorTy() == SelType->isVectorTy()) { + // select C, 1, 0 -> zext C to int + if (match(TrueVal, m_One()) && match(FalseVal, m_Zero())) + return new ZExtInst(CondVal, SelType); + + // select C, -1, 0 -> sext C to int + if (match(TrueVal, m_AllOnes()) && match(FalseVal, m_Zero())) + return new SExtInst(CondVal, SelType); + + // select C, 0, 1 -> zext !C to int + if (match(TrueVal, m_Zero()) && match(FalseVal, m_One())) { + Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName()); + return new ZExtInst(NotCond, SelType); + } + + // select C, 0, -1 -> sext !C to int + if (match(TrueVal, m_Zero()) && match(FalseVal, m_AllOnes())) { + Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName()); + return new SExtInst(NotCond, SelType); + } + } + + // See if we are selecting two values based on a comparison of the two values. + if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) { + if (FCI->getOperand(0) == TrueVal && FCI->getOperand(1) == FalseVal) { + // Canonicalize to use ordered comparisons by swapping the select + // operands. + // + // e.g. + // (X ugt Y) ? X : Y -> (X ole Y) ? 
Y : X
+      if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
+        FCmpInst::Predicate InvPred = FCI->getInversePredicate();
+        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+        Builder.setFastMathFlags(FCI->getFastMathFlags());
+        Value *NewCond = Builder.CreateFCmp(InvPred, TrueVal, FalseVal,
+                                            FCI->getName() + ".inv");
+
+        return SelectInst::Create(NewCond, FalseVal, TrueVal,
+                                  SI.getName() + ".p");
+      }
+
+      // NOTE: if we wanted to, this is where to detect MIN/MAX
+    } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal) {
+      // Canonicalize to use ordered comparisons by swapping the select
+      // operands.
+      //
+      // e.g.
+      // (X ugt Y) ? Y : X -> (X ole Y) ? X : Y
+      if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
+        FCmpInst::Predicate InvPred = FCI->getInversePredicate();
+        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+        Builder.setFastMathFlags(FCI->getFastMathFlags());
+        Value *NewCond = Builder.CreateFCmp(InvPred, FalseVal, TrueVal,
+                                            FCI->getName() + ".inv");
+
+        return SelectInst::Create(NewCond, FalseVal, TrueVal,
+                                  SI.getName() + ".p");
+      }
+
+      // NOTE: if we wanted to, this is where to detect MIN/MAX
+    }
+  }
+
+  // Canonicalize select with fcmp to fabs(). -0.0 makes this tricky. We need
+  // fast-math-flags (nsz) or fsub with +0.0 (not fneg) for this to work. We
+  // also require nnan because we do not want to unintentionally change the
+  // sign of a NaN value.
+  // FIXME: These folds should test/propagate FMF from the select, not the
+  // fsub or fneg.
+  // (X <= +/-0.0) ? (0.0 - X) : X --> fabs(X)
+  Instruction *FSub;
+  if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) &&
+      match(TrueVal, m_FSub(m_PosZeroFP(), m_Specific(FalseVal))) &&
+      match(TrueVal, m_Instruction(FSub)) && FSub->hasNoNaNs() &&
+      (Pred == FCmpInst::FCMP_OLE || Pred == FCmpInst::FCMP_ULE)) {
+    Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, FSub);
+    return replaceInstUsesWith(SI, Fabs);
+  }
+  // (X > +/-0.0) ? X : (0.0 - X) --> fabs(X)
+  if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) &&
+      match(FalseVal, m_FSub(m_PosZeroFP(), m_Specific(TrueVal))) &&
+      match(FalseVal, m_Instruction(FSub)) && FSub->hasNoNaNs() &&
+      (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_UGT)) {
+    Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, FSub);
+    return replaceInstUsesWith(SI, Fabs);
+  }
+  // With nnan and nsz:
+  // (X < +/-0.0) ? -X : X --> fabs(X)
+  // (X <= +/-0.0) ? -X : X --> fabs(X)
+  Instruction *FNeg;
+  if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) &&
+      match(TrueVal, m_FNeg(m_Specific(FalseVal))) &&
+      match(TrueVal, m_Instruction(FNeg)) &&
+      FNeg->hasNoNaNs() && FNeg->hasNoSignedZeros() &&
+      (Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_OLE ||
+       Pred == FCmpInst::FCMP_ULT || Pred == FCmpInst::FCMP_ULE)) {
+    Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, FNeg);
+    return replaceInstUsesWith(SI, Fabs);
+  }
+  // With nnan and nsz:
+  // (X > +/-0.0) ? X : -X --> fabs(X)
+  // (X >= +/-0.0) ?
X : -X --> fabs(X) + if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) && + match(FalseVal, m_FNeg(m_Specific(TrueVal))) && + match(FalseVal, m_Instruction(FNeg)) && + FNeg->hasNoNaNs() && FNeg->hasNoSignedZeros() && + (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_OGE || + Pred == FCmpInst::FCMP_UGT || Pred == FCmpInst::FCMP_UGE)) { + Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, FNeg); + return replaceInstUsesWith(SI, Fabs); + } + + // See if we are selecting two values based on a comparison of the two values. + if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal)) + if (Instruction *Result = foldSelectInstWithICmp(SI, ICI)) + return Result; + + if (Instruction *Add = foldAddSubSelect(SI, Builder)) + return Add; + + // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z)) + auto *TI = dyn_cast<Instruction>(TrueVal); + auto *FI = dyn_cast<Instruction>(FalseVal); + if (TI && FI && TI->getOpcode() == FI->getOpcode()) + if (Instruction *IV = foldSelectOpOp(SI, TI, FI)) + return IV; + + if (Instruction *I = foldSelectExtConst(SI)) + return I; + + // See if we can fold the select into one of our operands. + if (SelType->isIntOrIntVectorTy() || SelType->isFPOrFPVectorTy()) { + if (Instruction *FoldI = foldSelectIntoOp(SI, TrueVal, FalseVal)) + return FoldI; + + Value *LHS, *RHS; + Instruction::CastOps CastOp; + SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp); + auto SPF = SPR.Flavor; + if (SPF) { + Value *LHS2, *RHS2; + if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor) + if (Instruction *R = foldSPFofSPF(cast<Instruction>(LHS), SPF2, LHS2, + RHS2, SI, SPF, RHS)) + return R; + if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2).Flavor) + if (Instruction *R = foldSPFofSPF(cast<Instruction>(RHS), SPF2, LHS2, + RHS2, SI, SPF, LHS)) + return R; + // TODO. + // ABS(-X) -> ABS(X) + } + + if (SelectPatternResult::isMinOrMax(SPF)) { + // Canonicalize so that + // - type casts are outside select patterns. + // - float clamp is transformed to min/max pattern + + bool IsCastNeeded = LHS->getType() != SelType; + Value *CmpLHS = cast<CmpInst>(CondVal)->getOperand(0); + Value *CmpRHS = cast<CmpInst>(CondVal)->getOperand(1); + if (IsCastNeeded || + (LHS->getType()->isFPOrFPVectorTy() && + ((CmpLHS != LHS && CmpLHS != RHS) || + (CmpRHS != LHS && CmpRHS != RHS)))) { + CmpInst::Predicate MinMaxPred = getMinMaxPred(SPF, SPR.Ordered); + + Value *Cmp; + if (CmpInst::isIntPredicate(MinMaxPred)) { + Cmp = Builder.CreateICmp(MinMaxPred, LHS, RHS); + } else { + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + auto FMF = + cast<FPMathOperator>(SI.getCondition())->getFastMathFlags(); + Builder.setFastMathFlags(FMF); + Cmp = Builder.CreateFCmp(MinMaxPred, LHS, RHS); + } + + Value *NewSI = Builder.CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI); + if (!IsCastNeeded) + return replaceInstUsesWith(SI, NewSI); + + Value *NewCast = Builder.CreateCast(CastOp, NewSI, SelType); + return replaceInstUsesWith(SI, NewCast); + } + + // MAX(~a, ~b) -> ~MIN(a, b) + // MAX(~a, C) -> ~MIN(a, ~C) + // MIN(~a, ~b) -> ~MAX(a, b) + // MIN(~a, C) -> ~MAX(a, ~C) + auto moveNotAfterMinMax = [&](Value *X, Value *Y) -> Instruction * { + Value *A; + if (match(X, m_Not(m_Value(A))) && !X->hasNUsesOrMore(3) && + !isFreeToInvert(A, A->hasOneUse()) && + // Passing false to only consider m_Not and constants. 
+ isFreeToInvert(Y, false)) { + Value *B = Builder.CreateNot(Y); + Value *NewMinMax = createMinMax(Builder, getInverseMinMaxFlavor(SPF), + A, B); + // Copy the profile metadata. + if (MDNode *MD = SI.getMetadata(LLVMContext::MD_prof)) { + cast<SelectInst>(NewMinMax)->setMetadata(LLVMContext::MD_prof, MD); + // Swap the metadata if the operands are swapped. + if (X == SI.getFalseValue() && Y == SI.getTrueValue()) + cast<SelectInst>(NewMinMax)->swapProfMetadata(); + } + + return BinaryOperator::CreateNot(NewMinMax); + } + + return nullptr; + }; + + if (Instruction *I = moveNotAfterMinMax(LHS, RHS)) + return I; + if (Instruction *I = moveNotAfterMinMax(RHS, LHS)) + return I; + + if (Instruction *I = moveAddAfterMinMax(SPF, LHS, RHS, Builder)) + return I; + + if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder)) + return I; + if (Instruction *I = matchSAddSubSat(SI)) + return I; + } + } + + // Canonicalize select of FP values where NaN and -0.0 are not valid as + // minnum/maxnum intrinsics. + if (isa<FPMathOperator>(SI) && SI.hasNoNaNs() && SI.hasNoSignedZeros()) { + Value *X, *Y; + if (match(&SI, m_OrdFMax(m_Value(X), m_Value(Y)))) + return replaceInstUsesWith( + SI, Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, X, Y, &SI)); + + if (match(&SI, m_OrdFMin(m_Value(X), m_Value(Y)))) + return replaceInstUsesWith( + SI, Builder.CreateBinaryIntrinsic(Intrinsic::minnum, X, Y, &SI)); + } + + // See if we can fold the select into a phi node if the condition is a select. + if (auto *PN = dyn_cast<PHINode>(SI.getCondition())) + // The true/false values have to be live in the PHI predecessor's blocks. + if (canSelectOperandBeMappingIntoPredBlock(TrueVal, SI) && + canSelectOperandBeMappingIntoPredBlock(FalseVal, SI)) + if (Instruction *NV = foldOpIntoPhi(SI, PN)) + return NV; + + if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) { + if (TrueSI->getCondition()->getType() == CondVal->getType()) { + // select(C, select(C, a, b), c) -> select(C, a, c) + if (TrueSI->getCondition() == CondVal) { + if (SI.getTrueValue() == TrueSI->getTrueValue()) + return nullptr; + SI.setOperand(1, TrueSI->getTrueValue()); + return &SI; + } + // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b) + // We choose this as normal form to enable folding on the And and shortening + // paths for the values (this helps GetUnderlyingObjects() for example). + if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) { + Value *And = Builder.CreateAnd(CondVal, TrueSI->getCondition()); + SI.setOperand(0, And); + SI.setOperand(1, TrueSI->getTrueValue()); + return &SI; + } + } + } + if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) { + if (FalseSI->getCondition()->getType() == CondVal->getType()) { + // select(C, a, select(C, b, c)) -> select(C, a, c) + if (FalseSI->getCondition() == CondVal) { + if (SI.getFalseValue() == FalseSI->getFalseValue()) + return nullptr; + SI.setOperand(2, FalseSI->getFalseValue()); + return &SI; + } + // select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b) + if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) { + Value *Or = Builder.CreateOr(CondVal, FalseSI->getCondition()); + SI.setOperand(0, Or); + SI.setOperand(2, FalseSI->getFalseValue()); + return &SI; + } + } + } + + auto canMergeSelectThroughBinop = [](BinaryOperator *BO) { + // The select might be preventing a division by 0. 
+ switch (BO->getOpcode()) { + default: + return true; + case Instruction::SRem: + case Instruction::URem: + case Instruction::SDiv: + case Instruction::UDiv: + return false; + } + }; + + // Try to simplify a binop sandwiched between 2 selects with the same + // condition. + // select(C, binop(select(C, X, Y), W), Z) -> select(C, binop(X, W), Z) + BinaryOperator *TrueBO; + if (match(TrueVal, m_OneUse(m_BinOp(TrueBO))) && + canMergeSelectThroughBinop(TrueBO)) { + if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(0))) { + if (TrueBOSI->getCondition() == CondVal) { + TrueBO->setOperand(0, TrueBOSI->getTrueValue()); + Worklist.Add(TrueBO); + return &SI; + } + } + if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(1))) { + if (TrueBOSI->getCondition() == CondVal) { + TrueBO->setOperand(1, TrueBOSI->getTrueValue()); + Worklist.Add(TrueBO); + return &SI; + } + } + } + + // select(C, Z, binop(select(C, X, Y), W)) -> select(C, Z, binop(Y, W)) + BinaryOperator *FalseBO; + if (match(FalseVal, m_OneUse(m_BinOp(FalseBO))) && + canMergeSelectThroughBinop(FalseBO)) { + if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(0))) { + if (FalseBOSI->getCondition() == CondVal) { + FalseBO->setOperand(0, FalseBOSI->getFalseValue()); + Worklist.Add(FalseBO); + return &SI; + } + } + if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(1))) { + if (FalseBOSI->getCondition() == CondVal) { + FalseBO->setOperand(1, FalseBOSI->getFalseValue()); + Worklist.Add(FalseBO); + return &SI; + } + } + } + + Value *NotCond; + if (match(CondVal, m_Not(m_Value(NotCond)))) { + SI.setOperand(0, NotCond); + SI.setOperand(1, FalseVal); + SI.setOperand(2, TrueVal); + SI.swapProfMetadata(); + return &SI; + } + + if (VectorType *VecTy = dyn_cast<VectorType>(SelType)) { + unsigned VWidth = VecTy->getNumElements(); + APInt UndefElts(VWidth, 0); + APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); + if (Value *V = SimplifyDemandedVectorElts(&SI, AllOnesEltMask, UndefElts)) { + if (V != &SI) + return replaceInstUsesWith(SI, V); + return &SI; + } + } + + // If we can compute the condition, there's no need for a select. + // Like the above fold, we are attempting to reduce compile-time cost by + // putting this fold here with limitations rather than in InstSimplify. + // The motivation for this call into value tracking is to take advantage of + // the assumption cache, so make sure that is populated. + if (!CondVal->getType()->isVectorTy() && !AC.assumptions().empty()) { + KnownBits Known(1); + computeKnownBits(CondVal, Known, 0, &SI); + if (Known.One.isOneValue()) + return replaceInstUsesWith(SI, TrueVal); + if (Known.Zero.isOneValue()) + return replaceInstUsesWith(SI, FalseVal); + } + + if (Instruction *BitCastSel = foldSelectCmpBitcasts(SI, Builder)) + return BitCastSel; + + // Simplify selects that test the returned flag of cmpxchg instructions. 
+ if (Instruction *Select = foldSelectCmpXchg(SI)) + return Select; + + if (Instruction *Select = foldSelectBinOpIdentity(SI, TLI)) + return Select; + + if (Instruction *Rot = foldSelectRotate(SI)) + return Rot; + + return nullptr; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp new file mode 100644 index 000000000000..64294838644f --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -0,0 +1,1247 @@ +//===- InstCombineShifts.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the visitShl, visitLShr, and visitAShr functions. +// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "instcombine" + +// Given pattern: +// (x shiftopcode Q) shiftopcode K +// we should rewrite it as +// x shiftopcode (Q+K) iff (Q+K) u< bitwidth(x) +// This is valid for any shift, but they must be identical. +// +// AnalyzeForSignBitExtraction indicates that we will only analyze whether this +// pattern has any 2 right-shifts that sum to 1 less than original bit width. +Value *InstCombiner::reassociateShiftAmtsOfTwoSameDirectionShifts( + BinaryOperator *Sh0, const SimplifyQuery &SQ, + bool AnalyzeForSignBitExtraction) { + // Look for a shift of some instruction, ignore zext of shift amount if any. + Instruction *Sh0Op0; + Value *ShAmt0; + if (!match(Sh0, + m_Shift(m_Instruction(Sh0Op0), m_ZExtOrSelf(m_Value(ShAmt0))))) + return nullptr; + + // If there is a truncation between the two shifts, we must make note of it + // and look through it. The truncation imposes additional constraints on the + // transform. + Instruction *Sh1; + Value *Trunc = nullptr; + match(Sh0Op0, + m_CombineOr(m_CombineAnd(m_Trunc(m_Instruction(Sh1)), m_Value(Trunc)), + m_Instruction(Sh1))); + + // Inner shift: (x shiftopcode ShAmt1) + // Like with other shift, ignore zext of shift amount if any. + Value *X, *ShAmt1; + if (!match(Sh1, m_Shift(m_Value(X), m_ZExtOrSelf(m_Value(ShAmt1))))) + return nullptr; + + // We have two shift amounts from two different shifts. The types of those + // shift amounts may not match. If that's the case let's bailout now.. + if (ShAmt0->getType() != ShAmt1->getType()) + return nullptr; + + // We are only looking for signbit extraction if we have two right shifts. + bool HadTwoRightShifts = match(Sh0, m_Shr(m_Value(), m_Value())) && + match(Sh1, m_Shr(m_Value(), m_Value())); + // ... and if it's not two right-shifts, we know the answer already. + if (AnalyzeForSignBitExtraction && !HadTwoRightShifts) + return nullptr; + + // The shift opcodes must be identical, unless we are just checking whether + // this pattern can be interpreted as a sign-bit-extraction. 
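+  // For example (illustrative): lshr (lshr i32 %x, 3), 5 reassociates to
+  // lshr i32 %x, 8 because 3 + 5 u< 32, while lshr (ashr i32 %x, 16), 15 mixes
+  // opcodes and is only accepted when analyzing for a sign-bit extraction
+  // (16 + 15 == 31, i.e. exactly the sign bit).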
+  Instruction::BinaryOps ShiftOpcode = Sh0->getOpcode();
+  bool IdenticalShOpcodes = Sh0->getOpcode() == Sh1->getOpcode();
+  if (!IdenticalShOpcodes && !AnalyzeForSignBitExtraction)
+    return nullptr;
+
+  // If we saw truncation, we'll need to produce an extra instruction,
+  // and for that one of the operands of the shift must be one-use,
+  // unless of course we don't actually plan to produce any instructions here.
+  if (Trunc && !AnalyzeForSignBitExtraction &&
+      !match(Sh0, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
+    return nullptr;
+
+  // Can we fold (ShAmt0+ShAmt1) ?
+  auto *NewShAmt = dyn_cast_or_null<Constant>(
+      SimplifyAddInst(ShAmt0, ShAmt1, /*isNSW=*/false, /*isNUW=*/false,
+                      SQ.getWithInstruction(Sh0)));
+  if (!NewShAmt)
+    return nullptr; // Did not simplify.
+  unsigned NewShAmtBitWidth = NewShAmt->getType()->getScalarSizeInBits();
+  unsigned XBitWidth = X->getType()->getScalarSizeInBits();
+  // Is the new shift amount smaller than the bit width of inner/new shift?
+  if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT,
+                                          APInt(NewShAmtBitWidth, XBitWidth))))
+    return nullptr; // FIXME: could perform constant-folding.
+
+  // If there was a truncation, and we have a right-shift, we can only fold if
+  // we are left with the original sign bit. Likewise, if we were just checking
+  // that this is a sign-bit extraction, this is the place to check it.
+  // FIXME: zero shift amount is also legal here, but we can't *easily* check
+  // more than one predicate so it's not really worth it.
+  if (HadTwoRightShifts && (Trunc || AnalyzeForSignBitExtraction)) {
+    // If it's not a sign bit extraction, then we're done.
+    if (!match(NewShAmt,
+               m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
+                                  APInt(NewShAmtBitWidth, XBitWidth - 1))))
+      return nullptr;
+    // If it is, and that was the question, return the base value.
+    if (AnalyzeForSignBitExtraction)
+      return X;
+  }
+
+  assert(IdenticalShOpcodes && "Should not get here with different shifts.");
+
+  // All good, we can do this fold.
+  NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, X->getType());
+
+  BinaryOperator *NewShift = BinaryOperator::Create(ShiftOpcode, X, NewShAmt);
+
+  // The flags can only be propagated if there wasn't a trunc.
+  if (!Trunc) {
+    // If the pattern did not involve trunc, and both of the original shifts
+    // had the same flag set, preserve the flag.
+    if (ShiftOpcode == Instruction::BinaryOps::Shl) {
+      NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() &&
+                                     Sh1->hasNoUnsignedWrap());
+      NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() &&
+                                   Sh1->hasNoSignedWrap());
+    } else {
+      NewShift->setIsExact(Sh0->isExact() && Sh1->isExact());
+    }
+  }
+
+  Instruction *Ret = NewShift;
+  if (Trunc) {
+    Builder.Insert(NewShift);
+    Ret = CastInst::Create(Instruction::Trunc, NewShift, Sh0->getType());
+  }
+
+  return Ret;
+}
+
+// Try to replace `undef` constants in C with Replacement.
+static Constant *replaceUndefsWith(Constant *C, Constant *Replacement) {
+  if (C && match(C, m_Undef()))
+    return Replacement;
+
+  if (auto *CV = dyn_cast<ConstantVector>(C)) {
+    llvm::SmallVector<Constant *, 32> NewOps(CV->getNumOperands());
+    for (unsigned i = 0, NumElts = NewOps.size(); i != NumElts; ++i) {
+      Constant *EltC = CV->getOperand(i);
+      NewOps[i] = EltC && match(EltC, m_Undef()) ? Replacement : EltC;
+    }
+    return ConstantVector::get(NewOps);
+  }
+
+  // Don't know how to deal with this constant.
+  return C;
+}
+
+// If we have some pattern that leaves only some low bits set, and then performs
+// a left-shift of those bits, we can omit the mask as long as none of the bits
+// that remain after the final shift are modified by the mask.
+//
+// There are many variants to this pattern:
+// a) (x & ((1 << MaskShAmt) - 1)) << ShiftShAmt
+// b) (x & (~(-1 << MaskShAmt))) << ShiftShAmt
+// c) (x & (-1 >> MaskShAmt)) << ShiftShAmt
+// d) (x & ((-1 << MaskShAmt) >> MaskShAmt)) << ShiftShAmt
+// e) ((x << MaskShAmt) l>> MaskShAmt) << ShiftShAmt
+// f) ((x << MaskShAmt) a>> MaskShAmt) << ShiftShAmt
+// All these patterns can be simplified to just:
+// x << ShiftShAmt
+// iff:
+// a,b) (MaskShAmt+ShiftShAmt) u>= bitwidth(x)
+// c,d,e,f) (ShiftShAmt-MaskShAmt) s>= 0 (i.e. ShiftShAmt u>= MaskShAmt)
+static Instruction *
+dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
+                                     const SimplifyQuery &Q,
+                                     InstCombiner::BuilderTy &Builder) {
+  assert(OuterShift->getOpcode() == Instruction::BinaryOps::Shl &&
+         "The input must be 'shl'!");
+
+  Value *Masked, *ShiftShAmt;
+  match(OuterShift, m_Shift(m_Value(Masked), m_Value(ShiftShAmt)));
+
+  Type *NarrowestTy = OuterShift->getType();
+  Type *WidestTy = Masked->getType();
+  // The mask must be computed in a type twice as wide to ensure
+  // that no bits are lost if the sum-of-shifts is wider than the base type.
+  Type *ExtendedTy = WidestTy->getExtendedType();
+
+  Value *MaskShAmt;
+
+  // ((1 << MaskShAmt) - 1)
+  auto MaskA = m_Add(m_Shl(m_One(), m_Value(MaskShAmt)), m_AllOnes());
+  // (~(-1 << MaskShAmt))
+  auto MaskB = m_Xor(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_AllOnes());
+  // (-1 >> MaskShAmt)
+  auto MaskC = m_Shr(m_AllOnes(), m_Value(MaskShAmt));
+  // ((-1 << MaskShAmt) >> MaskShAmt)
+  auto MaskD =
+      m_Shr(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_Deferred(MaskShAmt));
+
+  Value *X;
+  Constant *NewMask;
+
+  if (match(Masked, m_c_And(m_CombineOr(MaskA, MaskB), m_Value(X)))) {
+    // Can we simplify (MaskShAmt+ShiftShAmt) ?
+    auto *SumOfShAmts = dyn_cast_or_null<Constant>(SimplifyAddInst(
+        MaskShAmt, ShiftShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q));
+    if (!SumOfShAmts)
+      return nullptr; // Did not simplify.
+    // In this pattern SumOfShAmts correlates with the number of low bits
+    // that shall remain in the root value (OuterShift).
+
+    // An extend of an undef value becomes zero because the high bits are never
+    // completely unknown. Replace the `undef` shift amounts with final
+    // shift bitwidth to ensure that the value remains undef when creating the
+    // subsequent shift op.
+    SumOfShAmts = replaceUndefsWith(
+        SumOfShAmts, ConstantInt::get(SumOfShAmts->getType()->getScalarType(),
+                                      ExtendedTy->getScalarSizeInBits()));
+    auto *ExtendedSumOfShAmts = ConstantExpr::getZExt(SumOfShAmts, ExtendedTy);
+    // And compute the mask as usual: ~(-1 << (SumOfShAmts))
+    auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy);
+    auto *ExtendedInvertedMask =
+        ConstantExpr::getShl(ExtendedAllOnes, ExtendedSumOfShAmts);
+    NewMask = ConstantExpr::getNot(ExtendedInvertedMask);
+  } else if (match(Masked, m_c_And(m_CombineOr(MaskC, MaskD), m_Value(X))) ||
+             match(Masked, m_Shr(m_Shl(m_Value(X), m_Value(MaskShAmt)),
+                                 m_Deferred(MaskShAmt)))) {
+    // Can we simplify (ShiftShAmt-MaskShAmt) ?
+    auto *ShAmtsDiff = dyn_cast_or_null<Constant>(SimplifySubInst(
+        ShiftShAmt, MaskShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q));
+    if (!ShAmtsDiff)
+      return nullptr; // Did not simplify.
+ // In this pattern ShAmtsDiff correlates with the number of high bits that + // shall be unset in the root value (OuterShift). + + // An extend of an undef value becomes zero because the high bits are never + // completely unknown. Replace the the `undef` shift amounts with negated + // bitwidth of innermost shift to ensure that the value remains undef when + // creating the subsequent shift op. + unsigned WidestTyBitWidth = WidestTy->getScalarSizeInBits(); + ShAmtsDiff = replaceUndefsWith( + ShAmtsDiff, ConstantInt::get(ShAmtsDiff->getType()->getScalarType(), + -WidestTyBitWidth)); + auto *ExtendedNumHighBitsToClear = ConstantExpr::getZExt( + ConstantExpr::getSub(ConstantInt::get(ShAmtsDiff->getType(), + WidestTyBitWidth, + /*isSigned=*/false), + ShAmtsDiff), + ExtendedTy); + // And compute the mask as usual: (-1 l>> (NumHighBitsToClear)) + auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy); + NewMask = + ConstantExpr::getLShr(ExtendedAllOnes, ExtendedNumHighBitsToClear); + } else + return nullptr; // Don't know anything about this pattern. + + NewMask = ConstantExpr::getTrunc(NewMask, NarrowestTy); + + // Does this mask has any unset bits? If not then we can just not apply it. + bool NeedMask = !match(NewMask, m_AllOnes()); + + // If we need to apply a mask, there are several more restrictions we have. + if (NeedMask) { + // The old masking instruction must go away. + if (!Masked->hasOneUse()) + return nullptr; + // The original "masking" instruction must not have been`ashr`. + if (match(Masked, m_AShr(m_Value(), m_Value()))) + return nullptr; + } + + // No 'NUW'/'NSW'! We no longer know that we won't shift-out non-0 bits. + auto *NewShift = BinaryOperator::Create(OuterShift->getOpcode(), X, + OuterShift->getOperand(1)); + + if (!NeedMask) + return NewShift; + + Builder.Insert(NewShift); + return BinaryOperator::Create(Instruction::And, NewShift, NewMask); +} + +Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + assert(Op0->getType() == Op1->getType()); + + // If the shift amount is a one-use `sext`, we can demote it to `zext`. + Value *Y; + if (match(Op1, m_OneUse(m_SExt(m_Value(Y))))) { + Value *NewExt = Builder.CreateZExt(Y, I.getType(), Op1->getName()); + return BinaryOperator::Create(I.getOpcode(), Op0, NewExt); + } + + // See if we can fold away this shift. + if (SimplifyDemandedInstructionBits(I)) + return &I; + + // Try to fold constant and into select arguments. + if (isa<Constant>(Op0)) + if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) + if (Instruction *R = FoldOpIntoSelect(I, SI)) + return R; + + if (Constant *CUI = dyn_cast<Constant>(Op1)) + if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I)) + return Res; + + if (auto *NewShift = cast_or_null<Instruction>( + reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ))) + return NewShift; + + // (C1 shift (A add C2)) -> (C1 shift C2) shift A) + // iff A and C2 are both positive. + Value *A; + Constant *C; + if (match(Op0, m_Constant()) && match(Op1, m_Add(m_Value(A), m_Constant(C)))) + if (isKnownNonNegative(A, DL, 0, &AC, &I, &DT) && + isKnownNonNegative(C, DL, 0, &AC, &I, &DT)) + return BinaryOperator::Create( + I.getOpcode(), Builder.CreateBinOp(I.getOpcode(), Op0, C), A); + + // X shift (A srem B) -> X shift (A and B-1) iff B is a power of 2. + // Because shifts by negative values (which could occur if A were negative) + // are undefined. 
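+  // (e.g. with B == 8: A srem 8 lies in [-7, 7] while A & 7 lies in [0, 7];
+  //  they differ only when the remainder is negative, and in that case the
+  //  original shift amount would have been negative, i.e. the shift was
+  //  undefined anyway.)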
+ const APInt *B; + if (Op1->hasOneUse() && match(Op1, m_SRem(m_Value(A), m_Power2(B)))) { + // FIXME: Should this get moved into SimplifyDemandedBits by saying we don't + // demand the sign bit (and many others) here?? + Value *Rem = Builder.CreateAnd(A, ConstantInt::get(I.getType(), *B - 1), + Op1->getName()); + I.setOperand(1, Rem); + return &I; + } + + return nullptr; +} + +/// Return true if we can simplify two logical (either left or right) shifts +/// that have constant shift amounts: OuterShift (InnerShift X, C1), C2. +static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl, + Instruction *InnerShift, InstCombiner &IC, + Instruction *CxtI) { + assert(InnerShift->isLogicalShift() && "Unexpected instruction type"); + + // We need constant scalar or constant splat shifts. + const APInt *InnerShiftConst; + if (!match(InnerShift->getOperand(1), m_APInt(InnerShiftConst))) + return false; + + // Two logical shifts in the same direction: + // shl (shl X, C1), C2 --> shl X, C1 + C2 + // lshr (lshr X, C1), C2 --> lshr X, C1 + C2 + bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl; + if (IsInnerShl == IsOuterShl) + return true; + + // Equal shift amounts in opposite directions become bitwise 'and': + // lshr (shl X, C), C --> and X, C' + // shl (lshr X, C), C --> and X, C' + if (*InnerShiftConst == OuterShAmt) + return true; + + // If the 2nd shift is bigger than the 1st, we can fold: + // lshr (shl X, C1), C2 --> and (shl X, C1 - C2), C3 + // shl (lshr X, C1), C2 --> and (lshr X, C1 - C2), C3 + // but it isn't profitable unless we know the and'd out bits are already zero. + // Also, check that the inner shift is valid (less than the type width) or + // we'll crash trying to produce the bit mask for the 'and'. + unsigned TypeWidth = InnerShift->getType()->getScalarSizeInBits(); + if (InnerShiftConst->ugt(OuterShAmt) && InnerShiftConst->ult(TypeWidth)) { + unsigned InnerShAmt = InnerShiftConst->getZExtValue(); + unsigned MaskShift = + IsInnerShl ? TypeWidth - InnerShAmt : InnerShAmt - OuterShAmt; + APInt Mask = APInt::getLowBitsSet(TypeWidth, OuterShAmt) << MaskShift; + if (IC.MaskedValueIsZero(InnerShift->getOperand(0), Mask, 0, CxtI)) + return true; + } + + return false; +} + +/// See if we can compute the specified value, but shifted logically to the left +/// or right by some number of bits. This should return true if the expression +/// can be computed for the same cost as the current expression tree. This is +/// used to eliminate extraneous shifting from things like: +/// %C = shl i128 %A, 64 +/// %D = shl i128 %B, 96 +/// %E = or i128 %C, %D +/// %F = lshr i128 %E, 64 +/// where the client will ask if E can be computed shifted right by 64-bits. If +/// this succeeds, getShiftedValue() will be called to produce the value. +static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift, + InstCombiner &IC, Instruction *CxtI) { + // We can always evaluate constants shifted. + if (isa<Constant>(V)) + return true; + + Instruction *I = dyn_cast<Instruction>(V); + if (!I) return false; + + // If this is the opposite shift, we can directly reuse the input of the shift + // if the needed bits are already zero in the input. This allows us to reuse + // the value which means that we don't care if the shift has multiple uses. + // TODO: Handle opposite shift by exact value. 
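+  // (e.g. asked whether lshr X, 3 can be evaluated shifted left by 3: that is
+  //  just X with its low 3 bits cleared, so if those bits are already known
+  //  zero we could reuse X itself, no matter how many uses the lshr has.)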
+ ConstantInt *CI = nullptr; + if ((IsLeftShift && match(I, m_LShr(m_Value(), m_ConstantInt(CI)))) || + (!IsLeftShift && match(I, m_Shl(m_Value(), m_ConstantInt(CI))))) { + if (CI->getValue() == NumBits) { + // TODO: Check that the input bits are already zero with MaskedValueIsZero +#if 0 + // If this is a truncate of a logical shr, we can truncate it to a smaller + // lshr iff we know that the bits we would otherwise be shifting in are + // already zeros. + uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits(); + uint32_t BitWidth = Ty->getScalarSizeInBits(); + if (MaskedValueIsZero(I->getOperand(0), + APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) && + CI->getLimitedValue(BitWidth) < BitWidth) { + return CanEvaluateTruncated(I->getOperand(0), Ty); + } +#endif + + } + } + + // We can't mutate something that has multiple uses: doing so would + // require duplicating the instruction in general, which isn't profitable. + if (!I->hasOneUse()) return false; + + switch (I->getOpcode()) { + default: return false; + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + // Bitwise operators can all arbitrarily be arbitrarily evaluated shifted. + return canEvaluateShifted(I->getOperand(0), NumBits, IsLeftShift, IC, I) && + canEvaluateShifted(I->getOperand(1), NumBits, IsLeftShift, IC, I); + + case Instruction::Shl: + case Instruction::LShr: + return canEvaluateShiftedShift(NumBits, IsLeftShift, I, IC, CxtI); + + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(I); + Value *TrueVal = SI->getTrueValue(); + Value *FalseVal = SI->getFalseValue(); + return canEvaluateShifted(TrueVal, NumBits, IsLeftShift, IC, SI) && + canEvaluateShifted(FalseVal, NumBits, IsLeftShift, IC, SI); + } + case Instruction::PHI: { + // We can change a phi if we can change all operands. Note that we never + // get into trouble with cyclic PHIs here because we only consider + // instructions with a single use. + PHINode *PN = cast<PHINode>(I); + for (Value *IncValue : PN->incoming_values()) + if (!canEvaluateShifted(IncValue, NumBits, IsLeftShift, IC, PN)) + return false; + return true; + } + } +} + +/// Fold OuterShift (InnerShift X, C1), C2. +/// See canEvaluateShiftedShift() for the constraints on these instructions. +static Value *foldShiftedShift(BinaryOperator *InnerShift, unsigned OuterShAmt, + bool IsOuterShl, + InstCombiner::BuilderTy &Builder) { + bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl; + Type *ShType = InnerShift->getType(); + unsigned TypeWidth = ShType->getScalarSizeInBits(); + + // We only accept shifts-by-a-constant in canEvaluateShifted(). + const APInt *C1; + match(InnerShift->getOperand(1), m_APInt(C1)); + unsigned InnerShAmt = C1->getZExtValue(); + + // Change the shift amount and clear the appropriate IR flags. + auto NewInnerShift = [&](unsigned ShAmt) { + InnerShift->setOperand(1, ConstantInt::get(ShType, ShAmt)); + if (IsInnerShl) { + InnerShift->setHasNoUnsignedWrap(false); + InnerShift->setHasNoSignedWrap(false); + } else { + InnerShift->setIsExact(false); + } + return InnerShift; + }; + + // Two logical shifts in the same direction: + // shl (shl X, C1), C2 --> shl X, C1 + C2 + // lshr (lshr X, C1), C2 --> lshr X, C1 + C2 + if (IsInnerShl == IsOuterShl) { + // If this is an oversized composite shift, then unsigned shifts get 0. 
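+    // (e.g. i8: lshr (lshr X, 5), 4 shifts out all 8 bits, so the whole thing
+    //  folds to 0.)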
+ if (InnerShAmt + OuterShAmt >= TypeWidth) + return Constant::getNullValue(ShType); + + return NewInnerShift(InnerShAmt + OuterShAmt); + } + + // Equal shift amounts in opposite directions become bitwise 'and': + // lshr (shl X, C), C --> and X, C' + // shl (lshr X, C), C --> and X, C' + if (InnerShAmt == OuterShAmt) { + APInt Mask = IsInnerShl + ? APInt::getLowBitsSet(TypeWidth, TypeWidth - OuterShAmt) + : APInt::getHighBitsSet(TypeWidth, TypeWidth - OuterShAmt); + Value *And = Builder.CreateAnd(InnerShift->getOperand(0), + ConstantInt::get(ShType, Mask)); + if (auto *AndI = dyn_cast<Instruction>(And)) { + AndI->moveBefore(InnerShift); + AndI->takeName(InnerShift); + } + return And; + } + + assert(InnerShAmt > OuterShAmt && + "Unexpected opposite direction logical shift pair"); + + // In general, we would need an 'and' for this transform, but + // canEvaluateShiftedShift() guarantees that the masked-off bits are not used. + // lshr (shl X, C1), C2 --> shl X, C1 - C2 + // shl (lshr X, C1), C2 --> lshr X, C1 - C2 + return NewInnerShift(InnerShAmt - OuterShAmt); +} + +/// When canEvaluateShifted() returns true for an expression, this function +/// inserts the new computation that produces the shifted value. +static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, + InstCombiner &IC, const DataLayout &DL) { + // We can always evaluate constants shifted. + if (Constant *C = dyn_cast<Constant>(V)) { + if (isLeftShift) + V = IC.Builder.CreateShl(C, NumBits); + else + V = IC.Builder.CreateLShr(C, NumBits); + // If we got a constantexpr back, try to simplify it with TD info. + if (auto *C = dyn_cast<Constant>(V)) + if (auto *FoldedC = + ConstantFoldConstant(C, DL, &IC.getTargetLibraryInfo())) + V = FoldedC; + return V; + } + + Instruction *I = cast<Instruction>(V); + IC.Worklist.Add(I); + + switch (I->getOpcode()) { + default: llvm_unreachable("Inconsistency with CanEvaluateShifted"); + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + // Bitwise operators can all arbitrarily be arbitrarily evaluated shifted. + I->setOperand( + 0, getShiftedValue(I->getOperand(0), NumBits, isLeftShift, IC, DL)); + I->setOperand( + 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL)); + return I; + + case Instruction::Shl: + case Instruction::LShr: + return foldShiftedShift(cast<BinaryOperator>(I), NumBits, isLeftShift, + IC.Builder); + + case Instruction::Select: + I->setOperand( + 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL)); + I->setOperand( + 2, getShiftedValue(I->getOperand(2), NumBits, isLeftShift, IC, DL)); + return I; + case Instruction::PHI: { + // We can change a phi if we can change all operands. Note that we never + // get into trouble with cyclic PHIs here because we only consider + // instructions with a single use. + PHINode *PN = cast<PHINode>(I); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + PN->setIncomingValue(i, getShiftedValue(PN->getIncomingValue(i), NumBits, + isLeftShift, IC, DL)); + return PN; + } + } +} + +// If this is a bitwise operator or add with a constant RHS we might be able +// to pull it through a shift. +static bool canShiftBinOpWithConstantRHS(BinaryOperator &Shift, + BinaryOperator *BO) { + switch (BO->getOpcode()) { + default: + return false; // Do not perform transform! 
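+  // (Add distributes over shl only: (X + C) << S == (X << S) + (C << S) in
+  //  modular arithmetic, while e.g. i8 (1 + 1) >> 1 == 1 but
+  //  (1 >> 1) + (1 >> 1) == 0, so Add is rejected for the right shifts.)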
+ case Instruction::Add: + return Shift.getOpcode() == Instruction::Shl; + case Instruction::Or: + case Instruction::Xor: + case Instruction::And: + return true; + } +} + +Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1, + BinaryOperator &I) { + bool isLeftShift = I.getOpcode() == Instruction::Shl; + + const APInt *Op1C; + if (!match(Op1, m_APInt(Op1C))) + return nullptr; + + // See if we can propagate this shift into the input, this covers the trivial + // cast of lshr(shl(x,c1),c2) as well as other more complex cases. + if (I.getOpcode() != Instruction::AShr && + canEvaluateShifted(Op0, Op1C->getZExtValue(), isLeftShift, *this, &I)) { + LLVM_DEBUG( + dbgs() << "ICE: GetShiftedValue propagating shift through expression" + " to eliminate shift:\n IN: " + << *Op0 << "\n SH: " << I << "\n"); + + return replaceInstUsesWith( + I, getShiftedValue(Op0, Op1C->getZExtValue(), isLeftShift, *this, DL)); + } + + // See if we can simplify any instructions used by the instruction whose sole + // purpose is to compute bits we don't care about. + unsigned TypeBits = Op0->getType()->getScalarSizeInBits(); + + assert(!Op1C->uge(TypeBits) && + "Shift over the type width should have been removed already"); + + if (Instruction *FoldedShift = foldBinOpIntoSelectOrPhi(I)) + return FoldedShift; + + // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2)) + if (TruncInst *TI = dyn_cast<TruncInst>(Op0)) { + Instruction *TrOp = dyn_cast<Instruction>(TI->getOperand(0)); + // If 'shift2' is an ashr, we would have to get the sign bit into a funny + // place. Don't try to do this transformation in this case. Also, we + // require that the input operand is a shift-by-constant so that we have + // confidence that the shifts will get folded together. We could do this + // xform in more cases, but it is unlikely to be profitable. + if (TrOp && I.isLogicalShift() && TrOp->isShift() && + isa<ConstantInt>(TrOp->getOperand(1))) { + // Okay, we'll do this xform. Make the shift of shift. + Constant *ShAmt = + ConstantExpr::getZExt(cast<Constant>(Op1), TrOp->getType()); + // (shift2 (shift1 & 0x00FF), c2) + Value *NSh = Builder.CreateBinOp(I.getOpcode(), TrOp, ShAmt, I.getName()); + + // For logical shifts, the truncation has the effect of making the high + // part of the register be zeros. Emulate this by inserting an AND to + // clear the top bits as needed. This 'and' will usually be zapped by + // other xforms later if dead. + unsigned SrcSize = TrOp->getType()->getScalarSizeInBits(); + unsigned DstSize = TI->getType()->getScalarSizeInBits(); + APInt MaskV(APInt::getLowBitsSet(SrcSize, DstSize)); + + // The mask we constructed says what the trunc would do if occurring + // between the shifts. We want to know the effect *after* the second + // shift. We know that it is a logical shift by a constant, so adjust the + // mask as appropriate. + if (I.getOpcode() == Instruction::Shl) + MaskV <<= Op1C->getZExtValue(); + else { + assert(I.getOpcode() == Instruction::LShr && "Unknown logical shift"); + MaskV.lshrInPlace(Op1C->getZExtValue()); + } + + // shift1 & 0x00FF + Value *And = Builder.CreateAnd(NSh, + ConstantInt::get(I.getContext(), MaskV), + TI->getName()); + + // Return the value truncated to the interesting size. 
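+      // (Worked instance of the whole fold: shl (trunc (lshr i32 X, 8) to i16), 4
+      //  becomes trunc (and (shl (lshr i32 X, 8), 4), 0xFFFF0) to i16 -- the
+      //  shift is done in i32 and the 'and' models the bits the old trunc
+      //  dropped.)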
+ return new TruncInst(And, I.getType()); + } + } + + if (Op0->hasOneUse()) { + if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) { + // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C) + Value *V1, *V2; + ConstantInt *CC; + switch (Op0BO->getOpcode()) { + default: break; + case Instruction::Add: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // These operators commute. + // Turn (Y + (X >> C)) << C -> (X + (Y << C)) & (~0 << C) + if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() && + match(Op0BO->getOperand(1), m_Shr(m_Value(V1), + m_Specific(Op1)))) { + Value *YS = // (Y << C) + Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName()); + // (X + (Y << C)) + Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), YS, V1, + Op0BO->getOperand(1)->getName()); + unsigned Op1Val = Op1C->getLimitedValue(TypeBits); + + APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val); + Constant *Mask = ConstantInt::get(I.getContext(), Bits); + if (VectorType *VT = dyn_cast<VectorType>(X->getType())) + Mask = ConstantVector::getSplat(VT->getNumElements(), Mask); + return BinaryOperator::CreateAnd(X, Mask); + } + + // Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C)) + Value *Op0BOOp1 = Op0BO->getOperand(1); + if (isLeftShift && Op0BOOp1->hasOneUse() && + match(Op0BOOp1, + m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))), + m_ConstantInt(CC)))) { + Value *YS = // (Y << C) + Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName()); + // X & (CC << C) + Value *XM = Builder.CreateAnd(V1, ConstantExpr::getShl(CC, Op1), + V1->getName()+".mask"); + return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM); + } + LLVM_FALLTHROUGH; + } + + case Instruction::Sub: { + // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C) + if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() && + match(Op0BO->getOperand(0), m_Shr(m_Value(V1), + m_Specific(Op1)))) { + Value *YS = // (Y << C) + Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName()); + // (X + (Y << C)) + Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), V1, YS, + Op0BO->getOperand(0)->getName()); + unsigned Op1Val = Op1C->getLimitedValue(TypeBits); + + APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val); + Constant *Mask = ConstantInt::get(I.getContext(), Bits); + if (VectorType *VT = dyn_cast<VectorType>(X->getType())) + Mask = ConstantVector::getSplat(VT->getNumElements(), Mask); + return BinaryOperator::CreateAnd(X, Mask); + } + + // Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C) + if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() && + match(Op0BO->getOperand(0), + m_And(m_OneUse(m_Shr(m_Value(V1), m_Value(V2))), + m_ConstantInt(CC))) && V2 == Op1) { + Value *YS = // (Y << C) + Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName()); + // X & (CC << C) + Value *XM = Builder.CreateAnd(V1, ConstantExpr::getShl(CC, Op1), + V1->getName()+".mask"); + + return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS); + } + + break; + } + } + + + // If the operand is a bitwise operator with a constant RHS, and the + // shift is the only use, we can pull it out of the shift. 
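+      // (e.g. (X ^ 12) >>u 2 --> (X >>u 2) ^ 3, and likewise for 'and'/'or';
+      //  'add' is accepted only for shl, per canShiftBinOpWithConstantRHS.)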
+ const APInt *Op0C; + if (match(Op0BO->getOperand(1), m_APInt(Op0C))) { + if (canShiftBinOpWithConstantRHS(I, Op0BO)) { + Constant *NewRHS = ConstantExpr::get(I.getOpcode(), + cast<Constant>(Op0BO->getOperand(1)), Op1); + + Value *NewShift = + Builder.CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1); + NewShift->takeName(Op0BO); + + return BinaryOperator::Create(Op0BO->getOpcode(), NewShift, + NewRHS); + } + } + + // If the operand is a subtract with a constant LHS, and the shift + // is the only use, we can pull it out of the shift. + // This folds (shl (sub C1, X), C2) -> (sub (C1 << C2), (shl X, C2)) + if (isLeftShift && Op0BO->getOpcode() == Instruction::Sub && + match(Op0BO->getOperand(0), m_APInt(Op0C))) { + Constant *NewRHS = ConstantExpr::get(I.getOpcode(), + cast<Constant>(Op0BO->getOperand(0)), Op1); + + Value *NewShift = Builder.CreateShl(Op0BO->getOperand(1), Op1); + NewShift->takeName(Op0BO); + + return BinaryOperator::CreateSub(NewRHS, NewShift); + } + } + + // If we have a select that conditionally executes some binary operator, + // see if we can pull it the select and operator through the shift. + // + // For example, turning: + // shl (select C, (add X, C1), X), C2 + // Into: + // Y = shl X, C2 + // select C, (add Y, C1 << C2), Y + Value *Cond; + BinaryOperator *TBO; + Value *FalseVal; + if (match(Op0, m_Select(m_Value(Cond), m_OneUse(m_BinOp(TBO)), + m_Value(FalseVal)))) { + const APInt *C; + if (!isa<Constant>(FalseVal) && TBO->getOperand(0) == FalseVal && + match(TBO->getOperand(1), m_APInt(C)) && + canShiftBinOpWithConstantRHS(I, TBO)) { + Constant *NewRHS = ConstantExpr::get(I.getOpcode(), + cast<Constant>(TBO->getOperand(1)), Op1); + + Value *NewShift = + Builder.CreateBinOp(I.getOpcode(), FalseVal, Op1); + Value *NewOp = Builder.CreateBinOp(TBO->getOpcode(), NewShift, + NewRHS); + return SelectInst::Create(Cond, NewOp, NewShift); + } + } + + BinaryOperator *FBO; + Value *TrueVal; + if (match(Op0, m_Select(m_Value(Cond), m_Value(TrueVal), + m_OneUse(m_BinOp(FBO))))) { + const APInt *C; + if (!isa<Constant>(TrueVal) && FBO->getOperand(0) == TrueVal && + match(FBO->getOperand(1), m_APInt(C)) && + canShiftBinOpWithConstantRHS(I, FBO)) { + Constant *NewRHS = ConstantExpr::get(I.getOpcode(), + cast<Constant>(FBO->getOperand(1)), Op1); + + Value *NewShift = + Builder.CreateBinOp(I.getOpcode(), TrueVal, Op1); + Value *NewOp = Builder.CreateBinOp(FBO->getOpcode(), NewShift, + NewRHS); + return SelectInst::Create(Cond, NewShift, NewOp); + } + } + } + + return nullptr; +} + +Instruction *InstCombiner::visitShl(BinaryOperator &I) { + const SimplifyQuery Q = SQ.getWithInstruction(&I); + + if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1), + I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), Q)) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldVectorBinop(I)) + return X; + + if (Instruction *V = commonShiftTransforms(I)) + return V; + + if (Instruction *V = dropRedundantMaskingOfLeftShiftInput(&I, Q, Builder)) + return V; + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + Type *Ty = I.getType(); + unsigned BitWidth = Ty->getScalarSizeInBits(); + + const APInt *ShAmtAPInt; + if (match(Op1, m_APInt(ShAmtAPInt))) { + unsigned ShAmt = ShAmtAPInt->getZExtValue(); + + // shl (zext X), ShAmt --> zext (shl X, ShAmt) + // This is only valid if X would have zeros shifted out. 
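+    // (e.g. shl (zext i8 X to i32), 2 --> zext (shl i8 X, 2) to i32, but only
+    //  if the top 2 bits of X are known zero; otherwise the narrow shl would
+    //  discard bits that the wide shl keeps.)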
+ Value *X; + if (match(Op0, m_OneUse(m_ZExt(m_Value(X))))) { + unsigned SrcWidth = X->getType()->getScalarSizeInBits(); + if (ShAmt < SrcWidth && + MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmt), 0, &I)) + return new ZExtInst(Builder.CreateShl(X, ShAmt), Ty); + } + + // (X >> C) << C --> X & (-1 << C) + if (match(Op0, m_Shr(m_Value(X), m_Specific(Op1)))) { + APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)); + return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask)); + } + + // FIXME: we do not yet transform non-exact shr's. The backend (DAGCombine) + // needs a few fixes for the rotate pattern recognition first. + const APInt *ShOp1; + if (match(Op0, m_Exact(m_Shr(m_Value(X), m_APInt(ShOp1))))) { + unsigned ShrAmt = ShOp1->getZExtValue(); + if (ShrAmt < ShAmt) { + // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1) + Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShrAmt); + auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff); + NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); + NewShl->setHasNoSignedWrap(I.hasNoSignedWrap()); + return NewShl; + } + if (ShrAmt > ShAmt) { + // If C1 > C2: (X >>?exact C1) << C2 --> X >>?exact (C1 - C2) + Constant *ShiftDiff = ConstantInt::get(Ty, ShrAmt - ShAmt); + auto *NewShr = BinaryOperator::Create( + cast<BinaryOperator>(Op0)->getOpcode(), X, ShiftDiff); + NewShr->setIsExact(true); + return NewShr; + } + } + + if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1)))) { + unsigned AmtSum = ShAmt + ShOp1->getZExtValue(); + // Oversized shifts are simplified to zero in InstSimplify. + if (AmtSum < BitWidth) + // (X << C1) << C2 --> X << (C1 + C2) + return BinaryOperator::CreateShl(X, ConstantInt::get(Ty, AmtSum)); + } + + // If the shifted-out value is known-zero, then this is a NUW shift. + if (!I.hasNoUnsignedWrap() && + MaskedValueIsZero(Op0, APInt::getHighBitsSet(BitWidth, ShAmt), 0, &I)) { + I.setHasNoUnsignedWrap(); + return &I; + } + + // If the shifted-out value is all signbits, then this is a NSW shift. + if (!I.hasNoSignedWrap() && ComputeNumSignBits(Op0, 0, &I) > ShAmt) { + I.setHasNoSignedWrap(); + return &I; + } + } + + // Transform (x >> y) << y to x & (-1 << y) + // Valid for any type of right-shift. 
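+  // (e.g. i8: both ((x a>> 3) << 3) and ((x u>> 3) << 3) equal x with the low
+  //  3 bits cleared, i.e. x & (-1 << 3).)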
+ Value *X; + if (match(Op0, m_OneUse(m_Shr(m_Value(X), m_Specific(Op1))))) { + Constant *AllOnes = ConstantInt::getAllOnesValue(Ty); + Value *Mask = Builder.CreateShl(AllOnes, Op1); + return BinaryOperator::CreateAnd(Mask, X); + } + + Constant *C1; + if (match(Op1, m_Constant(C1))) { + Constant *C2; + Value *X; + // (C2 << X) << C1 --> (C2 << C1) << X + if (match(Op0, m_OneUse(m_Shl(m_Constant(C2), m_Value(X))))) + return BinaryOperator::CreateShl(ConstantExpr::getShl(C2, C1), X); + + // (X * C2) << C1 --> X * (C2 << C1) + if (match(Op0, m_Mul(m_Value(X), m_Constant(C2)))) + return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1)); + + // shl (zext i1 X), C1 --> select (X, 1 << C1, 0) + if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) { + auto *NewC = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C1); + return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty)); + } + } + + // (1 << (C - x)) -> ((1 << C) >> x) if C is bitwidth - 1 + if (match(Op0, m_One()) && + match(Op1, m_Sub(m_SpecificInt(BitWidth - 1), m_Value(X)))) + return BinaryOperator::CreateLShr( + ConstantInt::get(Ty, APInt::getSignMask(BitWidth)), X); + + return nullptr; +} + +Instruction *InstCombiner::visitLShr(BinaryOperator &I) { + if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldVectorBinop(I)) + return X; + + if (Instruction *R = commonShiftTransforms(I)) + return R; + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + Type *Ty = I.getType(); + const APInt *ShAmtAPInt; + if (match(Op1, m_APInt(ShAmtAPInt))) { + unsigned ShAmt = ShAmtAPInt->getZExtValue(); + unsigned BitWidth = Ty->getScalarSizeInBits(); + auto *II = dyn_cast<IntrinsicInst>(Op0); + if (II && isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt && + (II->getIntrinsicID() == Intrinsic::ctlz || + II->getIntrinsicID() == Intrinsic::cttz || + II->getIntrinsicID() == Intrinsic::ctpop)) { + // ctlz.i32(x)>>5 --> zext(x == 0) + // cttz.i32(x)>>5 --> zext(x == 0) + // ctpop.i32(x)>>5 --> zext(x == -1) + bool IsPop = II->getIntrinsicID() == Intrinsic::ctpop; + Constant *RHS = ConstantInt::getSigned(Ty, IsPop ? 
-1 : 0); + Value *Cmp = Builder.CreateICmpEQ(II->getArgOperand(0), RHS); + return new ZExtInst(Cmp, Ty); + } + + Value *X; + const APInt *ShOp1; + if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1))) && ShOp1->ult(BitWidth)) { + if (ShOp1->ult(ShAmt)) { + unsigned ShlAmt = ShOp1->getZExtValue(); + Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt); + if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) { + // (X <<nuw C1) >>u C2 --> X >>u (C2 - C1) + auto *NewLShr = BinaryOperator::CreateLShr(X, ShiftDiff); + NewLShr->setIsExact(I.isExact()); + return NewLShr; + } + // (X << C1) >>u C2 --> (X >>u (C2 - C1)) & (-1 >> C2) + Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact()); + APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt)); + return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask)); + } + if (ShOp1->ugt(ShAmt)) { + unsigned ShlAmt = ShOp1->getZExtValue(); + Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt); + if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) { + // (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2) + auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff); + NewShl->setHasNoUnsignedWrap(true); + return NewShl; + } + // (X << C1) >>u C2 --> X << (C1 - C2) & (-1 >> C2) + Value *NewShl = Builder.CreateShl(X, ShiftDiff); + APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt)); + return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask)); + } + assert(*ShOp1 == ShAmt); + // (X << C) >>u C --> X & (-1 >>u C) + APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt)); + return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask)); + } + + if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) && + (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) { + assert(ShAmt < X->getType()->getScalarSizeInBits() && + "Big shift not simplified to zero?"); + // lshr (zext iM X to iN), C --> zext (lshr X, C) to iN + Value *NewLShr = Builder.CreateLShr(X, ShAmt); + return new ZExtInst(NewLShr, Ty); + } + + if (match(Op0, m_SExt(m_Value(X))) && + (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) { + // Are we moving the sign bit to the low bit and widening with high zeros? + unsigned SrcTyBitWidth = X->getType()->getScalarSizeInBits(); + if (ShAmt == BitWidth - 1) { + // lshr (sext i1 X to iN), N-1 --> zext X to iN + if (SrcTyBitWidth == 1) + return new ZExtInst(X, Ty); + + // lshr (sext iM X to iN), N-1 --> zext (lshr X, M-1) to iN + if (Op0->hasOneUse()) { + Value *NewLShr = Builder.CreateLShr(X, SrcTyBitWidth - 1); + return new ZExtInst(NewLShr, Ty); + } + } + + // lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN + if (ShAmt == BitWidth - SrcTyBitWidth && Op0->hasOneUse()) { + // The new shift amount can't be more than the narrow source type. + unsigned NewShAmt = std::min(ShAmt, SrcTyBitWidth - 1); + Value *AShr = Builder.CreateAShr(X, NewShAmt); + return new ZExtInst(AShr, Ty); + } + } + + if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) { + unsigned AmtSum = ShAmt + ShOp1->getZExtValue(); + // Oversized shifts are simplified to zero in InstSimplify. + if (AmtSum < BitWidth) + // (X >>u C1) >>u C2 --> X >>u (C1 + C2) + return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum)); + } + + // If the shifted-out value is known-zero, then this is an exact shift. 
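+    // (e.g. lshr (and X, -8), 3 only discards bits already known to be zero,
+    //  so the shift can be marked 'exact'.)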
+ if (!I.isExact() && + MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) { + I.setIsExact(); + return &I; + } + } + + // Transform (x << y) >> y to x & (-1 >> y) + Value *X; + if (match(Op0, m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))))) { + Constant *AllOnes = ConstantInt::getAllOnesValue(Ty); + Value *Mask = Builder.CreateLShr(AllOnes, Op1); + return BinaryOperator::CreateAnd(Mask, X); + } + + return nullptr; +} + +Instruction * +InstCombiner::foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr) { + assert(OldAShr.getOpcode() == Instruction::AShr && + "Must be called with arithmetic right-shift instruction only."); + + // Check that constant C is a splat of the element-wise bitwidth of V. + auto BitWidthSplat = [](Constant *C, Value *V) { + return match( + C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(C->getType()->getScalarSizeInBits(), + V->getType()->getScalarSizeInBits()))); + }; + + // It should look like variable-length sign-extension on the outside: + // (Val << (bitwidth(Val)-Nbits)) a>> (bitwidth(Val)-Nbits) + Value *NBits; + Instruction *MaybeTrunc; + Constant *C1, *C2; + if (!match(&OldAShr, + m_AShr(m_Shl(m_Instruction(MaybeTrunc), + m_ZExtOrSelf(m_Sub(m_Constant(C1), + m_ZExtOrSelf(m_Value(NBits))))), + m_ZExtOrSelf(m_Sub(m_Constant(C2), + m_ZExtOrSelf(m_Deferred(NBits)))))) || + !BitWidthSplat(C1, &OldAShr) || !BitWidthSplat(C2, &OldAShr)) + return nullptr; + + // There may or may not be a truncation after outer two shifts. + Instruction *HighBitExtract; + match(MaybeTrunc, m_TruncOrSelf(m_Instruction(HighBitExtract))); + bool HadTrunc = MaybeTrunc != HighBitExtract; + + // And finally, the innermost part of the pattern must be a right-shift. + Value *X, *NumLowBitsToSkip; + if (!match(HighBitExtract, m_Shr(m_Value(X), m_Value(NumLowBitsToSkip)))) + return nullptr; + + // Said right-shift must extract high NBits bits - C0 must be it's bitwidth. + Constant *C0; + if (!match(NumLowBitsToSkip, + m_ZExtOrSelf( + m_Sub(m_Constant(C0), m_ZExtOrSelf(m_Specific(NBits))))) || + !BitWidthSplat(C0, HighBitExtract)) + return nullptr; + + // Since the NBits is identical for all shifts, if the outermost and + // innermost shifts are identical, then outermost shifts are redundant. + // If we had truncation, do keep it though. + if (HighBitExtract->getOpcode() == OldAShr.getOpcode()) + return replaceInstUsesWith(OldAShr, MaybeTrunc); + + // Else, if there was a truncation, then we need to ensure that one + // instruction will go away. + if (HadTrunc && !match(&OldAShr, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; + + // Finally, bypass two innermost shifts, and perform the outermost shift on + // the operands of the innermost shift. + Instruction *NewAShr = + BinaryOperator::Create(OldAShr.getOpcode(), X, NumLowBitsToSkip); + NewAShr->copyIRFlags(HighBitExtract); // We can preserve 'exact'-ness. 
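+  // (e.g. for i32: shl/ashr of (x l>> (32 - NBits)) by (32 - NBits)
+  //  sign-extends its low NBits, which is the same as computing
+  //  x a>> (32 - NBits) directly -- and that is what NewAShr is.)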
+ if (!HadTrunc) + return NewAShr; + + Builder.Insert(NewAShr); + return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType()); +} + +Instruction *InstCombiner::visitAShr(BinaryOperator &I) { + if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), + SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + if (Instruction *X = foldVectorBinop(I)) + return X; + + if (Instruction *R = commonShiftTransforms(I)) + return R; + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + Type *Ty = I.getType(); + unsigned BitWidth = Ty->getScalarSizeInBits(); + const APInt *ShAmtAPInt; + if (match(Op1, m_APInt(ShAmtAPInt)) && ShAmtAPInt->ult(BitWidth)) { + unsigned ShAmt = ShAmtAPInt->getZExtValue(); + + // If the shift amount equals the difference in width of the destination + // and source scalar types: + // ashr (shl (zext X), C), C --> sext X + Value *X; + if (match(Op0, m_Shl(m_ZExt(m_Value(X)), m_Specific(Op1))) && + ShAmt == BitWidth - X->getType()->getScalarSizeInBits()) + return new SExtInst(X, Ty); + + // We can't handle (X << C1) >>s C2. It shifts arbitrary bits in. However, + // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits. + const APInt *ShOp1; + if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1))) && + ShOp1->ult(BitWidth)) { + unsigned ShlAmt = ShOp1->getZExtValue(); + if (ShlAmt < ShAmt) { + // (X <<nsw C1) >>s C2 --> X >>s (C2 - C1) + Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt); + auto *NewAShr = BinaryOperator::CreateAShr(X, ShiftDiff); + NewAShr->setIsExact(I.isExact()); + return NewAShr; + } + if (ShlAmt > ShAmt) { + // (X <<nsw C1) >>s C2 --> X <<nsw (C1 - C2) + Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt); + auto *NewShl = BinaryOperator::Create(Instruction::Shl, X, ShiftDiff); + NewShl->setHasNoSignedWrap(true); + return NewShl; + } + } + + if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1))) && + ShOp1->ult(BitWidth)) { + unsigned AmtSum = ShAmt + ShOp1->getZExtValue(); + // Oversized arithmetic shifts replicate the sign bit. + AmtSum = std::min(AmtSum, BitWidth - 1); + // (X >>s C1) >>s C2 --> X >>s (C1 + C2) + return BinaryOperator::CreateAShr(X, ConstantInt::get(Ty, AmtSum)); + } + + if (match(Op0, m_OneUse(m_SExt(m_Value(X)))) && + (Ty->isVectorTy() || shouldChangeType(Ty, X->getType()))) { + // ashr (sext X), C --> sext (ashr X, C') + Type *SrcTy = X->getType(); + ShAmt = std::min(ShAmt, SrcTy->getScalarSizeInBits() - 1); + Value *NewSh = Builder.CreateAShr(X, ConstantInt::get(SrcTy, ShAmt)); + return new SExtInst(NewSh, Ty); + } + + // If the shifted-out value is known-zero, then this is an exact shift. + if (!I.isExact() && + MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) { + I.setIsExact(); + return &I; + } + } + + if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I)) + return R; + + // See if we can turn a signed shr into an unsigned shr. 
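+  // (If the sign bit of Op0 is known zero, ashr and lshr shift in the same
+  //  zero bits, so the unsigned form below is an equivalent, more canonical
+  //  replacement.)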
+ if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I)) + return BinaryOperator::CreateLShr(Op0, Op1); + + return nullptr; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp new file mode 100644 index 000000000000..d30ab8001897 --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -0,0 +1,1733 @@ +//===- InstCombineSimplifyDemanded.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains logic for simplifying instructions based on information +// about how they are used. +// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/KnownBits.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "instcombine" + +namespace { + +struct AMDGPUImageDMaskIntrinsic { + unsigned Intr; +}; + +#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL +#include "InstCombineTables.inc" + +} // end anonymous namespace + +/// Check to see if the specified operand of the specified instruction is a +/// constant integer. If so, check to see if there are any bits set in the +/// constant that are not demanded. If so, shrink the constant and return true. +static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo, + const APInt &Demanded) { + assert(I && "No instruction?"); + assert(OpNo < I->getNumOperands() && "Operand index too large"); + + // The operand must be a constant integer or splat integer. + Value *Op = I->getOperand(OpNo); + const APInt *C; + if (!match(Op, m_APInt(C))) + return false; + + // If there are no bits set that aren't demanded, nothing to do. + if (C->isSubsetOf(Demanded)) + return false; + + // This instruction is producing bits that are not demanded. Shrink the RHS. + I->setOperand(OpNo, ConstantInt::get(Op->getType(), *C & Demanded)); + + return true; +} + + + +/// Inst is an integer instruction that SimplifyDemandedBits knows about. See if +/// the instruction has any properties that allow us to simplify its operands. +bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) { + unsigned BitWidth = Inst.getType()->getScalarSizeInBits(); + KnownBits Known(BitWidth); + APInt DemandedMask(APInt::getAllOnesValue(BitWidth)); + + Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, Known, + 0, &Inst); + if (!V) return false; + if (V == &Inst) return true; + replaceInstUsesWith(Inst, V); + return true; +} + +/// This form of SimplifyDemandedBits simplifies the specified instruction +/// operand if possible, updating it in place. It returns true if it made any +/// change and false otherwise. +bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo, + const APInt &DemandedMask, + KnownBits &Known, + unsigned Depth) { + Use &U = I->getOperandUse(OpNo); + Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known, + Depth, I); + if (!NewVal) return false; + U = NewVal; + return true; +} + + +/// This function attempts to replace V with a simpler value based on the +/// demanded bits. 
When this function is called, it is known that only the bits +/// set in DemandedMask of the result of V are ever used downstream. +/// Consequently, depending on the mask and V, it may be possible to replace V +/// with a constant or one of its operands. In such cases, this function does +/// the replacement and returns true. In all other cases, it returns false after +/// analyzing the expression and setting KnownOne and known to be one in the +/// expression. Known.Zero contains all the bits that are known to be zero in +/// the expression. These are provided to potentially allow the caller (which +/// might recursively be SimplifyDemandedBits itself) to simplify the +/// expression. +/// Known.One and Known.Zero always follow the invariant that: +/// Known.One & Known.Zero == 0. +/// That is, a bit can't be both 1 and 0. Note that the bits in Known.One and +/// Known.Zero may only be accurate for those bits set in DemandedMask. Note +/// also that the bitwidth of V, DemandedMask, Known.Zero and Known.One must all +/// be the same. +/// +/// This returns null if it did not change anything and it permits no +/// simplification. This returns V itself if it did some simplification of V's +/// operands based on the information about what bits are demanded. This returns +/// some other non-null value if it found out that V is equal to another value +/// in the context where the specified bits are demanded, but not for all users. +Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, + KnownBits &Known, unsigned Depth, + Instruction *CxtI) { + assert(V != nullptr && "Null pointer of Value???"); + assert(Depth <= 6 && "Limit Search Depth"); + uint32_t BitWidth = DemandedMask.getBitWidth(); + Type *VTy = V->getType(); + assert( + (!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) && + Known.getBitWidth() == BitWidth && + "Value *V, DemandedMask and Known must have same BitWidth"); + + if (isa<Constant>(V)) { + computeKnownBits(V, Known, Depth, CxtI); + return nullptr; + } + + Known.resetAll(); + if (DemandedMask.isNullValue()) // Not demanding any bits from V. + return UndefValue::get(VTy); + + if (Depth == 6) // Limit search depth. + return nullptr; + + Instruction *I = dyn_cast<Instruction>(V); + if (!I) { + computeKnownBits(V, Known, Depth, CxtI); + return nullptr; // Only analyze instructions. + } + + // If there are multiple uses of this value and we aren't at the root, then + // we can't do any simplifications of the operands, because DemandedMask + // only reflects the bits demanded by *one* of the users. + if (Depth != 0 && !I->hasOneUse()) + return SimplifyMultipleUseDemandedBits(I, DemandedMask, Known, Depth, CxtI); + + KnownBits LHSKnown(BitWidth), RHSKnown(BitWidth); + + // If this is the root being simplified, allow it to have multiple uses, + // just set the DemandedMask to all bits so that we can try to simplify the + // operands. This allows visitTruncInst (for example) to simplify the + // operand of a trunc without duplicating all the logic below. + if (Depth == 0 && !V->hasOneUse()) + DemandedMask.setAllBits(); + + switch (I->getOpcode()) { + default: + computeKnownBits(I, Known, Depth, CxtI); + break; + case Instruction::And: { + // If either the LHS or the RHS are Zero, the result is zero. 
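+    // (e.g. if only the low byte is demanded and every demanded bit of the RHS
+    //  is known one, the 'and' cannot change a demanded bit, so the LHS is
+    //  returned directly; the subset checks below encode this.)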
+    if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+        SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown,
+                             Depth + 1))
+      return I;
+    assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+    assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+    // Output known-0 are known to be clear if zero in either the LHS | RHS.
+    APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero;
+    // Output known-1 bits are only known if set in both the LHS & RHS.
+    APInt IKnownOne = RHSKnown.One & LHSKnown.One;
+
+    // If the client is only demanding bits that we know, return the known
+    // constant.
+    if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+      return Constant::getIntegerValue(VTy, IKnownOne);
+
+    // If all of the demanded bits are known 1 on one side, return the other.
+    // These bits cannot contribute to the result of the 'and'.
+    if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
+      return I->getOperand(0);
+    if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
+      return I->getOperand(1);
+
+    // If the RHS is a constant, see if we can simplify it.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero))
+      return I;
+
+    Known.Zero = std::move(IKnownZero);
+    Known.One = std::move(IKnownOne);
+    break;
+  }
+  case Instruction::Or: {
+    // If either the LHS or the RHS are One, the result is One.
+    if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+        SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown,
+                             Depth + 1))
+      return I;
+    assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+    assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+    // Output known-0 bits are only known if clear in both the LHS & RHS.
+    APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero;
+    // Output known-1 are known to be set if set in either the LHS | RHS.
+    APInt IKnownOne = RHSKnown.One | LHSKnown.One;
+
+    // If the client is only demanding bits that we know, return the known
+    // constant.
+    if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+      return Constant::getIntegerValue(VTy, IKnownOne);
+
+    // If all of the demanded bits are known zero on one side, return the other.
+    // These bits cannot contribute to the result of the 'or'.
+    if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
+      return I->getOperand(0);
+    if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
+      return I->getOperand(1);
+
+    // If the RHS is a constant, see if we can simplify it.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask))
+      return I;
+
+    Known.Zero = std::move(IKnownZero);
+    Known.One = std::move(IKnownOne);
+    break;
+  }
+  case Instruction::Xor: {
+    if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+        SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1))
+      return I;
+    assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+    assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+    // Output known-0 bits are known if clear or set in both the LHS & RHS.
+    APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) |
+                       (RHSKnown.One & LHSKnown.One);
+    // Output known-1 are known to be set if set in only one of the LHS, RHS.
+    APInt IKnownOne = (RHSKnown.Zero & LHSKnown.One) |
+                      (RHSKnown.One & LHSKnown.Zero);
+
+    // If the client is only demanding bits that we know, return the known
+    // constant.
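+    // (e.g. LHS known to be ??10 and RHS known to be ??01 in the two low bits
+    //  make those bits of the xor known 11; if no higher bit is demanded, the
+    //  xor folds to that constant.)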
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) + return Constant::getIntegerValue(VTy, IKnownOne); + + // If all of the demanded bits are known zero on one side, return the other. + // These bits cannot contribute to the result of the 'xor'. + if (DemandedMask.isSubsetOf(RHSKnown.Zero)) + return I->getOperand(0); + if (DemandedMask.isSubsetOf(LHSKnown.Zero)) + return I->getOperand(1); + + // If all of the demanded bits are known to be zero on one side or the + // other, turn this into an *inclusive* or. + // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0 + if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)) { + Instruction *Or = + BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1), + I->getName()); + return InsertNewInstWith(Or, *I); + } + + // If all of the demanded bits on one side are known, and all of the set + // bits on that side are also known to be set on the other side, turn this + // into an AND, as we know the bits will be cleared. + // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2 + if (DemandedMask.isSubsetOf(RHSKnown.Zero|RHSKnown.One) && + RHSKnown.One.isSubsetOf(LHSKnown.One)) { + Constant *AndC = Constant::getIntegerValue(VTy, + ~RHSKnown.One & DemandedMask); + Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC); + return InsertNewInstWith(And, *I); + } + + // If the RHS is a constant, see if we can simplify it. + // FIXME: for XOR, we prefer to force bits to 1 if they will make a -1. + if (ShrinkDemandedConstant(I, 1, DemandedMask)) + return I; + + // If our LHS is an 'and' and if it has one use, and if any of the bits we + // are flipping are known to be set, then the xor is just resetting those + // bits to zero. We can just knock out bits from the 'and' and the 'xor', + // simplifying both of them. + if (Instruction *LHSInst = dyn_cast<Instruction>(I->getOperand(0))) + if (LHSInst->getOpcode() == Instruction::And && LHSInst->hasOneUse() && + isa<ConstantInt>(I->getOperand(1)) && + isa<ConstantInt>(LHSInst->getOperand(1)) && + (LHSKnown.One & RHSKnown.One & DemandedMask) != 0) { + ConstantInt *AndRHS = cast<ConstantInt>(LHSInst->getOperand(1)); + ConstantInt *XorRHS = cast<ConstantInt>(I->getOperand(1)); + APInt NewMask = ~(LHSKnown.One & RHSKnown.One & DemandedMask); + + Constant *AndC = + ConstantInt::get(I->getType(), NewMask & AndRHS->getValue()); + Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC); + InsertNewInstWith(NewAnd, *I); + + Constant *XorC = + ConstantInt::get(I->getType(), NewMask & XorRHS->getValue()); + Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC); + return InsertNewInstWith(NewXor, *I); + } + + // Output known-0 bits are known if clear or set in both the LHS & RHS. + Known.Zero = std::move(IKnownZero); + // Output known-1 are known to be set if set in only one of the LHS, RHS. + Known.One = std::move(IKnownOne); + break; + } + case Instruction::Select: { + Value *LHS, *RHS; + SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor; + if (SPF == SPF_UMAX) { + // UMax(A, C) == A if ... + // The lowest non-zero bit of DemandMask is higher than the highest + // non-zero bit of C. + const APInt *C; + unsigned CTZ = DemandedMask.countTrailingZeros(); + if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits()) + return LHS; + } else if (SPF == SPF_UMIN) { + // UMin(A, C) == A if ... + // The lowest non-zero bit of DemandMask is higher than the highest + // non-one bit of C. + // This comes from using DeMorgans on the above umax example. 
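+      // (e.g. i8, C == 0xF0: whenever A u> 0xF0 the result 0xF0 already agrees
+      //  with A on bits 7..4, so if only those bits are demanded, umin(A, 0xF0)
+      //  can be replaced by A.)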
+ const APInt *C; + unsigned CTZ = DemandedMask.countTrailingZeros(); + if (match(RHS, m_APInt(C)) && + CTZ >= C->getBitWidth() - C->countLeadingOnes()) + return LHS; + } + + // If this is a select as part of any other min/max pattern, don't simplify + // any further in case we break the structure. + if (SPF != SPF_UNKNOWN) + return nullptr; + + if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) || + SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1)) + return I; + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); + + // If the operands are constants, see if we can simplify them. + if (ShrinkDemandedConstant(I, 1, DemandedMask) || + ShrinkDemandedConstant(I, 2, DemandedMask)) + return I; + + // Only known if known in both the LHS and RHS. + Known.One = RHSKnown.One & LHSKnown.One; + Known.Zero = RHSKnown.Zero & LHSKnown.Zero; + break; + } + case Instruction::ZExt: + case Instruction::Trunc: { + unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); + + APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth); + KnownBits InputKnown(SrcBitWidth); + if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1)) + return I; + assert(InputKnown.getBitWidth() == SrcBitWidth && "Src width changed?"); + Known = InputKnown.zextOrTrunc(BitWidth, + true /* ExtendedBitsAreKnownZero */); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + break; + } + case Instruction::BitCast: + if (!I->getOperand(0)->getType()->isIntOrIntVectorTy()) + return nullptr; // vector->int or fp->int? + + if (VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) { + if (VectorType *SrcVTy = + dyn_cast<VectorType>(I->getOperand(0)->getType())) { + if (DstVTy->getNumElements() != SrcVTy->getNumElements()) + // Don't touch a bitcast between vectors of different element counts. + return nullptr; + } else + // Don't touch a scalar-to-vector bitcast. + return nullptr; + } else if (I->getOperand(0)->getType()->isVectorTy()) + // Don't touch a vector-to-scalar bitcast. + return nullptr; + + if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) + return I; + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + break; + case Instruction::SExt: { + // Compute the bits in the result that are not present in the input. + unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); + + APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth); + + // If any of the sign extended bits are demanded, we know that the sign + // bit is demanded. + if (DemandedMask.getActiveBits() > SrcBitWidth) + InputDemandedBits.setBit(SrcBitWidth-1); + + KnownBits InputKnown(SrcBitWidth); + if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1)) + return I; + + // If the input sign bit is known zero, or if the NewBits are not demanded + // convert this into a zero extension. + if (InputKnown.isNonNegative() || + DemandedMask.getActiveBits() <= SrcBitWidth) { + // Convert to ZExt cast. + CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName()); + return InsertNewInstWith(NewCast, *I); + } + + // If the sign bit of the input is known set or clear, then we know the + // top bits of the result. 
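+    // (e.g. sext i8 -> i32 with the input's bit 7 known zero makes bits 31..8
+    //  of the result known zero; a known-one sign bit likewise gives known-one
+    //  high bits.)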
+ Known = InputKnown.sext(BitWidth); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + break; + } + case Instruction::Add: + case Instruction::Sub: { + /// If the high-bits of an ADD/SUB are not demanded, then we do not care + /// about the high bits of the operands. + unsigned NLZ = DemandedMask.countLeadingZeros(); + // Right fill the mask of bits for this ADD/SUB to demand the most + // significant bit and all those below it. + APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ)); + if (ShrinkDemandedConstant(I, 0, DemandedFromOps) || + SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) || + ShrinkDemandedConstant(I, 1, DemandedFromOps) || + SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)) { + if (NLZ > 0) { + // Disable the nsw and nuw flags here: We can no longer guarantee that + // we won't wrap after simplification. Removing the nsw/nuw flags is + // legal here because the top bit is not demanded. + BinaryOperator &BinOP = *cast<BinaryOperator>(I); + BinOP.setHasNoSignedWrap(false); + BinOP.setHasNoUnsignedWrap(false); + } + return I; + } + + // If we are known to be adding/subtracting zeros to every bit below + // the highest demanded bit, we just return the other side. + if (DemandedFromOps.isSubsetOf(RHSKnown.Zero)) + return I->getOperand(0); + // We can't do this with the LHS for subtraction, unless we are only + // demanding the LSB. + if ((I->getOpcode() == Instruction::Add || + DemandedFromOps.isOneValue()) && + DemandedFromOps.isSubsetOf(LHSKnown.Zero)) + return I->getOperand(1); + + // Otherwise just compute the known bits of the result. + bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); + Known = KnownBits::computeForAddSub(I->getOpcode() == Instruction::Add, + NSW, LHSKnown, RHSKnown); + break; + } + case Instruction::Shl: { + const APInt *SA; + if (match(I->getOperand(1), m_APInt(SA))) { + const APInt *ShrAmt; + if (match(I->getOperand(0), m_Shr(m_Value(), m_APInt(ShrAmt)))) + if (Instruction *Shr = dyn_cast<Instruction>(I->getOperand(0))) + if (Value *R = simplifyShrShlDemandedBits(Shr, *ShrAmt, I, *SA, + DemandedMask, Known)) + return R; + + uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); + APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt)); + + // If the shift is NUW/NSW, then it does demand the high bits. + ShlOperator *IOp = cast<ShlOperator>(I); + if (IOp->hasNoSignedWrap()) + DemandedMaskIn.setHighBits(ShiftAmt+1); + else if (IOp->hasNoUnsignedWrap()) + DemandedMaskIn.setHighBits(ShiftAmt); + + if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) + return I; + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero <<= ShiftAmt; + Known.One <<= ShiftAmt; + // low bits known zero. + if (ShiftAmt) + Known.Zero.setLowBits(ShiftAmt); + } + break; + } + case Instruction::LShr: { + const APInt *SA; + if (match(I->getOperand(1), m_APInt(SA))) { + uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); + + // Unsigned shift right. + APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt)); + + // If the shift is exact, then it does demand the low bits (and knows that + // they are zero). + if (cast<LShrOperator>(I)->isExact()) + DemandedMaskIn.setLowBits(ShiftAmt); + + if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) + return I; + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero.lshrInPlace(ShiftAmt); + Known.One.lshrInPlace(ShiftAmt); + if (ShiftAmt) + Known.Zero.setHighBits(ShiftAmt); // high bits known zero. 
+ } + break; + } + case Instruction::AShr: { + // If this is an arithmetic shift right and only the low-bit is set, we can + // always convert this into a logical shr, even if the shift amount is + // variable. The low bit of the shift cannot be an input sign bit unless + // the shift amount is >= the size of the datatype, which is undefined. + if (DemandedMask.isOneValue()) { + // Perform the logical shift right. + Instruction *NewVal = BinaryOperator::CreateLShr( + I->getOperand(0), I->getOperand(1), I->getName()); + return InsertNewInstWith(NewVal, *I); + } + + // If the sign bit is the only bit demanded by this ashr, then there is no + // need to do it, the shift doesn't change the high bit. + if (DemandedMask.isSignMask()) + return I->getOperand(0); + + const APInt *SA; + if (match(I->getOperand(1), m_APInt(SA))) { + uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1); + + // Signed shift right. + APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt)); + // If any of the high bits are demanded, we should set the sign bit as + // demanded. + if (DemandedMask.countLeadingZeros() <= ShiftAmt) + DemandedMaskIn.setSignBit(); + + // If the shift is exact, then it does demand the low bits (and knows that + // they are zero). + if (cast<AShrOperator>(I)->isExact()) + DemandedMaskIn.setLowBits(ShiftAmt); + + if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) + return I; + + unsigned SignBits = ComputeNumSignBits(I->getOperand(0), Depth + 1, CxtI); + + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + // Compute the new bits that are at the top now plus sign bits. + APInt HighBits(APInt::getHighBitsSet( + BitWidth, std::min(SignBits + ShiftAmt - 1, BitWidth))); + Known.Zero.lshrInPlace(ShiftAmt); + Known.One.lshrInPlace(ShiftAmt); + + // If the input sign bit is known to be zero, or if none of the top bits + // are demanded, turn this into an unsigned shift right. + assert(BitWidth > ShiftAmt && "Shift amount not saturated?"); + if (Known.Zero[BitWidth-ShiftAmt-1] || + !DemandedMask.intersects(HighBits)) { + BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0), + I->getOperand(1)); + LShr->setIsExact(cast<BinaryOperator>(I)->isExact()); + return InsertNewInstWith(LShr, *I); + } else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one. + Known.One |= HighBits; + } + } + break; + } + case Instruction::UDiv: { + // UDiv doesn't demand low bits that are zero in the divisor. + const APInt *SA; + if (match(I->getOperand(1), m_APInt(SA))) { + // If the shift is exact, then it does demand the low bits. + if (cast<UDivOperator>(I)->isExact()) + break; + + // FIXME: Take the demanded mask of the result into account. + unsigned RHSTrailingZeros = SA->countTrailingZeros(); + APInt DemandedMaskIn = + APInt::getHighBitsSet(BitWidth, BitWidth - RHSTrailingZeros); + if (SimplifyDemandedBits(I, 0, DemandedMaskIn, LHSKnown, Depth + 1)) + return I; + + // Propagate zero bits from the input. + Known.Zero.setHighBits(std::min( + BitWidth, LHSKnown.Zero.countLeadingOnes() + RHSTrailingZeros)); + } + break; + } + case Instruction::SRem: + if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) { + // X % -1 demands all the bits because we don't want to introduce + // INT_MIN % -1 (== undef) by accident. 
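A standalone sketch (plain int32_t, not part of the patch) of the fact the power-of-two case below relies on: for RA == 2^k, X srem RA leaves the low k bits of X untouched, because X minus the remainder is a multiple of 2^k. X % -1 is treated as demanding all bits (see the comment just above) precisely because INT_MIN % -1 overflows, which is undefined behavior in C++.

#include <cassert>
#include <cstdint>

int main() {
  const int32_t RA = 8;                  // 2^3
  const uint32_t LowBits = RA - 1;       // mask of the low 3 bits
  const int32_t Vals[] = {37, -37, 6, -6, 0, -128};
  for (int32_t X : Vals) {
    int32_t Rem = X % RA;                // srem with a power-of-two divisor
    assert((uint32_t(Rem) & LowBits) == (uint32_t(X) & LowBits));
  }
  return 0;
}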
+ if (Rem->isMinusOne()) + break; + APInt RA = Rem->getValue().abs(); + if (RA.isPowerOf2()) { + if (DemandedMask.ult(RA)) // srem won't affect demanded bits + return I->getOperand(0); + + APInt LowBits = RA - 1; + APInt Mask2 = LowBits | APInt::getSignMask(BitWidth); + if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1)) + return I; + + // The low bits of LHS are unchanged by the srem. + Known.Zero = LHSKnown.Zero & LowBits; + Known.One = LHSKnown.One & LowBits; + + // If LHS is non-negative or has all low bits zero, then the upper bits + // are all zero. + if (LHSKnown.isNonNegative() || LowBits.isSubsetOf(LHSKnown.Zero)) + Known.Zero |= ~LowBits; + + // If LHS is negative and not all low bits are zero, then the upper bits + // are all one. + if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One)) + Known.One |= ~LowBits; + + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + break; + } + } + + // The sign bit is the LHS's sign bit, except when the result of the + // remainder is zero. + if (DemandedMask.isSignBitSet()) { + computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI); + // If it's known zero, our sign bit is also zero. + if (LHSKnown.isNonNegative()) + Known.makeNonNegative(); + } + break; + case Instruction::URem: { + KnownBits Known2(BitWidth); + APInt AllOnes = APInt::getAllOnesValue(BitWidth); + if (SimplifyDemandedBits(I, 0, AllOnes, Known2, Depth + 1) || + SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1)) + return I; + + unsigned Leaders = Known2.countMinLeadingZeros(); + Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask; + break; + } + case Instruction::Call: + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::bswap: { + // If the only bits demanded come from one byte of the bswap result, + // just shift the input byte into position to eliminate the bswap. + unsigned NLZ = DemandedMask.countLeadingZeros(); + unsigned NTZ = DemandedMask.countTrailingZeros(); + + // Round NTZ down to the next byte. If we have 11 trailing zeros, then + // we need all the bits down to bit 8. Likewise, round NLZ. If we + // have 14 leading zeros, round to 8. + NLZ &= ~7; + NTZ &= ~7; + // If we need exactly one byte, we can do this transformation. + if (BitWidth-NLZ-NTZ == 8) { + unsigned ResultBit = NTZ; + unsigned InputBit = BitWidth-NTZ-8; + + // Replace this with either a left or right shift to get the byte into + // the right place. + Instruction *NewVal; + if (InputBit > ResultBit) + NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0), + ConstantInt::get(I->getType(), InputBit-ResultBit)); + else + NewVal = BinaryOperator::CreateShl(II->getArgOperand(0), + ConstantInt::get(I->getType(), ResultBit-InputBit)); + NewVal->takeName(I); + return InsertNewInstWith(NewVal, *I); + } + + // TODO: Could compute known zero/one bits based on the input. + break; + } + case Intrinsic::fshr: + case Intrinsic::fshl: { + const APInt *SA; + if (!match(I->getOperand(2), m_APInt(SA))) + break; + + // Normalize to funnel shift left. APInt shifts of BitWidth are well- + // defined, so no need to special-case zero shifts here. 
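A standalone sketch (plain uint32_t, helper names hypothetical, not part of the patch) of the normalization below: for 0 < s < W, a funnel shift right by s equals a funnel shift left by W - s. The explicit s == 0 guards are needed here only because shifting a plain uint32_t by 32 is undefined in C++; APInt shifts of BitWidth are well-defined, which is why the code below needs no such special case.

#include <cassert>
#include <cstdint>

static uint32_t fshl(uint32_t Hi, uint32_t Lo, unsigned S) {
  S %= 32;
  return S ? (Hi << S) | (Lo >> (32 - S)) : Hi;
}
static uint32_t fshr(uint32_t Hi, uint32_t Lo, unsigned S) {
  S %= 32;
  return S ? (Lo >> S) | (Hi << (32 - S)) : Lo;
}

int main() {
  uint32_t Hi = 0x12345678, Lo = 0x9ABCDEF0;
  for (unsigned S = 1; S < 32; ++S)
    assert(fshr(Hi, Lo, S) == fshl(Hi, Lo, 32 - S));
  // The degenerate case: a zero shift just returns one of the operands.
  assert(fshl(Hi, Lo, 0) == Hi && fshr(Hi, Lo, 0) == Lo);
  return 0;
}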
+ uint64_t ShiftAmt = SA->urem(BitWidth); + if (II->getIntrinsicID() == Intrinsic::fshr) + ShiftAmt = BitWidth - ShiftAmt; + + APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt)); + APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt)); + if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) || + SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1)) + return I; + + Known.Zero = LHSKnown.Zero.shl(ShiftAmt) | + RHSKnown.Zero.lshr(BitWidth - ShiftAmt); + Known.One = LHSKnown.One.shl(ShiftAmt) | + RHSKnown.One.lshr(BitWidth - ShiftAmt); + break; + } + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_avx2_pmovmskb: { + // MOVMSK copies the vector elements' sign bits to the low bits + // and zeros the high bits. + unsigned ArgWidth; + if (II->getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { + ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. + } else { + auto Arg = II->getArgOperand(0); + auto ArgType = cast<VectorType>(Arg->getType()); + ArgWidth = ArgType->getNumElements(); + } + + // If we don't need any of low bits then return zero, + // we know that DemandedMask is non-zero already. + APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); + if (DemandedElts.isNullValue()) + return ConstantInt::getNullValue(VTy); + + // We know that the upper bits are set to zero. + Known.Zero.setBitsFrom(ArgWidth); + return nullptr; + } + case Intrinsic::x86_sse42_crc32_64_64: + Known.Zero.setBitsFrom(32); + return nullptr; + } + } + computeKnownBits(V, Known, Depth, CxtI); + break; + } + + // If the client is only demanding bits that we know, return the known + // constant. + if (DemandedMask.isSubsetOf(Known.Zero|Known.One)) + return Constant::getIntegerValue(VTy, Known.One); + return nullptr; +} + +/// Helper routine of SimplifyDemandedUseBits. It computes Known +/// bits. It also tries to handle simplifications that can be done based on +/// DemandedMask, but without modifying the Instruction. +Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I, + const APInt &DemandedMask, + KnownBits &Known, + unsigned Depth, + Instruction *CxtI) { + unsigned BitWidth = DemandedMask.getBitWidth(); + Type *ITy = I->getType(); + + KnownBits LHSKnown(BitWidth); + KnownBits RHSKnown(BitWidth); + + // Despite the fact that we can't simplify this instruction in all User's + // context, we can at least compute the known bits, and we can + // do simplifications that apply to *just* the one user if we know that + // this instruction has a simpler value in that context. + switch (I->getOpcode()) { + case Instruction::And: { + // If either the LHS or the RHS are Zero, the result is zero. + computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI); + computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, + CxtI); + + // Output known-0 are known to be clear if zero in either the LHS | RHS. + APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero; + // Output known-1 bits are only known if set in both the LHS & RHS. + APInt IKnownOne = RHSKnown.One & LHSKnown.One; + + // If the client is only demanding bits that we know, return the known + // constant. + if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) + return Constant::getIntegerValue(ITy, IKnownOne); + + // If all of the demanded bits are known 1 on one side, return the other. 
+ // These bits cannot contribute to the result of the 'and' in this + // context. + if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One)) + return I->getOperand(0); + if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One)) + return I->getOperand(1); + + Known.Zero = std::move(IKnownZero); + Known.One = std::move(IKnownOne); + break; + } + case Instruction::Or: { + // We can simplify (X|Y) -> X or Y in the user's context if we know that + // only bits from X or Y are demanded. + + // If either the LHS or the RHS are One, the result is One. + computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI); + computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, + CxtI); + + // Output known-0 bits are only known if clear in both the LHS & RHS. + APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero; + // Output known-1 are known to be set if set in either the LHS | RHS. + APInt IKnownOne = RHSKnown.One | LHSKnown.One; + + // If the client is only demanding bits that we know, return the known + // constant. + if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) + return Constant::getIntegerValue(ITy, IKnownOne); + + // If all of the demanded bits are known zero on one side, return the + // other. These bits cannot contribute to the result of the 'or' in this + // context. + if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero)) + return I->getOperand(0); + if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero)) + return I->getOperand(1); + + Known.Zero = std::move(IKnownZero); + Known.One = std::move(IKnownOne); + break; + } + case Instruction::Xor: { + // We can simplify (X^Y) -> X or Y in the user's context if we know that + // only bits from X or Y are demanded. + + computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI); + computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, + CxtI); + + // Output known-0 bits are known if clear or set in both the LHS & RHS. + APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) | + (RHSKnown.One & LHSKnown.One); + // Output known-1 are known to be set if set in only one of the LHS, RHS. + APInt IKnownOne = (RHSKnown.Zero & LHSKnown.One) | + (RHSKnown.One & LHSKnown.Zero); + + // If the client is only demanding bits that we know, return the known + // constant. + if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne)) + return Constant::getIntegerValue(ITy, IKnownOne); + + // If all of the demanded bits are known zero on one side, return the + // other. + if (DemandedMask.isSubsetOf(RHSKnown.Zero)) + return I->getOperand(0); + if (DemandedMask.isSubsetOf(LHSKnown.Zero)) + return I->getOperand(1); + + // Output known-0 bits are known if clear or set in both the LHS & RHS. + Known.Zero = std::move(IKnownZero); + // Output known-1 are known to be set if set in only one of the LHS, RHS. + Known.One = std::move(IKnownOne); + break; + } + default: + // Compute the Known bits to simplify things downstream. + computeKnownBits(I, Known, Depth, CxtI); + + // If this user is only demanding bits that we know, return the known + // constant. + if (DemandedMask.isSubsetOf(Known.Zero|Known.One)) + return Constant::getIntegerValue(ITy, Known.One); + + break; + } + + return nullptr; +} + + +/// Helper routine of SimplifyDemandedUseBits. It tries to simplify +/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into +/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign +/// of "C2-C1". +/// +/// Suppose E1 and E2 are generally different in bits S={bm, bm+1, +/// ..., bn}, without considering the specific value X is holding. 
+/// This transformation is legal iff one of following conditions is hold: +/// 1) All the bit in S are 0, in this case E1 == E2. +/// 2) We don't care those bits in S, per the input DemandedMask. +/// 3) Combination of 1) and 2). Some bits in S are 0, and we don't care the +/// rest bits. +/// +/// Currently we only test condition 2). +/// +/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was +/// not successful. +Value * +InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1, + Instruction *Shl, const APInt &ShlOp1, + const APInt &DemandedMask, + KnownBits &Known) { + if (!ShlOp1 || !ShrOp1) + return nullptr; // No-op. + + Value *VarX = Shr->getOperand(0); + Type *Ty = VarX->getType(); + unsigned BitWidth = Ty->getScalarSizeInBits(); + if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth)) + return nullptr; // Undef. + + unsigned ShlAmt = ShlOp1.getZExtValue(); + unsigned ShrAmt = ShrOp1.getZExtValue(); + + Known.One.clearAllBits(); + Known.Zero.setLowBits(ShlAmt - 1); + Known.Zero &= DemandedMask; + + APInt BitMask1(APInt::getAllOnesValue(BitWidth)); + APInt BitMask2(APInt::getAllOnesValue(BitWidth)); + + bool isLshr = (Shr->getOpcode() == Instruction::LShr); + BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) : + (BitMask1.ashr(ShrAmt) << ShlAmt); + + if (ShrAmt <= ShlAmt) { + BitMask2 <<= (ShlAmt - ShrAmt); + } else { + BitMask2 = isLshr ? BitMask2.lshr(ShrAmt - ShlAmt): + BitMask2.ashr(ShrAmt - ShlAmt); + } + + // Check if condition-2 (see the comment to this function) is satified. + if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)) { + if (ShrAmt == ShlAmt) + return VarX; + + if (!Shr->hasOneUse()) + return nullptr; + + BinaryOperator *New; + if (ShrAmt < ShlAmt) { + Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt); + New = BinaryOperator::CreateShl(VarX, Amt); + BinaryOperator *Orig = cast<BinaryOperator>(Shl); + New->setHasNoSignedWrap(Orig->hasNoSignedWrap()); + New->setHasNoUnsignedWrap(Orig->hasNoUnsignedWrap()); + } else { + Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt); + New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) : + BinaryOperator::CreateAShr(VarX, Amt); + if (cast<BinaryOperator>(Shr)->isExact()) + New->setIsExact(true); + } + + return InsertNewInstWith(New, *Shl); + } + + return nullptr; +} + +/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics. +/// +/// Note: This only supports non-TFE/LWE image intrinsic calls; those have +/// struct returns. +Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, + APInt DemandedElts, + int DMaskIdx) { + + // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported. + if (DMaskIdx < 0 && + II->getType()->getScalarSizeInBits() != 32 && + DemandedElts.getActiveBits() == 3) + return nullptr; + + unsigned VWidth = II->getType()->getVectorNumElements(); + if (VWidth == 1) + return nullptr; + + ConstantInt *NewDMask = nullptr; + + if (DMaskIdx < 0) { + // Pretend that a prefix of elements is demanded to simplify the code + // below. 
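A standalone sketch (plain unsigned, not part of the patch; activeBits is a stand-in for APInt::getActiveBits, i.e. one past the highest set bit) of the prefix trick below: (1 << ActiveBits) - 1 is the smallest all-ones prefix that still covers every demanded element.

#include <cassert>

static unsigned activeBits(unsigned M) {          // index of the highest set bit, plus one
  unsigned N = 0;
  while (M) { M >>= 1; ++N; }
  return N;
}

int main() {
  unsigned DemandedElts = 0b0101;                 // elements 0 and 2 demanded
  unsigned Prefix = (1u << activeBits(DemandedElts)) - 1;
  assert(Prefix == 0b0111);                       // pretend elements 0..2 are demanded
  assert((Prefix & DemandedElts) == DemandedElts);// the prefix covers the original set
  return 0;
}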
+ DemandedElts = (1 << DemandedElts.getActiveBits()) - 1; + } else { + ConstantInt *DMask = cast<ConstantInt>(II->getArgOperand(DMaskIdx)); + unsigned DMaskVal = DMask->getZExtValue() & 0xf; + + // Mask off values that are undefined because the dmask doesn't cover them + DemandedElts &= (1 << countPopulation(DMaskVal)) - 1; + + unsigned NewDMaskVal = 0; + unsigned OrigLoadIdx = 0; + for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) { + const unsigned Bit = 1 << SrcIdx; + if (!!(DMaskVal & Bit)) { + if (!!DemandedElts[OrigLoadIdx]) + NewDMaskVal |= Bit; + OrigLoadIdx++; + } + } + + if (DMaskVal != NewDMaskVal) + NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal); + } + + unsigned NewNumElts = DemandedElts.countPopulation(); + if (!NewNumElts) + return UndefValue::get(II->getType()); + + if (NewNumElts >= VWidth && DemandedElts.isMask()) { + if (NewDMask) + II->setArgOperand(DMaskIdx, NewDMask); + return nullptr; + } + + // Determine the overload types of the original intrinsic. + auto IID = II->getIntrinsicID(); + SmallVector<Intrinsic::IITDescriptor, 16> Table; + getIntrinsicInfoTableEntries(IID, Table); + ArrayRef<Intrinsic::IITDescriptor> TableRef = Table; + + // Validate function argument and return types, extracting overloaded types + // along the way. + FunctionType *FTy = II->getCalledFunction()->getFunctionType(); + SmallVector<Type *, 6> OverloadTys; + Intrinsic::matchIntrinsicSignature(FTy, TableRef, OverloadTys); + + Module *M = II->getParent()->getParent()->getParent(); + Type *EltTy = II->getType()->getVectorElementType(); + Type *NewTy = (NewNumElts == 1) ? EltTy : VectorType::get(EltTy, NewNumElts); + + OverloadTys[0] = NewTy; + Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys); + + SmallVector<Value *, 16> Args; + for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I) + Args.push_back(II->getArgOperand(I)); + + if (NewDMask) + Args[DMaskIdx] = NewDMask; + + IRBuilderBase::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(II); + + CallInst *NewCall = Builder.CreateCall(NewIntrin, Args); + NewCall->takeName(II); + NewCall->copyMetadata(*II); + + if (NewNumElts == 1) { + return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall, + DemandedElts.countTrailingZeros()); + } + + SmallVector<uint32_t, 8> EltMask; + unsigned NewLoadIdx = 0; + for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) { + if (!!DemandedElts[OrigLoadIdx]) + EltMask.push_back(NewLoadIdx++); + else + EltMask.push_back(NewNumElts); + } + + Value *Shuffle = + Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask); + + return Shuffle; +} + +/// The specified value produces a vector with any number of elements. +/// This method analyzes which elements of the operand are undef and returns +/// that information in UndefElts. +/// +/// DemandedElts contains the set of elements that are actually used by the +/// caller, and by default (AllowMultipleUsers equals false) the value is +/// simplified only if it has a single caller. If AllowMultipleUsers is set +/// to true, DemandedElts refers to the union of sets of elements that are +/// used by all callers. +/// +/// If the information about demanded elements can be used to simplify the +/// operation, the operation is simplified, then the resultant value is +/// returned. This returns null if no change was made. 
+Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, + APInt &UndefElts, + unsigned Depth, + bool AllowMultipleUsers) { + unsigned VWidth = V->getType()->getVectorNumElements(); + APInt EltMask(APInt::getAllOnesValue(VWidth)); + assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!"); + + if (isa<UndefValue>(V)) { + // If the entire vector is undefined, just return this info. + UndefElts = EltMask; + return nullptr; + } + + if (DemandedElts.isNullValue()) { // If nothing is demanded, provide undef. + UndefElts = EltMask; + return UndefValue::get(V->getType()); + } + + UndefElts = 0; + + if (auto *C = dyn_cast<Constant>(V)) { + // Check if this is identity. If so, return 0 since we are not simplifying + // anything. + if (DemandedElts.isAllOnesValue()) + return nullptr; + + Type *EltTy = cast<VectorType>(V->getType())->getElementType(); + Constant *Undef = UndefValue::get(EltTy); + SmallVector<Constant*, 16> Elts; + for (unsigned i = 0; i != VWidth; ++i) { + if (!DemandedElts[i]) { // If not demanded, set to undef. + Elts.push_back(Undef); + UndefElts.setBit(i); + continue; + } + + Constant *Elt = C->getAggregateElement(i); + if (!Elt) return nullptr; + + if (isa<UndefValue>(Elt)) { // Already undef. + Elts.push_back(Undef); + UndefElts.setBit(i); + } else { // Otherwise, defined. + Elts.push_back(Elt); + } + } + + // If we changed the constant, return it. + Constant *NewCV = ConstantVector::get(Elts); + return NewCV != C ? NewCV : nullptr; + } + + // Limit search depth. + if (Depth == 10) + return nullptr; + + if (!AllowMultipleUsers) { + // If multiple users are using the root value, proceed with + // simplification conservatively assuming that all elements + // are needed. + if (!V->hasOneUse()) { + // Quit if we find multiple users of a non-root value though. + // They'll be handled when it's their turn to be visited by + // the main instcombine process. + if (Depth != 0) + // TODO: Just compute the UndefElts information recursively. + return nullptr; + + // Conservatively assume that all elements are needed. + DemandedElts = EltMask; + } + } + + Instruction *I = dyn_cast<Instruction>(V); + if (!I) return nullptr; // Only analyze instructions. + + bool MadeChange = false; + auto simplifyAndSetOp = [&](Instruction *Inst, unsigned OpNum, + APInt Demanded, APInt &Undef) { + auto *II = dyn_cast<IntrinsicInst>(Inst); + Value *Op = II ? II->getArgOperand(OpNum) : Inst->getOperand(OpNum); + if (Value *V = SimplifyDemandedVectorElts(Op, Demanded, Undef, Depth + 1)) { + if (II) + II->setArgOperand(OpNum, V); + else + Inst->setOperand(OpNum, V); + MadeChange = true; + } + }; + + APInt UndefElts2(VWidth, 0); + APInt UndefElts3(VWidth, 0); + switch (I->getOpcode()) { + default: break; + + case Instruction::GetElementPtr: { + // The LangRef requires that struct geps have all constant indices. As + // such, we can't convert any operand to partial undef. + auto mayIndexStructType = [](GetElementPtrInst &GEP) { + for (auto I = gep_type_begin(GEP), E = gep_type_end(GEP); + I != E; I++) + if (I.isStruct()) + return true;; + return false; + }; + if (mayIndexStructType(cast<GetElementPtrInst>(*I))) + break; + + // Conservatively track the demanded elements back through any vector + // operands we may have. We know there must be at least one, or we + // wouldn't have a vector result to get here. Note that we intentionally + // merge the undef bits here since gepping with either an undef base or + // index results in undef. 
+ for (unsigned i = 0; i < I->getNumOperands(); i++) { + if (isa<UndefValue>(I->getOperand(i))) { + // If the entire vector is undefined, just return this info. + UndefElts = EltMask; + return nullptr; + } + if (I->getOperand(i)->getType()->isVectorTy()) { + APInt UndefEltsOp(VWidth, 0); + simplifyAndSetOp(I, i, DemandedElts, UndefEltsOp); + UndefElts |= UndefEltsOp; + } + } + + break; + } + case Instruction::InsertElement: { + // If this is a variable index, we don't know which element it overwrites. + // demand exactly the same input as we produce. + ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2)); + if (!Idx) { + // Note that we can't propagate undef elt info, because we don't know + // which elt is getting updated. + simplifyAndSetOp(I, 0, DemandedElts, UndefElts2); + break; + } + + // The element inserted overwrites whatever was there, so the input demanded + // set is simpler than the output set. + unsigned IdxNo = Idx->getZExtValue(); + APInt PreInsertDemandedElts = DemandedElts; + if (IdxNo < VWidth) + PreInsertDemandedElts.clearBit(IdxNo); + + simplifyAndSetOp(I, 0, PreInsertDemandedElts, UndefElts); + + // If this is inserting an element that isn't demanded, remove this + // insertelement. + if (IdxNo >= VWidth || !DemandedElts[IdxNo]) { + Worklist.Add(I); + return I->getOperand(0); + } + + // The inserted element is defined. + UndefElts.clearBit(IdxNo); + break; + } + case Instruction::ShuffleVector: { + ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I); + unsigned LHSVWidth = + Shuffle->getOperand(0)->getType()->getVectorNumElements(); + APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0); + for (unsigned i = 0; i < VWidth; i++) { + if (DemandedElts[i]) { + unsigned MaskVal = Shuffle->getMaskValue(i); + if (MaskVal != -1u) { + assert(MaskVal < LHSVWidth * 2 && + "shufflevector mask index out of range!"); + if (MaskVal < LHSVWidth) + LeftDemanded.setBit(MaskVal); + else + RightDemanded.setBit(MaskVal - LHSVWidth); + } + } + } + + APInt LHSUndefElts(LHSVWidth, 0); + simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts); + + APInt RHSUndefElts(LHSVWidth, 0); + simplifyAndSetOp(I, 1, RightDemanded, RHSUndefElts); + + bool NewUndefElts = false; + unsigned LHSIdx = -1u, LHSValIdx = -1u; + unsigned RHSIdx = -1u, RHSValIdx = -1u; + bool LHSUniform = true; + bool RHSUniform = true; + for (unsigned i = 0; i < VWidth; i++) { + unsigned MaskVal = Shuffle->getMaskValue(i); + if (MaskVal == -1u) { + UndefElts.setBit(i); + } else if (!DemandedElts[i]) { + NewUndefElts = true; + UndefElts.setBit(i); + } else if (MaskVal < LHSVWidth) { + if (LHSUndefElts[MaskVal]) { + NewUndefElts = true; + UndefElts.setBit(i); + } else { + LHSIdx = LHSIdx == -1u ? i : LHSVWidth; + LHSValIdx = LHSValIdx == -1u ? MaskVal : LHSVWidth; + LHSUniform = LHSUniform && (MaskVal == i); + } + } else { + if (RHSUndefElts[MaskVal - LHSVWidth]) { + NewUndefElts = true; + UndefElts.setBit(i); + } else { + RHSIdx = RHSIdx == -1u ? i : LHSVWidth; + RHSValIdx = RHSValIdx == -1u ? MaskVal - LHSVWidth : LHSVWidth; + RHSUniform = RHSUniform && (MaskVal - LHSVWidth == i); + } + } + } + + // Try to transform shuffle with constant vector and single element from + // this constant vector to single insertelement instruction. 
+ // shufflevector V, C, <v1, v2, .., ci, .., vm> -> + // insertelement V, C[ci], ci-n + if (LHSVWidth == Shuffle->getType()->getNumElements()) { + Value *Op = nullptr; + Constant *Value = nullptr; + unsigned Idx = -1u; + + // Find constant vector with the single element in shuffle (LHS or RHS). + if (LHSIdx < LHSVWidth && RHSUniform) { + if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) { + Op = Shuffle->getOperand(1); + Value = CV->getOperand(LHSValIdx); + Idx = LHSIdx; + } + } + if (RHSIdx < LHSVWidth && LHSUniform) { + if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) { + Op = Shuffle->getOperand(0); + Value = CV->getOperand(RHSValIdx); + Idx = RHSIdx; + } + } + // Found constant vector with single element - convert to insertelement. + if (Op && Value) { + Instruction *New = InsertElementInst::Create( + Op, Value, ConstantInt::get(Type::getInt32Ty(I->getContext()), Idx), + Shuffle->getName()); + InsertNewInstWith(New, *Shuffle); + return New; + } + } + if (NewUndefElts) { + // Add additional discovered undefs. + SmallVector<Constant*, 16> Elts; + for (unsigned i = 0; i < VWidth; ++i) { + if (UndefElts[i]) + Elts.push_back(UndefValue::get(Type::getInt32Ty(I->getContext()))); + else + Elts.push_back(ConstantInt::get(Type::getInt32Ty(I->getContext()), + Shuffle->getMaskValue(i))); + } + I->setOperand(2, ConstantVector::get(Elts)); + MadeChange = true; + } + break; + } + case Instruction::Select: { + // If this is a vector select, try to transform the select condition based + // on the current demanded elements. + SelectInst *Sel = cast<SelectInst>(I); + if (Sel->getCondition()->getType()->isVectorTy()) { + // TODO: We are not doing anything with UndefElts based on this call. + // It is overwritten below based on the other select operands. If an + // element of the select condition is known undef, then we are free to + // choose the output value from either arm of the select. If we know that + // one of those values is undef, then the output can be undef. + simplifyAndSetOp(I, 0, DemandedElts, UndefElts); + } + + // Next, see if we can transform the arms of the select. + APInt DemandedLHS(DemandedElts), DemandedRHS(DemandedElts); + if (auto *CV = dyn_cast<ConstantVector>(Sel->getCondition())) { + for (unsigned i = 0; i < VWidth; i++) { + // isNullValue() always returns false when called on a ConstantExpr. + // Skip constant expressions to avoid propagating incorrect information. + Constant *CElt = CV->getAggregateElement(i); + if (isa<ConstantExpr>(CElt)) + continue; + // TODO: If a select condition element is undef, we can demand from + // either side. If one side is known undef, choosing that side would + // propagate undef. + if (CElt->isNullValue()) + DemandedLHS.clearBit(i); + else + DemandedRHS.clearBit(i); + } + } + + simplifyAndSetOp(I, 1, DemandedLHS, UndefElts2); + simplifyAndSetOp(I, 2, DemandedRHS, UndefElts3); + + // Output elements are undefined if the element from each arm is undefined. + // TODO: This can be improved. See comment in select condition handling. + UndefElts = UndefElts2 & UndefElts3; + break; + } + case Instruction::BitCast: { + // Vector->vector casts only. + VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType()); + if (!VTy) break; + unsigned InVWidth = VTy->getNumElements(); + APInt InputDemandedElts(InVWidth, 0); + UndefElts2 = APInt(InVWidth, 0); + unsigned Ratio; + + if (VWidth == InVWidth) { + // If we are converting from <4 x i32> -> <4 x f32>, we demand the same + // elements as are demanded of us. 
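A standalone sketch (plain bitmasks, not part of the patch) of the demanded-element mapping in the multiple-of branches below: bitcasting <2 x i64> to <4 x i32> gives Ratio = 4 / 2 = 2, and output element OutIdx is carved out of input element OutIdx / Ratio, so demanding only output element 3 demands only input element 1.

#include <cassert>

int main() {
  const unsigned VWidth = 4, InVWidth = 2;        // <2 x i64> -> <4 x i32>
  const unsigned Ratio = VWidth / InVWidth;       // two output elements per input element
  unsigned DemandedElts = 0b1000;                 // only output element 3 is used
  unsigned InputDemandedElts = 0;
  for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
    if (DemandedElts & (1u << OutIdx))
      InputDemandedElts |= 1u << (OutIdx / Ratio);
  assert(InputDemandedElts == 0b10);              // input element 1 is demanded
  return 0;
}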
+ Ratio = 1; + InputDemandedElts = DemandedElts; + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an input element is live if any of the + // corresponding output elements are live. + Ratio = VWidth / InVWidth; + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) + if (DemandedElts[OutIdx]) + InputDemandedElts.setBit(OutIdx / Ratio); + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an input element is live if the + // corresponding output element is live. + Ratio = InVWidth / VWidth; + for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) + if (DemandedElts[InIdx / Ratio]) + InputDemandedElts.setBit(InIdx); + } else { + // Unsupported so far. + break; + } + + simplifyAndSetOp(I, 0, InputDemandedElts, UndefElts2); + + if (VWidth == InVWidth) { + UndefElts = UndefElts2; + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an output element is undef if the + // corresponding input element is undef. + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) + if (UndefElts2[OutIdx / Ratio]) + UndefElts.setBit(OutIdx); + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an output element is undef if all of the + // corresponding input elements are undef. + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio); + if (SubUndef.countPopulation() == Ratio) + UndefElts.setBit(OutIdx); + } + } else { + llvm_unreachable("Unimp"); + } + break; + } + case Instruction::FPTrunc: + case Instruction::FPExt: + simplifyAndSetOp(I, 0, DemandedElts, UndefElts); + break; + + case Instruction::Call: { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + if (!II) break; + switch (II->getIntrinsicID()) { + case Intrinsic::masked_gather: // fallthrough + case Intrinsic::masked_load: { + // Subtlety: If we load from a pointer, the pointer must be valid + // regardless of whether the element is demanded. Doing otherwise risks + // segfaults which didn't exist in the original program. + APInt DemandedPtrs(APInt::getAllOnesValue(VWidth)), + DemandedPassThrough(DemandedElts); + if (auto *CV = dyn_cast<ConstantVector>(II->getOperand(2))) + for (unsigned i = 0; i < VWidth; i++) { + Constant *CElt = CV->getAggregateElement(i); + if (CElt->isNullValue()) + DemandedPtrs.clearBit(i); + else if (CElt->isAllOnesValue()) + DemandedPassThrough.clearBit(i); + } + if (II->getIntrinsicID() == Intrinsic::masked_gather) + simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2); + simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3); + + // Output elements are undefined if the element from both sources are. + // TODO: can strengthen via mask as well. + UndefElts = UndefElts2 & UndefElts3; + break; + } + case Intrinsic::x86_xop_vfrcz_ss: + case Intrinsic::x86_xop_vfrcz_sd: + // The instructions for these intrinsics are speced to zero upper bits not + // pass them through like other scalar intrinsics. So we shouldn't just + // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. + // Instead we should return a zero vector. + if (!DemandedElts[0]) { + Worklist.Add(II); + return ConstantAggregateZero::get(II->getType()); + } + + // Only the lower element is used. 
+ DemandedElts = 1; + simplifyAndSetOp(II, 0, DemandedElts, UndefElts); + + // Only the lower element is undefined. The high elements are zero. + UndefElts = UndefElts[0]; + break; + + // Unary scalar-as-vector operations that work column-wise. + case Intrinsic::x86_sse_rcp_ss: + case Intrinsic::x86_sse_rsqrt_ss: + simplifyAndSetOp(II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + Worklist.Add(II); + return II->getArgOperand(0); + } + // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions + // checks). + break; + + // Binary scalar-as-vector operations that work column-wise. The high + // elements come from operand 0. The low element is a function of both + // operands. + case Intrinsic::x86_sse_min_ss: + case Intrinsic::x86_sse_max_ss: + case Intrinsic::x86_sse_cmp_ss: + case Intrinsic::x86_sse2_min_sd: + case Intrinsic::x86_sse2_max_sd: + case Intrinsic::x86_sse2_cmp_sd: { + simplifyAndSetOp(II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + Worklist.Add(II); + return II->getArgOperand(0); + } + + // Only lower element is used for operand 1. + DemandedElts = 1; + simplifyAndSetOp(II, 1, DemandedElts, UndefElts2); + + // Lower element is undefined if both lower elements are undefined. + // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0]) + UndefElts.clearBit(0); + + break; + } + + // Binary scalar-as-vector operations that work column-wise. The high + // elements come from operand 0 and the low element comes from operand 1. + case Intrinsic::x86_sse41_round_ss: + case Intrinsic::x86_sse41_round_sd: { + // Don't use the low element of operand 0. + APInt DemandedElts2 = DemandedElts; + DemandedElts2.clearBit(0); + simplifyAndSetOp(II, 0, DemandedElts2, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + Worklist.Add(II); + return II->getArgOperand(0); + } + + // Only lower element is used for operand 1. + DemandedElts = 1; + simplifyAndSetOp(II, 1, DemandedElts, UndefElts2); + + // Take the high undef elements from operand 0 and take the lower element + // from operand 1. + UndefElts.clearBit(0); + UndefElts |= UndefElts2[0]; + break; + } + + // Three input scalar-as-vector operations that work column-wise. The high + // elements come from operand 0 and the low element is a function of all + // three inputs. + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_max_ss_round: + case Intrinsic::x86_avx512_mask_min_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + case Intrinsic::x86_avx512_mask_max_sd_round: + case Intrinsic::x86_avx512_mask_min_sd_round: + simplifyAndSetOp(II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + Worklist.Add(II); + return II->getArgOperand(0); + } + + // Only lower element is used for operand 1 and 2. + DemandedElts = 1; + simplifyAndSetOp(II, 1, DemandedElts, UndefElts2); + simplifyAndSetOp(II, 2, DemandedElts, UndefElts3); + + // Lower element is undefined if all three lower elements are undefined. 
+ // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0] || !UndefElts3[0]) + UndefElts.clearBit(0); + + break; + + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + case Intrinsic::x86_avx2_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packsswb_512: + case Intrinsic::x86_avx512_packusdw_512: + case Intrinsic::x86_avx512_packuswb_512: { + auto *Ty0 = II->getArgOperand(0)->getType(); + unsigned InnerVWidth = Ty0->getVectorNumElements(); + assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); + + unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; + unsigned VWidthPerLane = VWidth / NumLanes; + unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; + + // Per lane, pack the elements of the first input and then the second. + // e.g. + // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) + // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) + for (int OpNum = 0; OpNum != 2; ++OpNum) { + APInt OpDemandedElts(InnerVWidth, 0); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + unsigned LaneIdx = Lane * VWidthPerLane; + for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { + unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; + if (DemandedElts[Idx]) + OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); + } + } + + // Demand elements from the operand. + APInt OpUndefElts(InnerVWidth, 0); + simplifyAndSetOp(II, OpNum, OpDemandedElts, OpUndefElts); + + // Pack the operand's UNDEF elements, one lane at a time. + OpUndefElts = OpUndefElts.zext(VWidth); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); + LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); + LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); + UndefElts |= LaneElts; + } + } + break; + } + + // PSHUFB + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + // PERMILVAR + case Intrinsic::x86_avx_vpermilvar_ps: + case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx512_vpermilvar_ps_512: + case Intrinsic::x86_avx_vpermilvar_pd: + case Intrinsic::x86_avx_vpermilvar_pd_256: + case Intrinsic::x86_avx512_vpermilvar_pd_512: + // PERMV + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: { + simplifyAndSetOp(II, 1, DemandedElts, UndefElts); + break; + } + + // SSE4A instructions leave the upper 64-bits of the 128-bit result + // in an undefined state. 
+ case Intrinsic::x86_sse4a_extrq: + case Intrinsic::x86_sse4a_extrqi: + case Intrinsic::x86_sse4a_insertq: + case Intrinsic::x86_sse4a_insertqi: + UndefElts.setHighBits(VWidth / 2); + break; + case Intrinsic::amdgcn_buffer_load: + case Intrinsic::amdgcn_buffer_load_format: + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load_format: + case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_tbuffer_load: + return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts); + default: { + if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID())) + return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0); + + break; + } + } // switch on IntrinsicID + break; + } // case Call + } // switch on Opcode + + // TODO: We bail completely on integer div/rem and shifts because they have + // UB/poison potential, but that should be refined. + BinaryOperator *BO; + if (match(I, m_BinOp(BO)) && !BO->isIntDivRem() && !BO->isShift()) { + simplifyAndSetOp(I, 0, DemandedElts, UndefElts); + simplifyAndSetOp(I, 1, DemandedElts, UndefElts2); + + // Any change to an instruction with potential poison must clear those flags + // because we can not guarantee those constraints now. Other analysis may + // determine that it is safe to re-apply the flags. + if (MadeChange) + BO->dropPoisonGeneratingFlags(); + + // Output elements are undefined if both are undefined. Consider things + // like undef & 0. The result is known zero, not undef. + UndefElts &= UndefElts2; + } + + // If we've proven all of the lanes undef, return an undef value. + // TODO: Intersect w/demanded lanes + if (UndefElts.isAllOnesValue()) + return UndefValue::get(I->getType());; + + return MadeChange ? I : nullptr; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineTables.td b/llvm/lib/Transforms/InstCombine/InstCombineTables.td new file mode 100644 index 000000000000..98b2adc442fa --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineTables.td @@ -0,0 +1,11 @@ +include "llvm/TableGen/SearchableTable.td" +include "llvm/IR/Intrinsics.td" + +def AMDGPUImageDMaskIntrinsicTable : GenericTable { + let FilterClass = "AMDGPUImageDMaskIntrinsic"; + let Fields = ["Intr"]; + + let PrimaryKey = ["Intr"]; + let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic"; + let PrimaryKeyEarlyOut = 1; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp new file mode 100644 index 000000000000..9c890748e5ab --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -0,0 +1,2246 @@ +//===- InstCombineVectorOps.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements instcombine for ExtractElement, InsertElement and +// ShuffleVector. 
+// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include <cassert> +#include <cstdint> +#include <iterator> +#include <utility> + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "instcombine" + +/// Return true if the value is cheaper to scalarize than it is to leave as a +/// vector operation. IsConstantExtractIndex indicates whether we are extracting +/// one known element from a vector constant. +/// +/// FIXME: It's possible to create more instructions than previously existed. +static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) { + // If we can pick a scalar constant value out of a vector, that is free. + if (auto *C = dyn_cast<Constant>(V)) + return IsConstantExtractIndex || C->getSplatValue(); + + // An insertelement to the same constant index as our extract will simplify + // to the scalar inserted element. An insertelement to a different constant + // index is irrelevant to our extract. + if (match(V, m_InsertElement(m_Value(), m_Value(), m_ConstantInt()))) + return IsConstantExtractIndex; + + if (match(V, m_OneUse(m_Load(m_Value())))) + return true; + + Value *V0, *V1; + if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1))))) + if (cheapToScalarize(V0, IsConstantExtractIndex) || + cheapToScalarize(V1, IsConstantExtractIndex)) + return true; + + CmpInst::Predicate UnusedPred; + if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1))))) + if (cheapToScalarize(V0, IsConstantExtractIndex) || + cheapToScalarize(V1, IsConstantExtractIndex)) + return true; + + return false; +} + +// If we have a PHI node with a vector type that is only used to feed +// itself and be an operand of extractelement at a constant location, +// try to replace the PHI of the vector type with a PHI of a scalar type. +Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { + SmallVector<Instruction *, 2> Extracts; + // The users we want the PHI to have are: + // 1) The EI ExtractElement (we already know this) + // 2) Possibly more ExtractElements with the same index. + // 3) Another operand, which will feed back into the PHI. + Instruction *PHIUser = nullptr; + for (auto U : PN->users()) { + if (ExtractElementInst *EU = dyn_cast<ExtractElementInst>(U)) { + if (EI.getIndexOperand() == EU->getIndexOperand()) + Extracts.push_back(EU); + else + return nullptr; + } else if (!PHIUser) { + PHIUser = cast<Instruction>(U); + } else { + return nullptr; + } + } + + if (!PHIUser) + return nullptr; + + // Verify that this PHI user has one use, which is the PHI itself, + // and that it is a binary operation which is cheap to scalarize. + // otherwise return nullptr. 
+ if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) || + !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true)) + return nullptr; + + // Create a scalar PHI node that will replace the vector PHI node + // just before the current PHI node. + PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith( + PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), *PN)); + // Scalarize each PHI operand. + for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) { + Value *PHIInVal = PN->getIncomingValue(i); + BasicBlock *inBB = PN->getIncomingBlock(i); + Value *Elt = EI.getIndexOperand(); + // If the operand is the PHI induction variable: + if (PHIInVal == PHIUser) { + // Scalarize the binary operation. Its first operand is the + // scalar PHI, and the second operand is extracted from the other + // vector operand. + BinaryOperator *B0 = cast<BinaryOperator>(PHIUser); + unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0; + Value *Op = InsertNewInstWith( + ExtractElementInst::Create(B0->getOperand(opId), Elt, + B0->getOperand(opId)->getName() + ".Elt"), + *B0); + Value *newPHIUser = InsertNewInstWith( + BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(), + scalarPHI, Op, B0), *B0); + scalarPHI->addIncoming(newPHIUser, inBB); + } else { + // Scalarize PHI input: + Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, ""); + // Insert the new instruction into the predecessor basic block. + Instruction *pos = dyn_cast<Instruction>(PHIInVal); + BasicBlock::iterator InsertPos; + if (pos && !isa<PHINode>(pos)) { + InsertPos = ++pos->getIterator(); + } else { + InsertPos = inBB->getFirstInsertionPt(); + } + + InsertNewInstWith(newEI, *InsertPos); + + scalarPHI->addIncoming(newEI, inBB); + } + } + + for (auto E : Extracts) + replaceInstUsesWith(*E, scalarPHI); + + return &EI; +} + +static Instruction *foldBitcastExtElt(ExtractElementInst &Ext, + InstCombiner::BuilderTy &Builder, + bool IsBigEndian) { + Value *X; + uint64_t ExtIndexC; + if (!match(Ext.getVectorOperand(), m_BitCast(m_Value(X))) || + !X->getType()->isVectorTy() || + !match(Ext.getIndexOperand(), m_ConstantInt(ExtIndexC))) + return nullptr; + + // If this extractelement is using a bitcast from a vector of the same number + // of elements, see if we can find the source element from the source vector: + // extelt (bitcast VecX), IndexC --> bitcast X[IndexC] + Type *SrcTy = X->getType(); + Type *DestTy = Ext.getType(); + unsigned NumSrcElts = SrcTy->getVectorNumElements(); + unsigned NumElts = Ext.getVectorOperandType()->getNumElements(); + if (NumSrcElts == NumElts) + if (Value *Elt = findScalarElement(X, ExtIndexC)) + return new BitCastInst(Elt, DestTy); + + // If the source elements are wider than the destination, try to shift and + // truncate a subset of scalar bits of an insert op. + if (NumSrcElts < NumElts) { + Value *Scalar; + uint64_t InsIndexC; + if (!match(X, m_InsertElement(m_Value(), m_Value(Scalar), + m_ConstantInt(InsIndexC)))) + return nullptr; + + // The extract must be from the subset of vector elements that we inserted + // into. Example: if we inserted element 1 of a <2 x i64> and we are + // extracting an i16 (narrowing ratio = 4), then this extract must be from 1 + // of elements 4-7 of the bitcasted vector. + unsigned NarrowingRatio = NumElts / NumSrcElts; + if (ExtIndexC / NarrowingRatio != InsIndexC) + return nullptr; + + // We are extracting part of the original scalar. How that scalar is + // inserted into the vector depends on the endian-ness. 
Example: + // Vector Byte Elt Index: 0 1 2 3 4 5 6 7 + // +--+--+--+--+--+--+--+--+ + // inselt <2 x i32> V, <i32> S, 1: |V0|V1|V2|V3|S0|S1|S2|S3| + // extelt <4 x i16> V', 3: | |S2|S3| + // +--+--+--+--+--+--+--+--+ + // If this is little-endian, S2|S3 are the MSB of the 32-bit 'S' value. + // If this is big-endian, S2|S3 are the LSB of the 32-bit 'S' value. + // In this example, we must right-shift little-endian. Big-endian is just a + // truncate. + unsigned Chunk = ExtIndexC % NarrowingRatio; + if (IsBigEndian) + Chunk = NarrowingRatio - 1 - Chunk; + + // Bail out if this is an FP vector to FP vector sequence. That would take + // more instructions than we started with unless there is no shift, and it + // may not be handled as well in the backend. + bool NeedSrcBitcast = SrcTy->getScalarType()->isFloatingPointTy(); + bool NeedDestBitcast = DestTy->isFloatingPointTy(); + if (NeedSrcBitcast && NeedDestBitcast) + return nullptr; + + unsigned SrcWidth = SrcTy->getScalarSizeInBits(); + unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); + unsigned ShAmt = Chunk * DestWidth; + + // TODO: This limitation is more strict than necessary. We could sum the + // number of new instructions and subtract the number eliminated to know if + // we can proceed. + if (!X->hasOneUse() || !Ext.getVectorOperand()->hasOneUse()) + if (NeedSrcBitcast || NeedDestBitcast) + return nullptr; + + if (NeedSrcBitcast) { + Type *SrcIntTy = IntegerType::getIntNTy(Scalar->getContext(), SrcWidth); + Scalar = Builder.CreateBitCast(Scalar, SrcIntTy); + } + + if (ShAmt) { + // Bail out if we could end with more instructions than we started with. + if (!Ext.getVectorOperand()->hasOneUse()) + return nullptr; + Scalar = Builder.CreateLShr(Scalar, ShAmt); + } + + if (NeedDestBitcast) { + Type *DestIntTy = IntegerType::getIntNTy(Scalar->getContext(), DestWidth); + return new BitCastInst(Builder.CreateTrunc(Scalar, DestIntTy), DestTy); + } + return new TruncInst(Scalar, DestTy); + } + + return nullptr; +} + +/// Find elements of V demanded by UserInstr. +static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) { + unsigned VWidth = V->getType()->getVectorNumElements(); + + // Conservatively assume that all elements are needed. + APInt UsedElts(APInt::getAllOnesValue(VWidth)); + + switch (UserInstr->getOpcode()) { + case Instruction::ExtractElement: { + ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr); + assert(EEI->getVectorOperand() == V); + ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand()); + if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) { + UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue()); + } + break; + } + case Instruction::ShuffleVector: { + ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr); + unsigned MaskNumElts = UserInstr->getType()->getVectorNumElements(); + + UsedElts = APInt(VWidth, 0); + for (unsigned i = 0; i < MaskNumElts; i++) { + unsigned MaskVal = Shuffle->getMaskValue(i); + if (MaskVal == -1u || MaskVal >= 2 * VWidth) + continue; + if (Shuffle->getOperand(0) == V && (MaskVal < VWidth)) + UsedElts.setBit(MaskVal); + if (Shuffle->getOperand(1) == V && + ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth))) + UsedElts.setBit(MaskVal - VWidth); + } + break; + } + default: + break; + } + return UsedElts; +} + +/// Find union of elements of V demanded by all its users. 
+/// If it is known by querying findDemandedEltsBySingleUser that +/// no user demands an element of V, then the corresponding bit +/// remains unset in the returned value. +static APInt findDemandedEltsByAllUsers(Value *V) { + unsigned VWidth = V->getType()->getVectorNumElements(); + + APInt UnionUsedElts(VWidth, 0); + for (const Use &U : V->uses()) { + if (Instruction *I = dyn_cast<Instruction>(U.getUser())) { + UnionUsedElts |= findDemandedEltsBySingleUser(V, I); + } else { + UnionUsedElts = APInt::getAllOnesValue(VWidth); + break; + } + + if (UnionUsedElts.isAllOnesValue()) + break; + } + + return UnionUsedElts; +} + +Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { + Value *SrcVec = EI.getVectorOperand(); + Value *Index = EI.getIndexOperand(); + if (Value *V = SimplifyExtractElementInst(SrcVec, Index, + SQ.getWithInstruction(&EI))) + return replaceInstUsesWith(EI, V); + + // If extracting a specified index from the vector, see if we can recursively + // find a previously computed scalar that was inserted into the vector. + auto *IndexC = dyn_cast<ConstantInt>(Index); + if (IndexC) { + unsigned NumElts = EI.getVectorOperandType()->getNumElements(); + + // InstSimplify should handle cases where the index is invalid. + if (!IndexC->getValue().ule(NumElts)) + return nullptr; + + // This instruction only demands the single element from the input vector. + if (NumElts != 1) { + // If the input vector has a single use, simplify it based on this use + // property. + if (SrcVec->hasOneUse()) { + APInt UndefElts(NumElts, 0); + APInt DemandedElts(NumElts, 0); + DemandedElts.setBit(IndexC->getZExtValue()); + if (Value *V = + SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) { + EI.setOperand(0, V); + return &EI; + } + } else { + // If the input vector has multiple uses, simplify it based on a union + // of all elements used. + APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec); + if (!DemandedElts.isAllOnesValue()) { + APInt UndefElts(NumElts, 0); + if (Value *V = SimplifyDemandedVectorElts( + SrcVec, DemandedElts, UndefElts, 0 /* Depth */, + true /* AllowMultipleUsers */)) { + if (V != SrcVec) { + SrcVec->replaceAllUsesWith(V); + return &EI; + } + } + } + } + } + if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian())) + return I; + + // If there's a vector PHI feeding a scalar use through this extractelement + // instruction, try to scalarize the PHI. + if (auto *Phi = dyn_cast<PHINode>(SrcVec)) + if (Instruction *ScalarPHI = scalarizePHI(EI, Phi)) + return ScalarPHI; + } + + BinaryOperator *BO; + if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, IndexC)) { + // extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index) + Value *X = BO->getOperand(0), *Y = BO->getOperand(1); + Value *E0 = Builder.CreateExtractElement(X, Index); + Value *E1 = Builder.CreateExtractElement(Y, Index); + return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), E0, E1, BO); + } + + Value *X, *Y; + CmpInst::Predicate Pred; + if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) && + cheapToScalarize(SrcVec, IndexC)) { + // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index) + Value *E0 = Builder.CreateExtractElement(X, Index); + Value *E1 = Builder.CreateExtractElement(Y, Index); + return CmpInst::Create(cast<CmpInst>(SrcVec)->getOpcode(), Pred, E0, E1); + } + + if (auto *I = dyn_cast<Instruction>(SrcVec)) { + if (auto *IE = dyn_cast<InsertElementInst>(I)) { + // Extracting the inserted element? 
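A standalone sketch (std::array stand-ins, not part of the patch) of the two folds below: extracting the lane that was just inserted yields the inserted scalar, and extracting a different constant lane can read from the pre-insert vector instead, since the insert cannot have touched that lane.

#include <array>
#include <cassert>

using Vec4 = std::array<int, 4>;

static Vec4 insertelement(Vec4 V, int Scalar, unsigned Idx) {
  V[Idx] = Scalar;                                // value-semantics model of insertelement
  return V;
}
static int extractelement(const Vec4 &V, unsigned Idx) { return V[Idx]; }

int main() {
  Vec4 V{10, 11, 12, 13};
  // extractelement (insertelement V, 99, 2), 2  -->  99
  assert(extractelement(insertelement(V, 99, 2), 2) == 99);
  // extractelement (insertelement V, 99, 2), 0  -->  extractelement V, 0
  assert(extractelement(insertelement(V, 99, 2), 0) == extractelement(V, 0));
  return 0;
}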
+ if (IE->getOperand(2) == Index) + return replaceInstUsesWith(EI, IE->getOperand(1)); + // If the inserted and extracted elements are constants, they must not + // be the same value, extract from the pre-inserted value instead. + if (isa<Constant>(IE->getOperand(2)) && IndexC) { + Worklist.AddValue(SrcVec); + EI.setOperand(0, IE->getOperand(0)); + return &EI; + } + } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) { + // If this is extracting an element from a shufflevector, figure out where + // it came from and extract from the appropriate input element instead. + if (auto *Elt = dyn_cast<ConstantInt>(Index)) { + int SrcIdx = SVI->getMaskValue(Elt->getZExtValue()); + Value *Src; + unsigned LHSWidth = + SVI->getOperand(0)->getType()->getVectorNumElements(); + + if (SrcIdx < 0) + return replaceInstUsesWith(EI, UndefValue::get(EI.getType())); + if (SrcIdx < (int)LHSWidth) + Src = SVI->getOperand(0); + else { + SrcIdx -= LHSWidth; + Src = SVI->getOperand(1); + } + Type *Int32Ty = Type::getInt32Ty(EI.getContext()); + return ExtractElementInst::Create(Src, + ConstantInt::get(Int32Ty, + SrcIdx, false)); + } + } else if (auto *CI = dyn_cast<CastInst>(I)) { + // Canonicalize extractelement(cast) -> cast(extractelement). + // Bitcasts can change the number of vector elements, and they cost + // nothing. + if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) { + Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index); + Worklist.AddValue(EE); + return CastInst::Create(CI->getOpcode(), EE, EI.getType()); + } + } + } + return nullptr; +} + +/// If V is a shuffle of values that ONLY returns elements from either LHS or +/// RHS, return the shuffle mask and true. Otherwise, return false. +static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, + SmallVectorImpl<Constant*> &Mask) { + assert(LHS->getType() == RHS->getType() && + "Invalid CollectSingleShuffleElements"); + unsigned NumElts = V->getType()->getVectorNumElements(); + + if (isa<UndefValue>(V)) { + Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext()))); + return true; + } + + if (V == LHS) { + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i)); + return true; + } + + if (V == RHS) { + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), + i+NumElts)); + return true; + } + + if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) { + // If this is an insert of an extract from some other vector, include it. + Value *VecOp = IEI->getOperand(0); + Value *ScalarOp = IEI->getOperand(1); + Value *IdxOp = IEI->getOperand(2); + + if (!isa<ConstantInt>(IdxOp)) + return false; + unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue(); + + if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector. + // We can handle this if the vector we are inserting into is + // transitively ok. + if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + // If so, update the mask to reflect the inserted undef. + Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext())); + return true; + } + } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){ + if (isa<ConstantInt>(EI->getOperand(1))) { + unsigned ExtractedIdx = + cast<ConstantInt>(EI->getOperand(1))->getZExtValue(); + unsigned NumLHSElts = LHS->getType()->getVectorNumElements(); + + // This must be extracting from either LHS or RHS. 
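        // Editorial illustration (not part of the original source): with
        // 4-element LHS and RHS, inserting 'extractelement RHS, 1' into lane 3
        // of the chain sets Mask[3] to 1 + 4 = 5, i.e. lane 1 of the second
        // shuffle operand.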
+ if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) { + // We can handle this if the vector we are inserting into is + // transitively ok. + if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + // If so, update the mask to reflect the inserted value. + if (EI->getOperand(0) == LHS) { + Mask[InsertedIdx % NumElts] = + ConstantInt::get(Type::getInt32Ty(V->getContext()), + ExtractedIdx); + } else { + assert(EI->getOperand(0) == RHS); + Mask[InsertedIdx % NumElts] = + ConstantInt::get(Type::getInt32Ty(V->getContext()), + ExtractedIdx + NumLHSElts); + } + return true; + } + } + } + } + } + + return false; +} + +/// If we have insertion into a vector that is wider than the vector that we +/// are extracting from, try to widen the source vector to allow a single +/// shufflevector to replace one or more insert/extract pairs. +static void replaceExtractElements(InsertElementInst *InsElt, + ExtractElementInst *ExtElt, + InstCombiner &IC) { + VectorType *InsVecType = InsElt->getType(); + VectorType *ExtVecType = ExtElt->getVectorOperandType(); + unsigned NumInsElts = InsVecType->getVectorNumElements(); + unsigned NumExtElts = ExtVecType->getVectorNumElements(); + + // The inserted-to vector must be wider than the extracted-from vector. + if (InsVecType->getElementType() != ExtVecType->getElementType() || + NumExtElts >= NumInsElts) + return; + + // Create a shuffle mask to widen the extended-from vector using undefined + // values. The mask selects all of the values of the original vector followed + // by as many undefined values as needed to create a vector of the same length + // as the inserted-to vector. + SmallVector<Constant *, 16> ExtendMask; + IntegerType *IntType = Type::getInt32Ty(InsElt->getContext()); + for (unsigned i = 0; i < NumExtElts; ++i) + ExtendMask.push_back(ConstantInt::get(IntType, i)); + for (unsigned i = NumExtElts; i < NumInsElts; ++i) + ExtendMask.push_back(UndefValue::get(IntType)); + + Value *ExtVecOp = ExtElt->getVectorOperand(); + auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp); + BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst)) + ? ExtVecOpInst->getParent() + : ExtElt->getParent(); + + // TODO: This restriction matches the basic block check below when creating + // new extractelement instructions. If that limitation is removed, this one + // could also be removed. But for now, we just bail out to ensure that we + // will replace the extractelement instruction that is feeding our + // insertelement instruction. This allows the insertelement to then be + // replaced by a shufflevector. If the insertelement is not replaced, we can + // induce infinite looping because there's an optimization for extractelement + // that will delete our widening shuffle. This would trigger another attempt + // here to create that shuffle, and we spin forever. + if (InsertionBlock != InsElt->getParent()) + return; + + // TODO: This restriction matches the check in visitInsertElementInst() and + // prevents an infinite loop caused by not turning the extract/insert pair + // into a shuffle. We really should not need either check, but we're lacking + // folds for shufflevectors because we're afraid to generate shuffle masks + // that the backend can't handle. 
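  // Editorial illustration (not part of the original source): the widening
  // shuffle created below pads the narrow source with undefs, e.g. for a
  // <2 x i8> extract source feeding a <4 x i8> insert chain:
  //   shufflevector <2 x i8> %src, <2 x i8> undef,
  //                 <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>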
+ if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back())) + return; + + auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), + ConstantVector::get(ExtendMask)); + + // Insert the new shuffle after the vector operand of the extract is defined + // (as long as it's not a PHI) or at the start of the basic block of the + // extract, so any subsequent extracts in the same basic block can use it. + // TODO: Insert before the earliest ExtractElementInst that is replaced. + if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst)) + WideVec->insertAfter(ExtVecOpInst); + else + IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt()); + + // Replace extracts from the original narrow vector with extracts from the new + // wide vector. + for (User *U : ExtVecOp->users()) { + ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U); + if (!OldExt || OldExt->getParent() != WideVec->getParent()) + continue; + auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1)); + NewExt->insertAfter(OldExt); + IC.replaceInstUsesWith(*OldExt, NewExt); + } +} + +/// We are building a shuffle to create V, which is a sequence of insertelement, +/// extractelement pairs. If PermittedRHS is set, then we must either use it or +/// not rely on the second vector source. Return a std::pair containing the +/// left and right vectors of the proposed shuffle (or 0), and set the Mask +/// parameter as required. +/// +/// Note: we intentionally don't try to fold earlier shuffles since they have +/// often been chosen carefully to be efficiently implementable on the target. +using ShuffleOps = std::pair<Value *, Value *>; + +static ShuffleOps collectShuffleElements(Value *V, + SmallVectorImpl<Constant *> &Mask, + Value *PermittedRHS, + InstCombiner &IC) { + assert(V->getType()->isVectorTy() && "Invalid shuffle!"); + unsigned NumElts = V->getType()->getVectorNumElements(); + + if (isa<UndefValue>(V)) { + Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext()))); + return std::make_pair( + PermittedRHS ? UndefValue::get(PermittedRHS->getType()) : V, nullptr); + } + + if (isa<ConstantAggregateZero>(V)) { + Mask.assign(NumElts, ConstantInt::get(Type::getInt32Ty(V->getContext()),0)); + return std::make_pair(V, nullptr); + } + + if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) { + // If this is an insert of an extract from some other vector, include it. + Value *VecOp = IEI->getOperand(0); + Value *ScalarOp = IEI->getOperand(1); + Value *IdxOp = IEI->getOperand(2); + + if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) { + if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) { + unsigned ExtractedIdx = + cast<ConstantInt>(EI->getOperand(1))->getZExtValue(); + unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue(); + + // Either the extracted from or inserted into vector must be RHSVec, + // otherwise we'd end up with a shuffle of three inputs. + if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) { + Value *RHS = EI->getOperand(0); + ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC); + assert(LR.second == nullptr || LR.second == RHS); + + if (LR.first->getType() != RHS->getType()) { + // Although we are giving up for now, see if we can create extracts + // that match the inserts for another round of combining. + replaceExtractElements(IEI, EI, IC); + + // We tried our best, but we can't find anything compatible with RHS + // further up the chain. Return a trivial shuffle. 
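          // Editorial illustration (not part of the original source): the
          // trivial shuffle is an identity of V, e.g. a 4-element mask of
          // <0, 1, 2, 3>, and the pair (V, nullptr) is returned.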
+ for (unsigned i = 0; i < NumElts; ++i) + Mask[i] = ConstantInt::get(Type::getInt32Ty(V->getContext()), i); + return std::make_pair(V, nullptr); + } + + unsigned NumLHSElts = RHS->getType()->getVectorNumElements(); + Mask[InsertedIdx % NumElts] = + ConstantInt::get(Type::getInt32Ty(V->getContext()), + NumLHSElts+ExtractedIdx); + return std::make_pair(LR.first, RHS); + } + + if (VecOp == PermittedRHS) { + // We've gone as far as we can: anything on the other side of the + // extractelement will already have been converted into a shuffle. + unsigned NumLHSElts = + EI->getOperand(0)->getType()->getVectorNumElements(); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(ConstantInt::get( + Type::getInt32Ty(V->getContext()), + i == InsertedIdx ? ExtractedIdx : NumLHSElts + i)); + return std::make_pair(EI->getOperand(0), PermittedRHS); + } + + // If this insertelement is a chain that comes from exactly these two + // vectors, return the vector and the effective shuffle. + if (EI->getOperand(0)->getType() == PermittedRHS->getType() && + collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, + Mask)) + return std::make_pair(EI->getOperand(0), PermittedRHS); + } + } + } + + // Otherwise, we can't do anything fancy. Return an identity vector. + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i)); + return std::make_pair(V, nullptr); +} + +/// Try to find redundant insertvalue instructions, like the following ones: +/// %0 = insertvalue { i8, i32 } undef, i8 %x, 0 +/// %1 = insertvalue { i8, i32 } %0, i8 %y, 0 +/// Here the second instruction inserts values at the same indices, as the +/// first one, making the first one redundant. +/// It should be transformed to: +/// %0 = insertvalue { i8, i32 } undef, i8 %y, 0 +Instruction *InstCombiner::visitInsertValueInst(InsertValueInst &I) { + bool IsRedundant = false; + ArrayRef<unsigned int> FirstIndices = I.getIndices(); + + // If there is a chain of insertvalue instructions (each of them except the + // last one has only one use and it's another insertvalue insn from this + // chain), check if any of the 'children' uses the same indices as the first + // instruction. In this case, the first one is redundant. + Value *V = &I; + unsigned Depth = 0; + while (V->hasOneUse() && Depth < 10) { + User *U = V->user_back(); + auto UserInsInst = dyn_cast<InsertValueInst>(U); + if (!UserInsInst || U->getOperand(0) != V) + break; + if (UserInsInst->getIndices() == FirstIndices) { + IsRedundant = true; + break; + } + V = UserInsInst; + Depth++; + } + + if (IsRedundant) + return replaceInstUsesWith(I, I.getOperand(0)); + return nullptr; +} + +static bool isShuffleEquivalentToSelect(ShuffleVectorInst &Shuf) { + int MaskSize = Shuf.getMask()->getType()->getVectorNumElements(); + int VecSize = Shuf.getOperand(0)->getType()->getVectorNumElements(); + + // A vector select does not change the size of the operands. + if (MaskSize != VecSize) + return false; + + // Each mask element must be undefined or choose a vector element from one of + // the source operands without crossing vector lanes. + for (int i = 0; i != MaskSize; ++i) { + int Elt = Shuf.getMaskValue(i); + if (Elt != -1 && Elt != i && Elt != i + VecSize) + return false; + } + + return true; +} + +/// Turn a chain of inserts that splats a value into an insert + shuffle: +/// insertelt(insertelt(insertelt(insertelt X, %k, 0), %k, 1), %k, 2) ... 
-> +/// shufflevector(insertelt(X, %k, 0), undef, zero) +static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) { + // We are interested in the last insert in a chain. So if this insert has a + // single user and that user is an insert, bail. + if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back())) + return nullptr; + + auto *VecTy = cast<VectorType>(InsElt.getType()); + unsigned NumElements = VecTy->getNumElements(); + + // Do not try to do this for a one-element vector, since that's a nop, + // and will cause an inf-loop. + if (NumElements == 1) + return nullptr; + + Value *SplatVal = InsElt.getOperand(1); + InsertElementInst *CurrIE = &InsElt; + SmallVector<bool, 16> ElementPresent(NumElements, false); + InsertElementInst *FirstIE = nullptr; + + // Walk the chain backwards, keeping track of which indices we inserted into, + // until we hit something that isn't an insert of the splatted value. + while (CurrIE) { + auto *Idx = dyn_cast<ConstantInt>(CurrIE->getOperand(2)); + if (!Idx || CurrIE->getOperand(1) != SplatVal) + return nullptr; + + auto *NextIE = dyn_cast<InsertElementInst>(CurrIE->getOperand(0)); + // Check none of the intermediate steps have any additional uses, except + // for the root insertelement instruction, which can be re-used, if it + // inserts at position 0. + if (CurrIE != &InsElt && + (!CurrIE->hasOneUse() && (NextIE != nullptr || !Idx->isZero()))) + return nullptr; + + ElementPresent[Idx->getZExtValue()] = true; + FirstIE = CurrIE; + CurrIE = NextIE; + } + + // If this is just a single insertelement (not a sequence), we are done. + if (FirstIE == &InsElt) + return nullptr; + + // If we are not inserting into an undef vector, make sure we've seen an + // insert into every element. + // TODO: If the base vector is not undef, it might be better to create a splat + // and then a select-shuffle (blend) with the base vector. + if (!isa<UndefValue>(FirstIE->getOperand(0))) + if (any_of(ElementPresent, [](bool Present) { return !Present; })) + return nullptr; + + // Create the insert + shuffle. + Type *Int32Ty = Type::getInt32Ty(InsElt.getContext()); + UndefValue *UndefVec = UndefValue::get(VecTy); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + if (!cast<ConstantInt>(FirstIE->getOperand(2))->isZero()) + FirstIE = InsertElementInst::Create(UndefVec, SplatVal, Zero, "", &InsElt); + + // Splat from element 0, but replace absent elements with undef in the mask. + SmallVector<Constant *, 16> Mask(NumElements, Zero); + for (unsigned i = 0; i != NumElements; ++i) + if (!ElementPresent[i]) + Mask[i] = UndefValue::get(Int32Ty); + + return new ShuffleVectorInst(FirstIE, UndefVec, ConstantVector::get(Mask)); +} + +/// Try to fold an insert element into an existing splat shuffle by changing +/// the shuffle's mask to include the index of this insert element. +static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) { + // Check if the vector operand of this insert is a canonical splat shuffle. + auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0)); + if (!Shuf || !Shuf->isZeroEltSplat()) + return nullptr; + + // Check for a constant insertion index. + uint64_t IdxC; + if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC))) + return nullptr; + + // Check if the splat shuffle's input is the same as this insert's scalar op. 
+ Value *X = InsElt.getOperand(1); + Value *Op0 = Shuf->getOperand(0); + if (!match(Op0, m_InsertElement(m_Undef(), m_Specific(X), m_ZeroInt()))) + return nullptr; + + // Replace the shuffle mask element at the index of this insert with a zero. + // For example: + // inselt (shuf (inselt undef, X, 0), undef, <0,undef,0,undef>), X, 1 + // --> shuf (inselt undef, X, 0), undef, <0,0,0,undef> + unsigned NumMaskElts = Shuf->getType()->getVectorNumElements(); + SmallVector<Constant *, 16> NewMaskVec(NumMaskElts); + Type *I32Ty = IntegerType::getInt32Ty(Shuf->getContext()); + Constant *Zero = ConstantInt::getNullValue(I32Ty); + for (unsigned i = 0; i != NumMaskElts; ++i) + NewMaskVec[i] = i == IdxC ? Zero : Shuf->getMask()->getAggregateElement(i); + + Constant *NewMask = ConstantVector::get(NewMaskVec); + return new ShuffleVectorInst(Op0, UndefValue::get(Op0->getType()), NewMask); +} + +/// Try to fold an extract+insert element into an existing identity shuffle by +/// changing the shuffle's mask to include the index of this insert element. +static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) { + // Check if the vector operand of this insert is an identity shuffle. + auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0)); + if (!Shuf || !isa<UndefValue>(Shuf->getOperand(1)) || + !(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding())) + return nullptr; + + // Check for a constant insertion index. + uint64_t IdxC; + if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC))) + return nullptr; + + // Check if this insert's scalar op is extracted from the identity shuffle's + // input vector. + Value *Scalar = InsElt.getOperand(1); + Value *X = Shuf->getOperand(0); + if (!match(Scalar, m_ExtractElement(m_Specific(X), m_SpecificInt(IdxC)))) + return nullptr; + + // Replace the shuffle mask element at the index of this extract+insert with + // that same index value. + // For example: + // inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask' + unsigned NumMaskElts = Shuf->getType()->getVectorNumElements(); + SmallVector<Constant *, 16> NewMaskVec(NumMaskElts); + Type *I32Ty = IntegerType::getInt32Ty(Shuf->getContext()); + Constant *NewMaskEltC = ConstantInt::get(I32Ty, IdxC); + Constant *OldMask = Shuf->getMask(); + for (unsigned i = 0; i != NumMaskElts; ++i) { + if (i != IdxC) { + // All mask elements besides the inserted element remain the same. + NewMaskVec[i] = OldMask->getAggregateElement(i); + } else if (OldMask->getAggregateElement(i) == NewMaskEltC) { + // If the mask element was already set, there's nothing to do + // (demanded elements analysis may unset it later). + return nullptr; + } else { + assert(isa<UndefValue>(OldMask->getAggregateElement(i)) && + "Unexpected shuffle mask element for identity shuffle"); + NewMaskVec[i] = NewMaskEltC; + } + } + + Constant *NewMask = ConstantVector::get(NewMaskVec); + return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask); +} + +/// If we have an insertelement instruction feeding into another insertelement +/// and the 2nd is inserting a constant into the vector, canonicalize that +/// constant insertion before the insertion of a variable: +/// +/// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 --> +/// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC1 +/// +/// This has the potential of eliminating the 2nd insertelement instruction +/// via constant folding of the scalar constant into a vector constant. 
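// Editorial illustration (not part of the original source), using the pattern
// described above with placeholder values %x and %y:
//   insertelement (insertelement <4 x i32> %x, i32 %y, i32 1), i32 42, i32 0
//   --> insertelement (insertelement <4 x i32> %x, i32 42, i32 0), i32 %y, i32 1
// If %x is a constant vector, the inner insert of 42 now constant-folds away.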
+static Instruction *hoistInsEltConst(InsertElementInst &InsElt2, + InstCombiner::BuilderTy &Builder) { + auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0)); + if (!InsElt1 || !InsElt1->hasOneUse()) + return nullptr; + + Value *X, *Y; + Constant *ScalarC; + ConstantInt *IdxC1, *IdxC2; + if (match(InsElt1->getOperand(0), m_Value(X)) && + match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) && + match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) && + match(InsElt2.getOperand(1), m_Constant(ScalarC)) && + match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) && IdxC1 != IdxC2) { + Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2); + return InsertElementInst::Create(NewInsElt1, Y, IdxC1); + } + + return nullptr; +} + +/// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex +/// --> shufflevector X, CVec', Mask' +static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) { + auto *Inst = dyn_cast<Instruction>(InsElt.getOperand(0)); + // Bail out if the parent has more than one use. In that case, we'd be + // replacing the insertelt with a shuffle, and that's not a clear win. + if (!Inst || !Inst->hasOneUse()) + return nullptr; + if (auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0))) { + // The shuffle must have a constant vector operand. The insertelt must have + // a constant scalar being inserted at a constant position in the vector. + Constant *ShufConstVec, *InsEltScalar; + uint64_t InsEltIndex; + if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) || + !match(InsElt.getOperand(1), m_Constant(InsEltScalar)) || + !match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex))) + return nullptr; + + // Adding an element to an arbitrary shuffle could be expensive, but a + // shuffle that selects elements from vectors without crossing lanes is + // assumed cheap. + // If we're just adding a constant into that shuffle, it will still be + // cheap. + if (!isShuffleEquivalentToSelect(*Shuf)) + return nullptr; + + // From the above 'select' check, we know that the mask has the same number + // of elements as the vector input operands. We also know that each constant + // input element is used in its lane and can not be used more than once by + // the shuffle. Therefore, replace the constant in the shuffle's constant + // vector with the insertelt constant. Replace the constant in the shuffle's + // mask vector with the insertelt index plus the length of the vector + // (because the constant vector operand of a shuffle is always the 2nd + // operand). + Constant *Mask = Shuf->getMask(); + unsigned NumElts = Mask->getType()->getVectorNumElements(); + SmallVector<Constant *, 16> NewShufElts(NumElts); + SmallVector<Constant *, 16> NewMaskElts(NumElts); + for (unsigned I = 0; I != NumElts; ++I) { + if (I == InsEltIndex) { + NewShufElts[I] = InsEltScalar; + Type *Int32Ty = Type::getInt32Ty(Shuf->getContext()); + NewMaskElts[I] = ConstantInt::get(Int32Ty, InsEltIndex + NumElts); + } else { + // Copy over the existing values. + NewShufElts[I] = ShufConstVec->getAggregateElement(I); + NewMaskElts[I] = Mask->getAggregateElement(I); + } + } + + // Create new operands for a shuffle that includes the constant of the + // original insertelt. The old shuffle will be dead now. 
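    // Editorial illustration (not part of the original source): with a
    // 4-element select-equivalent shuffle,
    //   insertelement (shufflevector %x, <C0,C1,C2,C3>, <0,5,2,7>), C, 0
    //   --> shufflevector %x, <C,C1,C2,C3>, <4,5,2,7>
    // so lane 0 now reads the inserted constant from the shuffle's constant
    // operand instead of from %x.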
+ return new ShuffleVectorInst(Shuf->getOperand(0), + ConstantVector::get(NewShufElts), + ConstantVector::get(NewMaskElts)); + } else if (auto *IEI = dyn_cast<InsertElementInst>(Inst)) { + // Transform sequences of insertelements ops with constant data/indexes into + // a single shuffle op. + unsigned NumElts = InsElt.getType()->getNumElements(); + + uint64_t InsertIdx[2]; + Constant *Val[2]; + if (!match(InsElt.getOperand(2), m_ConstantInt(InsertIdx[0])) || + !match(InsElt.getOperand(1), m_Constant(Val[0])) || + !match(IEI->getOperand(2), m_ConstantInt(InsertIdx[1])) || + !match(IEI->getOperand(1), m_Constant(Val[1]))) + return nullptr; + SmallVector<Constant *, 16> Values(NumElts); + SmallVector<Constant *, 16> Mask(NumElts); + auto ValI = std::begin(Val); + // Generate new constant vector and mask. + // We have 2 values/masks from the insertelements instructions. Insert them + // into new value/mask vectors. + for (uint64_t I : InsertIdx) { + if (!Values[I]) { + assert(!Mask[I]); + Values[I] = *ValI; + Mask[I] = ConstantInt::get(Type::getInt32Ty(InsElt.getContext()), + NumElts + I); + } + ++ValI; + } + // Remaining values are filled with 'undef' values. + for (unsigned I = 0; I < NumElts; ++I) { + if (!Values[I]) { + assert(!Mask[I]); + Values[I] = UndefValue::get(InsElt.getType()->getElementType()); + Mask[I] = ConstantInt::get(Type::getInt32Ty(InsElt.getContext()), I); + } + } + // Create new operands for a shuffle that includes the constant of the + // original insertelt. + return new ShuffleVectorInst(IEI->getOperand(0), + ConstantVector::get(Values), + ConstantVector::get(Mask)); + } + return nullptr; +} + +Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { + Value *VecOp = IE.getOperand(0); + Value *ScalarOp = IE.getOperand(1); + Value *IdxOp = IE.getOperand(2); + + if (auto *V = SimplifyInsertElementInst( + VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE))) + return replaceInstUsesWith(IE, V); + + // If the vector and scalar are both bitcast from the same element type, do + // the insert in that source type followed by bitcast. + Value *VecSrc, *ScalarSrc; + if (match(VecOp, m_BitCast(m_Value(VecSrc))) && + match(ScalarOp, m_BitCast(m_Value(ScalarSrc))) && + (VecOp->hasOneUse() || ScalarOp->hasOneUse()) && + VecSrc->getType()->isVectorTy() && !ScalarSrc->getType()->isVectorTy() && + VecSrc->getType()->getVectorElementType() == ScalarSrc->getType()) { + // inselt (bitcast VecSrc), (bitcast ScalarSrc), IdxOp --> + // bitcast (inselt VecSrc, ScalarSrc, IdxOp) + Value *NewInsElt = Builder.CreateInsertElement(VecSrc, ScalarSrc, IdxOp); + return new BitCastInst(NewInsElt, IE.getType()); + } + + // If the inserted element was extracted from some other vector and both + // indexes are valid constants, try to turn this into a shuffle. + uint64_t InsertedIdx, ExtractedIdx; + Value *ExtVecOp; + if (match(IdxOp, m_ConstantInt(InsertedIdx)) && + match(ScalarOp, m_ExtractElement(m_Value(ExtVecOp), + m_ConstantInt(ExtractedIdx))) && + ExtractedIdx < ExtVecOp->getType()->getVectorNumElements()) { + // TODO: Looking at the user(s) to determine if this insert is a + // fold-to-shuffle opportunity does not match the usual instcombine + // constraints. We should decide if the transform is worthy based only + // on this instruction and its operands, but that may not work currently. + // + // Here, we are trying to avoid creating shuffles before reaching + // the end of a chain of extract-insert pairs. 
This is complicated because + // we do not generally form arbitrary shuffle masks in instcombine + // (because those may codegen poorly), but collectShuffleElements() does + // exactly that. + // + // The rules for determining what is an acceptable target-independent + // shuffle mask are fuzzy because they evolve based on the backend's + // capabilities and real-world impact. + auto isShuffleRootCandidate = [](InsertElementInst &Insert) { + if (!Insert.hasOneUse()) + return true; + auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back()); + if (!InsertUser) + return true; + return false; + }; + + // Try to form a shuffle from a chain of extract-insert ops. + if (isShuffleRootCandidate(IE)) { + SmallVector<Constant*, 16> Mask; + ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this); + + // The proposed shuffle may be trivial, in which case we shouldn't + // perform the combine. + if (LR.first != &IE && LR.second != &IE) { + // We now have a shuffle of LHS, RHS, Mask. + if (LR.second == nullptr) + LR.second = UndefValue::get(LR.first->getType()); + return new ShuffleVectorInst(LR.first, LR.second, + ConstantVector::get(Mask)); + } + } + } + + unsigned VWidth = VecOp->getType()->getVectorNumElements(); + APInt UndefElts(VWidth, 0); + APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); + if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) { + if (V != &IE) + return replaceInstUsesWith(IE, V); + return &IE; + } + + if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE)) + return Shuf; + + if (Instruction *NewInsElt = hoistInsEltConst(IE, Builder)) + return NewInsElt; + + if (Instruction *Broadcast = foldInsSequenceIntoSplat(IE)) + return Broadcast; + + if (Instruction *Splat = foldInsEltIntoSplat(IE)) + return Splat; + + if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE)) + return IdentityShuf; + + return nullptr; +} + +/// Return true if we can evaluate the specified expression tree if the vector +/// elements were shuffled in a different order. +static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask, + unsigned Depth = 5) { + // We can always reorder the elements of a constant. + if (isa<Constant>(V)) + return true; + + // We won't reorder vector arguments. No IPO here. + Instruction *I = dyn_cast<Instruction>(V); + if (!I) return false; + + // Two users may expect different orders of the elements. Don't try it. + if (!I->hasOneUse()) + return false; + + if (Depth == 0) return false; + + switch (I->getOpcode()) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + // Propagating an undefined shuffle mask element to integer div/rem is not + // allowed because those opcodes can create immediate undefined behavior + // from an undefined element in an operand. 
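    // Editorial illustration (not part of the original source): reordering
    // 'udiv <4 x i32> %x, %y' with a mask such as <0, undef, 2, 3> would leave
    // an undef lane in the divisor, i.e. a possible division by undef, so any
    // mask containing -1 is rejected below.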
+ if (llvm::any_of(Mask, [](int M){ return M == -1; })) + return false; + LLVM_FALLTHROUGH; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::GetElementPtr: { + // Bail out if we would create longer vector ops. We could allow creating + // longer vector ops, but that may result in more expensive codegen. + Type *ITy = I->getType(); + if (ITy->isVectorTy() && Mask.size() > ITy->getVectorNumElements()) + return false; + for (Value *Operand : I->operands()) { + if (!canEvaluateShuffled(Operand, Mask, Depth - 1)) + return false; + } + return true; + } + case Instruction::InsertElement: { + ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2)); + if (!CI) return false; + int ElementNumber = CI->getLimitedValue(); + + // Verify that 'CI' does not occur twice in Mask. A single 'insertelement' + // can't put an element into multiple indices. + bool SeenOnce = false; + for (int i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] == ElementNumber) { + if (SeenOnce) + return false; + SeenOnce = true; + } + } + return canEvaluateShuffled(I->getOperand(0), Mask, Depth - 1); + } + } + return false; +} + +/// Rebuild a new instruction just like 'I' but with the new operands given. +/// In the event of type mismatch, the type of the operands is correct. +static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) { + // We don't want to use the IRBuilder here because we want the replacement + // instructions to appear next to 'I', not the builder's insertion point. 
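  // Editorial note (not part of the original source): the rebuilt instruction
  // also keeps the original's flags, e.g. rebuilding 'add nsw <4 x i32> %a, %b'
  // with reordered operands still carries 'nsw' via setHasNoSignedWrap below.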
+ switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + BinaryOperator *BO = cast<BinaryOperator>(I); + assert(NewOps.size() == 2 && "binary operator with #ops != 2"); + BinaryOperator *New = + BinaryOperator::Create(cast<BinaryOperator>(I)->getOpcode(), + NewOps[0], NewOps[1], "", BO); + if (isa<OverflowingBinaryOperator>(BO)) { + New->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap()); + New->setHasNoSignedWrap(BO->hasNoSignedWrap()); + } + if (isa<PossiblyExactOperator>(BO)) { + New->setIsExact(BO->isExact()); + } + if (isa<FPMathOperator>(BO)) + New->copyFastMathFlags(I); + return New; + } + case Instruction::ICmp: + assert(NewOps.size() == 2 && "icmp with #ops != 2"); + return new ICmpInst(I, cast<ICmpInst>(I)->getPredicate(), + NewOps[0], NewOps[1]); + case Instruction::FCmp: + assert(NewOps.size() == 2 && "fcmp with #ops != 2"); + return new FCmpInst(I, cast<FCmpInst>(I)->getPredicate(), + NewOps[0], NewOps[1]); + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: { + // It's possible that the mask has a different number of elements from + // the original cast. We recompute the destination type to match the mask. + Type *DestTy = + VectorType::get(I->getType()->getScalarType(), + NewOps[0]->getType()->getVectorNumElements()); + assert(NewOps.size() == 1 && "cast with #ops != 1"); + return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy, + "", I); + } + case Instruction::GetElementPtr: { + Value *Ptr = NewOps[0]; + ArrayRef<Value*> Idx = NewOps.slice(1); + GetElementPtrInst *GEP = GetElementPtrInst::Create( + cast<GetElementPtrInst>(I)->getSourceElementType(), Ptr, Idx, "", I); + GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds()); + return GEP; + } + } + llvm_unreachable("failed to rebuild vector instructions"); +} + +static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { + // Mask.size() does not need to be equal to the number of vector elements. 
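  // Editorial illustration (not part of the original source): a shorter mask
  // shrinks the result, e.g. evaluating 'add <4 x i32> %a, %b' under the mask
  // <1, 2> produces an 'add <2 x i32>' of the reordered operands;
  // canEvaluateShuffled has already rejected masks longer than the inputs.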
+ + assert(V->getType()->isVectorTy() && "can't reorder non-vector elements"); + Type *EltTy = V->getType()->getScalarType(); + Type *I32Ty = IntegerType::getInt32Ty(V->getContext()); + if (isa<UndefValue>(V)) + return UndefValue::get(VectorType::get(EltTy, Mask.size())); + + if (isa<ConstantAggregateZero>(V)) + return ConstantAggregateZero::get(VectorType::get(EltTy, Mask.size())); + + if (Constant *C = dyn_cast<Constant>(V)) { + SmallVector<Constant *, 16> MaskValues; + for (int i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] == -1) + MaskValues.push_back(UndefValue::get(I32Ty)); + else + MaskValues.push_back(ConstantInt::get(I32Ty, Mask[i])); + } + return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()), + ConstantVector::get(MaskValues)); + } + + Instruction *I = cast<Instruction>(V); + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::Select: + case Instruction::GetElementPtr: { + SmallVector<Value*, 8> NewOps; + bool NeedsRebuild = (Mask.size() != I->getType()->getVectorNumElements()); + for (int i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *V; + // Recursively call evaluateInDifferentElementOrder on vector arguments + // as well. E.g. GetElementPtr may have scalar operands even if the + // return value is a vector, so we need to examine the operand type. + if (I->getOperand(i)->getType()->isVectorTy()) + V = evaluateInDifferentElementOrder(I->getOperand(i), Mask); + else + V = I->getOperand(i); + NewOps.push_back(V); + NeedsRebuild |= (V != I->getOperand(i)); + } + if (NeedsRebuild) { + return buildNew(I, NewOps); + } + return I; + } + case Instruction::InsertElement: { + int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue(); + + // The insertelement was inserting at Element. Figure out which element + // that becomes after shuffling. The answer is guaranteed to be unique + // by CanEvaluateShuffled. + bool Found = false; + int Index = 0; + for (int e = Mask.size(); Index != e; ++Index) { + if (Mask[Index] == Element) { + Found = true; + break; + } + } + + // If element is not in Mask, no need to handle the operand 1 (element to + // be inserted). Just evaluate values in operand 0 according to Mask. + if (!Found) + return evaluateInDifferentElementOrder(I->getOperand(0), Mask); + + Value *V = evaluateInDifferentElementOrder(I->getOperand(0), Mask); + return InsertElementInst::Create(V, I->getOperand(1), + ConstantInt::get(I32Ty, Index), "", I); + } + } + llvm_unreachable("failed to reorder elements of vector instruction!"); +} + +static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask, + bool &isLHSID, bool &isRHSID) { + isLHSID = isRHSID = true; + + for (unsigned i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] < 0) continue; // Ignore undef values. 
+ // Is this an identity shuffle of the LHS value? + isLHSID &= (Mask[i] == (int)i); + + // Is this an identity shuffle of the RHS value? + isRHSID &= (Mask[i]-e == i); + } +} + +// Returns true if the shuffle is extracting a contiguous range of values from +// LHS, for example: +// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ +// Input: |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP| +// Shuffles to: |EE|FF|GG|HH| +// +--+--+--+--+ +static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI, + SmallVector<int, 16> &Mask) { + unsigned LHSElems = SVI.getOperand(0)->getType()->getVectorNumElements(); + unsigned MaskElems = Mask.size(); + unsigned BegIdx = Mask.front(); + unsigned EndIdx = Mask.back(); + if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1) + return false; + for (unsigned I = 0; I != MaskElems; ++I) + if (static_cast<unsigned>(Mask[I]) != BegIdx + I) + return false; + return true; +} + +/// These are the ingredients in an alternate form binary operator as described +/// below. +struct BinopElts { + BinaryOperator::BinaryOps Opcode; + Value *Op0; + Value *Op1; + BinopElts(BinaryOperator::BinaryOps Opc = (BinaryOperator::BinaryOps)0, + Value *V0 = nullptr, Value *V1 = nullptr) : + Opcode(Opc), Op0(V0), Op1(V1) {} + operator bool() const { return Opcode != 0; } +}; + +/// Binops may be transformed into binops with different opcodes and operands. +/// Reverse the usual canonicalization to enable folds with the non-canonical +/// form of the binop. If a transform is possible, return the elements of the +/// new binop. If not, return invalid elements. +static BinopElts getAlternateBinop(BinaryOperator *BO, const DataLayout &DL) { + Value *BO0 = BO->getOperand(0), *BO1 = BO->getOperand(1); + Type *Ty = BO->getType(); + switch (BO->getOpcode()) { + case Instruction::Shl: { + // shl X, C --> mul X, (1 << C) + Constant *C; + if (match(BO1, m_Constant(C))) { + Constant *ShlOne = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C); + return { Instruction::Mul, BO0, ShlOne }; + } + break; + } + case Instruction::Or: { + // or X, C --> add X, C (when X and C have no common bits set) + const APInt *C; + if (match(BO1, m_APInt(C)) && MaskedValueIsZero(BO0, *C, DL)) + return { Instruction::Add, BO0, BO1 }; + break; + } + default: + break; + } + return {}; +} + +static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) { + assert(Shuf.isSelect() && "Must have select-equivalent shuffle"); + + // Are we shuffling together some value and that same value after it has been + // modified by a binop with a constant? + Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1); + Constant *C; + bool Op0IsBinop; + if (match(Op0, m_BinOp(m_Specific(Op1), m_Constant(C)))) + Op0IsBinop = true; + else if (match(Op1, m_BinOp(m_Specific(Op0), m_Constant(C)))) + Op0IsBinop = false; + else + return nullptr; + + // The identity constant for a binop leaves a variable operand unchanged. For + // a vector, this is a splat of something like 0, -1, or 1. + // If there's no identity constant for this binop, we're done. + auto *BO = cast<BinaryOperator>(Op0IsBinop ? Op0 : Op1); + BinaryOperator::BinaryOps BOpcode = BO->getOpcode(); + Constant *IdC = ConstantExpr::getBinOpIdentity(BOpcode, Shuf.getType(), true); + if (!IdC) + return nullptr; + + // Shuffle identity constants into the lanes that return the original value. 
+ // Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4} + // Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4} + // The existing binop constant vector remains in the same operand position. + Constant *Mask = Shuf.getMask(); + Constant *NewC = Op0IsBinop ? ConstantExpr::getShuffleVector(C, IdC, Mask) : + ConstantExpr::getShuffleVector(IdC, C, Mask); + + bool MightCreatePoisonOrUB = + Mask->containsUndefElement() && + (Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode)); + if (MightCreatePoisonOrUB) + NewC = getSafeVectorConstantForBinop(BOpcode, NewC, true); + + // shuf (bop X, C), X, M --> bop X, C' + // shuf X, (bop X, C), M --> bop X, C' + Value *X = Op0IsBinop ? Op1 : Op0; + Instruction *NewBO = BinaryOperator::Create(BOpcode, X, NewC); + NewBO->copyIRFlags(BO); + + // An undef shuffle mask element may propagate as an undef constant element in + // the new binop. That would produce poison where the original code might not. + // If we already made a safe constant, then there's no danger. + if (Mask->containsUndefElement() && !MightCreatePoisonOrUB) + NewBO->dropPoisonGeneratingFlags(); + return NewBO; +} + +/// If we have an insert of a scalar to a non-zero element of an undefined +/// vector and then shuffle that value, that's the same as inserting to the zero +/// element and shuffling. Splatting from the zero element is recognized as the +/// canonical form of splat. +static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf, + InstCombiner::BuilderTy &Builder) { + Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1); + Constant *Mask = Shuf.getMask(); + Value *X; + uint64_t IndexC; + + // Match a shuffle that is a splat to a non-zero element. + if (!match(Op0, m_OneUse(m_InsertElement(m_Undef(), m_Value(X), + m_ConstantInt(IndexC)))) || + !match(Op1, m_Undef()) || match(Mask, m_ZeroInt()) || IndexC == 0) + return nullptr; + + // Insert into element 0 of an undef vector. + UndefValue *UndefVec = UndefValue::get(Shuf.getType()); + Constant *Zero = Builder.getInt32(0); + Value *NewIns = Builder.CreateInsertElement(UndefVec, X, Zero); + + // Splat from element 0. Any mask element that is undefined remains undefined. + // For example: + // shuf (inselt undef, X, 2), undef, <2,2,undef> + // --> shuf (inselt undef, X, 0), undef, <0,0,undef> + unsigned NumMaskElts = Shuf.getType()->getVectorNumElements(); + SmallVector<Constant *, 16> NewMask(NumMaskElts, Zero); + for (unsigned i = 0; i != NumMaskElts; ++i) + if (isa<UndefValue>(Mask->getAggregateElement(i))) + NewMask[i] = Mask->getAggregateElement(i); + + return new ShuffleVectorInst(NewIns, UndefVec, ConstantVector::get(NewMask)); +} + +/// Try to fold shuffles that are the equivalent of a vector select. +static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf, + InstCombiner::BuilderTy &Builder, + const DataLayout &DL) { + if (!Shuf.isSelect()) + return nullptr; + + // Canonicalize to choose from operand 0 first. + unsigned NumElts = Shuf.getType()->getVectorNumElements(); + if (Shuf.getMaskValue(0) >= (int)NumElts) { + // TODO: Can we assert that both operands of a shuffle-select are not undef + // (otherwise, it would have been folded by instsimplify? 
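    // Editorial illustration (not part of the original source): commuting
    // swaps the operands and remaps the mask, e.g. for 4 elements
    //   shufflevector %x, %y, <4, 1, 6, 3> --> shufflevector %y, %x, <0, 5, 2, 7>
    // so that lane 0 selects from operand 0.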
+ Shuf.commute(); + return &Shuf; + } + + if (Instruction *I = foldSelectShuffleWith1Binop(Shuf)) + return I; + + BinaryOperator *B0, *B1; + if (!match(Shuf.getOperand(0), m_BinOp(B0)) || + !match(Shuf.getOperand(1), m_BinOp(B1))) + return nullptr; + + Value *X, *Y; + Constant *C0, *C1; + bool ConstantsAreOp1; + if (match(B0, m_BinOp(m_Value(X), m_Constant(C0))) && + match(B1, m_BinOp(m_Value(Y), m_Constant(C1)))) + ConstantsAreOp1 = true; + else if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) && + match(B1, m_BinOp(m_Constant(C1), m_Value(Y)))) + ConstantsAreOp1 = false; + else + return nullptr; + + // We need matching binops to fold the lanes together. + BinaryOperator::BinaryOps Opc0 = B0->getOpcode(); + BinaryOperator::BinaryOps Opc1 = B1->getOpcode(); + bool DropNSW = false; + if (ConstantsAreOp1 && Opc0 != Opc1) { + // TODO: We drop "nsw" if shift is converted into multiply because it may + // not be correct when the shift amount is BitWidth - 1. We could examine + // each vector element to determine if it is safe to keep that flag. + if (Opc0 == Instruction::Shl || Opc1 == Instruction::Shl) + DropNSW = true; + if (BinopElts AltB0 = getAlternateBinop(B0, DL)) { + assert(isa<Constant>(AltB0.Op1) && "Expecting constant with alt binop"); + Opc0 = AltB0.Opcode; + C0 = cast<Constant>(AltB0.Op1); + } else if (BinopElts AltB1 = getAlternateBinop(B1, DL)) { + assert(isa<Constant>(AltB1.Op1) && "Expecting constant with alt binop"); + Opc1 = AltB1.Opcode; + C1 = cast<Constant>(AltB1.Op1); + } + } + + if (Opc0 != Opc1) + return nullptr; + + // The opcodes must be the same. Use a new name to make that clear. + BinaryOperator::BinaryOps BOpc = Opc0; + + // Select the constant elements needed for the single binop. + Constant *Mask = Shuf.getMask(); + Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Mask); + + // We are moving a binop after a shuffle. When a shuffle has an undefined + // mask element, the result is undefined, but it is not poison or undefined + // behavior. That is not necessarily true for div/rem/shift. + bool MightCreatePoisonOrUB = + Mask->containsUndefElement() && + (Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc)); + if (MightCreatePoisonOrUB) + NewC = getSafeVectorConstantForBinop(BOpc, NewC, ConstantsAreOp1); + + Value *V; + if (X == Y) { + // Remove a binop and the shuffle by rearranging the constant: + // shuffle (op V, C0), (op V, C1), M --> op V, C' + // shuffle (op C0, V), (op C1, V), M --> op C', V + V = X; + } else { + // If there are 2 different variable operands, we must create a new shuffle + // (select) first, so check uses to ensure that we don't end up with more + // instructions than we started with. + if (!B0->hasOneUse() && !B1->hasOneUse()) + return nullptr; + + // If we use the original shuffle mask and op1 is *variable*, we would be + // putting an undef into operand 1 of div/rem/shift. This is either UB or + // poison. We do not have to guard against UB when *constants* are op1 + // because safe constants guarantee that we do not overflow sdiv/srem (and + // there's no danger for other opcodes). + // TODO: To allow this case, create a new shuffle mask with no undefs. + if (MightCreatePoisonOrUB && !ConstantsAreOp1) + return nullptr; + + // Note: In general, we do not create new shuffles in InstCombine because we + // do not know if a target can lower an arbitrary shuffle optimally. In this + // case, the shuffle uses the existing mask, so there is no additional risk. 
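    // Editorial illustration (not part of the original source), a concrete
    // instance of the fold below with the constants as operand 1:
    //   shufflevector (add %x, <1,2,3,4>), (add %y, <5,6,7,8>), <0,5,2,7>
    //   --> add (shufflevector %x, %y, <0,5,2,7>), <1,6,3,8>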
+ + // Select the variable vectors first, then perform the binop: + // shuffle (op X, C0), (op Y, C1), M --> op (shuffle X, Y, M), C' + // shuffle (op C0, X), (op C1, Y), M --> op C', (shuffle X, Y, M) + V = Builder.CreateShuffleVector(X, Y, Mask); + } + + Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) : + BinaryOperator::Create(BOpc, NewC, V); + + // Flags are intersected from the 2 source binops. But there are 2 exceptions: + // 1. If we changed an opcode, poison conditions might have changed. + // 2. If the shuffle had undef mask elements, the new binop might have undefs + // where the original code did not. But if we already made a safe constant, + // then there's no danger. + NewBO->copyIRFlags(B0); + NewBO->andIRFlags(B1); + if (DropNSW) + NewBO->setHasNoSignedWrap(false); + if (Mask->containsUndefElement() && !MightCreatePoisonOrUB) + NewBO->dropPoisonGeneratingFlags(); + return NewBO; +} + +/// Match a shuffle-select-shuffle pattern where the shuffles are widening and +/// narrowing (concatenating with undef and extracting back to the original +/// length). This allows replacing the wide select with a narrow select. +static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf, + InstCombiner::BuilderTy &Builder) { + // This must be a narrowing identity shuffle. It extracts the 1st N elements + // of the 1st vector operand of a shuffle. + if (!match(Shuf.getOperand(1), m_Undef()) || !Shuf.isIdentityWithExtract()) + return nullptr; + + // The vector being shuffled must be a vector select that we can eliminate. + // TODO: The one-use requirement could be eased if X and/or Y are constants. + Value *Cond, *X, *Y; + if (!match(Shuf.getOperand(0), + m_OneUse(m_Select(m_Value(Cond), m_Value(X), m_Value(Y))))) + return nullptr; + + // We need a narrow condition value. It must be extended with undef elements + // and have the same number of elements as this shuffle. + unsigned NarrowNumElts = Shuf.getType()->getVectorNumElements(); + Value *NarrowCond; + if (!match(Cond, m_OneUse(m_ShuffleVector(m_Value(NarrowCond), m_Undef(), + m_Constant()))) || + NarrowCond->getType()->getVectorNumElements() != NarrowNumElts || + !cast<ShuffleVectorInst>(Cond)->isIdentityWithPadding()) + return nullptr; + + // shuf (sel (shuf NarrowCond, undef, WideMask), X, Y), undef, NarrowMask) --> + // sel NarrowCond, (shuf X, undef, NarrowMask), (shuf Y, undef, NarrowMask) + Value *Undef = UndefValue::get(X->getType()); + Value *NarrowX = Builder.CreateShuffleVector(X, Undef, Shuf.getMask()); + Value *NarrowY = Builder.CreateShuffleVector(Y, Undef, Shuf.getMask()); + return SelectInst::Create(NarrowCond, NarrowX, NarrowY); +} + +/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask. +static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) { + Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1); + if (!Shuf.isIdentityWithExtract() || !isa<UndefValue>(Op1)) + return nullptr; + + Value *X, *Y; + Constant *Mask; + if (!match(Op0, m_ShuffleVector(m_Value(X), m_Value(Y), m_Constant(Mask)))) + return nullptr; + + // Be conservative with shuffle transforms. If we can't kill the 1st shuffle, + // then combining may result in worse codegen. + if (!Op0->hasOneUse()) + return nullptr; + + // We are extracting a subvector from a shuffle. Remove excess elements from + // the 1st shuffle mask to eliminate the extract. 
+ // + // This transform is conservatively limited to identity extracts because we do + // not allow arbitrary shuffle mask creation as a target-independent transform + // (because we can't guarantee that will lower efficiently). + // + // If the extracting shuffle has an undef mask element, it transfers to the + // new shuffle mask. Otherwise, copy the original mask element. Example: + // shuf (shuf X, Y, <C0, C1, C2, undef, C4>), undef, <0, undef, 2, 3> --> + // shuf X, Y, <C0, undef, C2, undef> + unsigned NumElts = Shuf.getType()->getVectorNumElements(); + SmallVector<Constant *, 16> NewMask(NumElts); + assert(NumElts < Mask->getType()->getVectorNumElements() && + "Identity with extract must have less elements than its inputs"); + + for (unsigned i = 0; i != NumElts; ++i) { + Constant *ExtractMaskElt = Shuf.getMask()->getAggregateElement(i); + Constant *MaskElt = Mask->getAggregateElement(i); + NewMask[i] = isa<UndefValue>(ExtractMaskElt) ? ExtractMaskElt : MaskElt; + } + return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask)); +} + +/// Try to replace a shuffle with an insertelement. +static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf) { + Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1); + SmallVector<int, 16> Mask = Shuf.getShuffleMask(); + + // The shuffle must not change vector sizes. + // TODO: This restriction could be removed if the insert has only one use + // (because the transform would require a new length-changing shuffle). + int NumElts = Mask.size(); + if (NumElts != (int)(V0->getType()->getVectorNumElements())) + return nullptr; + + // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC' + auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) { + // We need an insertelement with a constant index. + if (!match(V0, m_InsertElement(m_Value(), m_Value(Scalar), + m_ConstantInt(IndexC)))) + return false; + + // Test the shuffle mask to see if it splices the inserted scalar into the + // operand 1 vector of the shuffle. + int NewInsIndex = -1; + for (int i = 0; i != NumElts; ++i) { + // Ignore undef mask elements. + if (Mask[i] == -1) + continue; + + // The shuffle takes elements of operand 1 without lane changes. + if (Mask[i] == NumElts + i) + continue; + + // The shuffle must choose the inserted scalar exactly once. + if (NewInsIndex != -1 || Mask[i] != IndexC->getSExtValue()) + return false; + + // The shuffle is placing the inserted scalar into element i. + NewInsIndex = i; + } + + assert(NewInsIndex != -1 && "Did not fold shuffle with unused operand?"); + + // Index is updated to the potentially translated insertion lane. + IndexC = ConstantInt::get(IndexC->getType(), NewInsIndex); + return true; + }; + + // If the shuffle is unnecessary, insert the scalar operand directly into + // operand 1 of the shuffle. Example: + // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0 + Value *Scalar; + ConstantInt *IndexC; + if (isShufflingScalarIntoOp1(Scalar, IndexC)) + return InsertElementInst::Create(V1, Scalar, IndexC); + + // Try again after commuting shuffle. 
Example: + // shuffle V0, (insert ?, S, 0), <0, 1, 2, 4> --> + // shuffle (insert ?, S, 0), V0, <4, 5, 6, 0> --> insert V0, S, 3 + std::swap(V0, V1); + ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); + if (isShufflingScalarIntoOp1(Scalar, IndexC)) + return InsertElementInst::Create(V1, Scalar, IndexC); + + return nullptr; +} + +static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) { + // Match the operands as identity with padding (also known as concatenation + // with undef) shuffles of the same source type. The backend is expected to + // recreate these concatenations from a shuffle of narrow operands. + auto *Shuffle0 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(0)); + auto *Shuffle1 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(1)); + if (!Shuffle0 || !Shuffle0->isIdentityWithPadding() || + !Shuffle1 || !Shuffle1->isIdentityWithPadding()) + return nullptr; + + // We limit this transform to power-of-2 types because we expect that the + // backend can convert the simplified IR patterns to identical nodes as the + // original IR. + // TODO: If we can verify the same behavior for arbitrary types, the + // power-of-2 checks can be removed. + Value *X = Shuffle0->getOperand(0); + Value *Y = Shuffle1->getOperand(0); + if (X->getType() != Y->getType() || + !isPowerOf2_32(Shuf.getType()->getVectorNumElements()) || + !isPowerOf2_32(Shuffle0->getType()->getVectorNumElements()) || + !isPowerOf2_32(X->getType()->getVectorNumElements()) || + isa<UndefValue>(X) || isa<UndefValue>(Y)) + return nullptr; + assert(isa<UndefValue>(Shuffle0->getOperand(1)) && + isa<UndefValue>(Shuffle1->getOperand(1)) && + "Unexpected operand for identity shuffle"); + + // This is a shuffle of 2 widening shuffles. We can shuffle the narrow source + // operands directly by adjusting the shuffle mask to account for the narrower + // types: + // shuf (widen X), (widen Y), Mask --> shuf X, Y, Mask' + int NarrowElts = X->getType()->getVectorNumElements(); + int WideElts = Shuffle0->getType()->getVectorNumElements(); + assert(WideElts > NarrowElts && "Unexpected types for identity with padding"); + + Type *I32Ty = IntegerType::getInt32Ty(Shuf.getContext()); + SmallVector<int, 16> Mask = Shuf.getShuffleMask(); + SmallVector<Constant *, 16> NewMask(Mask.size(), UndefValue::get(I32Ty)); + for (int i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] == -1) + continue; + + // If this shuffle is choosing an undef element from 1 of the sources, that + // element is undef. + if (Mask[i] < WideElts) { + if (Shuffle0->getMaskValue(Mask[i]) == -1) + continue; + } else { + if (Shuffle1->getMaskValue(Mask[i] - WideElts) == -1) + continue; + } + + // If this shuffle is choosing from the 1st narrow op, the mask element is + // the same. If this shuffle is choosing from the 2nd narrow op, the mask + // element is offset down to adjust for the narrow vector widths. 
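    // Editorial illustration (not part of the original source): if X and Y are
    // <2 x i32> widened to <4 x i32> (WideElts = 4, NarrowElts = 2), a wide
    // mask element of 5 selects lane 1 of the second wide operand and becomes
    // 5 - (4 - 2) = 3, i.e. lane 1 of Y in the narrow shuffle.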
+ if (Mask[i] < WideElts) { + assert(Mask[i] < NarrowElts && "Unexpected shuffle mask"); + NewMask[i] = ConstantInt::get(I32Ty, Mask[i]); + } else { + assert(Mask[i] < (WideElts + NarrowElts) && "Unexpected shuffle mask"); + NewMask[i] = ConstantInt::get(I32Ty, Mask[i] - (WideElts - NarrowElts)); + } + } + return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask)); +} + +Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { + Value *LHS = SVI.getOperand(0); + Value *RHS = SVI.getOperand(1); + if (auto *V = SimplifyShuffleVectorInst( + LHS, RHS, SVI.getMask(), SVI.getType(), SQ.getWithInstruction(&SVI))) + return replaceInstUsesWith(SVI, V); + + // Canonicalize shuffle(x ,x,mask) -> shuffle(x, undef,mask') + // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask'). + unsigned VWidth = SVI.getType()->getVectorNumElements(); + unsigned LHSWidth = LHS->getType()->getVectorNumElements(); + SmallVector<int, 16> Mask = SVI.getShuffleMask(); + Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); + if (LHS == RHS || isa<UndefValue>(LHS)) { + // Remap any references to RHS to use LHS. + SmallVector<Constant*, 16> Elts; + for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) { + if (Mask[i] < 0) { + Elts.push_back(UndefValue::get(Int32Ty)); + continue; + } + + if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) || + (Mask[i] < (int)e && isa<UndefValue>(LHS))) { + Mask[i] = -1; // Turn into undef. + Elts.push_back(UndefValue::get(Int32Ty)); + } else { + Mask[i] = Mask[i] % e; // Force to LHS. + Elts.push_back(ConstantInt::get(Int32Ty, Mask[i])); + } + } + SVI.setOperand(0, SVI.getOperand(1)); + SVI.setOperand(1, UndefValue::get(RHS->getType())); + SVI.setOperand(2, ConstantVector::get(Elts)); + return &SVI; + } + + if (Instruction *I = canonicalizeInsertSplat(SVI, Builder)) + return I; + + if (Instruction *I = foldSelectShuffle(SVI, Builder, DL)) + return I; + + if (Instruction *I = narrowVectorSelect(SVI, Builder)) + return I; + + APInt UndefElts(VWidth, 0); + APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); + if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) { + if (V != &SVI) + return replaceInstUsesWith(SVI, V); + return &SVI; + } + + if (Instruction *I = foldIdentityExtractShuffle(SVI)) + return I; + + // These transforms have the potential to lose undef knowledge, so they are + // intentionally placed after SimplifyDemandedVectorElts(). + if (Instruction *I = foldShuffleWithInsert(SVI)) + return I; + if (Instruction *I = foldIdentityPaddedShuffles(SVI)) + return I; + + if (VWidth == LHSWidth) { + // Analyze the shuffle, are the LHS or RHS and identity shuffles? + bool isLHSID, isRHSID; + recognizeIdentityMask(Mask, isLHSID, isRHSID); + + // Eliminate identity shuffles. + if (isLHSID) return replaceInstUsesWith(SVI, LHS); + if (isRHSID) return replaceInstUsesWith(SVI, RHS); + } + + if (isa<UndefValue>(RHS) && canEvaluateShuffled(LHS, Mask)) { + Value *V = evaluateInDifferentElementOrder(LHS, Mask); + return replaceInstUsesWith(SVI, V); + } + + // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to + // a non-vector type. 
We can instead bitcast the original vector followed by + // an extract of the desired element: + // + // %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, + // <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // %1 = bitcast <4 x i8> %sroa to i32 + // Becomes: + // %bc = bitcast <16 x i8> %in to <4 x i32> + // %ext = extractelement <4 x i32> %bc, i32 0 + // + // If the shuffle is extracting a contiguous range of values from the input + // vector then each use which is a bitcast of the extracted size can be + // replaced. This will work if the vector types are compatible, and the begin + // index is aligned to a value in the casted vector type. If the begin index + // isn't aligned then we can shuffle the original vector (keeping the same + // vector type) before extracting. + // + // This code will bail out if the target type is fundamentally incompatible + // with vectors of the source type. + // + // Example of <16 x i8>, target type i32: + // Index range [4,8): v-----------v Will work. + // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + // <16 x i8>: | | | | | | | | | | | | | | | | | + // <4 x i32>: | | | | | + // +-----------+-----------+-----------+-----------+ + // Index range [6,10): ^-----------^ Needs an extra shuffle. + // Target type i40: ^--------------^ Won't work, bail. + bool MadeChange = false; + if (isShuffleExtractingFromLHS(SVI, Mask)) { + Value *V = LHS; + unsigned MaskElems = Mask.size(); + VectorType *SrcTy = cast<VectorType>(V->getType()); + unsigned VecBitWidth = SrcTy->getBitWidth(); + unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType()); + assert(SrcElemBitWidth && "vector elements must have a bitwidth"); + unsigned SrcNumElems = SrcTy->getNumElements(); + SmallVector<BitCastInst *, 8> BCs; + DenseMap<Type *, Value *> NewBCs; + for (User *U : SVI.users()) + if (BitCastInst *BC = dyn_cast<BitCastInst>(U)) + if (!BC->use_empty()) + // Only visit bitcasts that weren't previously handled. + BCs.push_back(BC); + for (BitCastInst *BC : BCs) { + unsigned BegIdx = Mask.front(); + Type *TgtTy = BC->getDestTy(); + unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy); + if (!TgtElemBitWidth) + continue; + unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth; + bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth; + bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth); + if (!VecBitWidthsEqual) + continue; + if (!VectorType::isValidElementType(TgtTy)) + continue; + VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems); + if (!BegIsAligned) { + // Shuffle the input so [0,NumElements) contains the output, and + // [NumElems,SrcNumElems) is undef. + SmallVector<Constant *, 16> ShuffleMask(SrcNumElems, + UndefValue::get(Int32Ty)); + for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I) + ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx); + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), + ConstantVector::get(ShuffleMask), + SVI.getName() + ".extract"); + BegIdx = 0; + } + unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth; + assert(SrcElemsPerTgtElem); + BegIdx /= SrcElemsPerTgtElem; + bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end(); + auto *NewBC = + BCAlreadyExists + ? 
NewBCs[CastSrcTy] + : Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc"); + if (!BCAlreadyExists) + NewBCs[CastSrcTy] = NewBC; + auto *Ext = Builder.CreateExtractElement( + NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract"); + // The shufflevector isn't being replaced: the bitcast that used it + // is. InstCombine will visit the newly-created instructions. + replaceInstUsesWith(*BC, Ext); + MadeChange = true; + } + } + + // If the LHS is a shufflevector itself, see if we can combine it with this + // one without producing an unusual shuffle. + // Cases that might be simplified: + // 1. + // x1=shuffle(v1,v2,mask1) + // x=shuffle(x1,undef,mask) + // ==> + // x=shuffle(v1,undef,newMask) + // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : -1 + // 2. + // x1=shuffle(v1,undef,mask1) + // x=shuffle(x1,x2,mask) + // where v1.size() == mask1.size() + // ==> + // x=shuffle(v1,x2,newMask) + // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : mask[i] + // 3. + // x2=shuffle(v2,undef,mask2) + // x=shuffle(x1,x2,mask) + // where v2.size() == mask2.size() + // ==> + // x=shuffle(x1,v2,newMask) + // newMask[i] = (mask[i] < x1.size()) + // ? mask[i] : mask2[mask[i]-x1.size()]+x1.size() + // 4. + // x1=shuffle(v1,undef,mask1) + // x2=shuffle(v2,undef,mask2) + // x=shuffle(x1,x2,mask) + // where v1.size() == v2.size() + // ==> + // x=shuffle(v1,v2,newMask) + // newMask[i] = (mask[i] < x1.size()) + // ? mask1[mask[i]] : mask2[mask[i]-x1.size()]+v1.size() + // + // Here we are really conservative: + // we are absolutely afraid of producing a shuffle mask not in the input + // program, because the code gen may not be smart enough to turn a merged + // shuffle into two specific shuffles: it may produce worse code. As such, + // we only merge two shuffles if the result is either a splat or one of the + // input shuffle masks. In this case, merging the shuffles just removes + // one instruction, which we know is safe. This is good for things like + // turning: (splat(splat)) -> splat, or + // merge(V[0..n], V[n+1..2n]) -> V[0..2n] + ShuffleVectorInst* LHSShuffle = dyn_cast<ShuffleVectorInst>(LHS); + ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS); + if (LHSShuffle) + if (!isa<UndefValue>(LHSShuffle->getOperand(1)) && !isa<UndefValue>(RHS)) + LHSShuffle = nullptr; + if (RHSShuffle) + if (!isa<UndefValue>(RHSShuffle->getOperand(1))) + RHSShuffle = nullptr; + if (!LHSShuffle && !RHSShuffle) + return MadeChange ? &SVI : nullptr; + + Value* LHSOp0 = nullptr; + Value* LHSOp1 = nullptr; + Value* RHSOp0 = nullptr; + unsigned LHSOp0Width = 0; + unsigned RHSOp0Width = 0; + if (LHSShuffle) { + LHSOp0 = LHSShuffle->getOperand(0); + LHSOp1 = LHSShuffle->getOperand(1); + LHSOp0Width = LHSOp0->getType()->getVectorNumElements(); + } + if (RHSShuffle) { + RHSOp0 = RHSShuffle->getOperand(0); + RHSOp0Width = RHSOp0->getType()->getVectorNumElements(); + } + Value* newLHS = LHS; + Value* newRHS = RHS; + if (LHSShuffle) { + // case 1 + if (isa<UndefValue>(RHS)) { + newLHS = LHSOp0; + newRHS = LHSOp1; + } + // case 2 or 4 + else if (LHSOp0Width == LHSWidth) { + newLHS = LHSOp0; + } + } + // case 3 or 4 + if (RHSShuffle && RHSOp0Width == LHSWidth) { + newRHS = RHSOp0; + } + // case 4 + if (LHSOp0 == RHSOp0) { + newLHS = LHSOp0; + newRHS = nullptr; + } + + if (newLHS == LHS && newRHS == RHS) + return MadeChange ? 
&SVI : nullptr; + + SmallVector<int, 16> LHSMask; + SmallVector<int, 16> RHSMask; + if (newLHS != LHS) + LHSMask = LHSShuffle->getShuffleMask(); + if (RHSShuffle && newRHS != RHS) + RHSMask = RHSShuffle->getShuffleMask(); + + unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth; + SmallVector<int, 16> newMask; + bool isSplat = true; + int SplatElt = -1; + // Create a new mask for the new ShuffleVectorInst so that the new + // ShuffleVectorInst is equivalent to the original one. + for (unsigned i = 0; i < VWidth; ++i) { + int eltMask; + if (Mask[i] < 0) { + // This element is an undef value. + eltMask = -1; + } else if (Mask[i] < (int)LHSWidth) { + // This element is from left hand side vector operand. + // + // If LHS is going to be replaced (case 1, 2, or 4), calculate the + // new mask value for the element. + if (newLHS != LHS) { + eltMask = LHSMask[Mask[i]]; + // If the value selected is an undef value, explicitly specify it + // with a -1 mask value. + if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1)) + eltMask = -1; + } else + eltMask = Mask[i]; + } else { + // This element is from right hand side vector operand + // + // If the value selected is an undef value, explicitly specify it + // with a -1 mask value. (case 1) + if (isa<UndefValue>(RHS)) + eltMask = -1; + // If RHS is going to be replaced (case 3 or 4), calculate the + // new mask value for the element. + else if (newRHS != RHS) { + eltMask = RHSMask[Mask[i]-LHSWidth]; + // If the value selected is an undef value, explicitly specify it + // with a -1 mask value. + if (eltMask >= (int)RHSOp0Width) { + assert(isa<UndefValue>(RHSShuffle->getOperand(1)) + && "should have been check above"); + eltMask = -1; + } + } else + eltMask = Mask[i]-LHSWidth; + + // If LHS's width is changed, shift the mask value accordingly. + // If newRHS == nullptr, i.e. LHSOp0 == RHSOp0, we want to remap any + // references from RHSOp0 to LHSOp0, so we don't need to shift the mask. + // If newRHS == newLHS, we want to remap any references from newRHS to + // newLHS so that we can properly identify splats that may occur due to + // obfuscation across the two vectors. + if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS) + eltMask += newLHSWidth; + } + + // Check if this could still be a splat. + if (eltMask >= 0) { + if (SplatElt >= 0 && SplatElt != eltMask) + isSplat = false; + SplatElt = eltMask; + } + + newMask.push_back(eltMask); + } + + // If the result mask is equal to one of the original shuffle masks, + // or is a splat, do the replacement. + if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) { + SmallVector<Constant*, 16> Elts; + for (unsigned i = 0, e = newMask.size(); i != e; ++i) { + if (newMask[i] < 0) { + Elts.push_back(UndefValue::get(Int32Ty)); + } else { + Elts.push_back(ConstantInt::get(Int32Ty, newMask[i])); + } + } + if (!newRHS) + newRHS = UndefValue::get(newLHS->getType()); + return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts)); + } + + // If the result mask is an identity, replace uses of this instruction with + // corresponding argument. + bool isLHSID, isRHSID; + recognizeIdentityMask(newMask, isLHSID, isRHSID); + if (isLHSID && VWidth == LHSOp0Width) return replaceInstUsesWith(SVI, newLHS); + if (isRHSID && VWidth == RHSOp0Width) return replaceInstUsesWith(SVI, newRHS); + + return MadeChange ? 
&SVI : nullptr; +} diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp new file mode 100644 index 000000000000..ecb486c544e0 --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -0,0 +1,3652 @@ +//===- InstructionCombining.cpp - Combine multiple instructions -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// InstructionCombining - Combine instructions to form fewer, simple +// instructions. This pass does not modify the CFG. This pass is where +// algebraic simplification happens. +// +// This pass combines things like: +// %Y = add i32 %X, 1 +// %Z = add i32 %Y, 1 +// into: +// %Z = add i32 %X, 2 +// +// This is a simple worklist driven algorithm. +// +// This pass guarantees that the following canonicalizations are performed on +// the program: +// 1. If a binary operator has a constant operand, it is moved to the RHS +// 2. Bitwise operators with constant operands are always grouped so that +// shifts are performed first, then or's, then and's, then xor's. +// 3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible +// 4. All cmp instructions on boolean values are replaced with logical ops +// 5. add X, X is represented as (X*2) => (X << 1) +// 6. Multiplies with a power-of-two constant argument are transformed into +// shifts. +// ... etc. +// +//===----------------------------------------------------------------------===// + +#include "InstCombineInternal.h" +#include "llvm-c/Initialization.h" +#include "llvm-c/Transforms/InstCombine.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LazyBlockFrequencyInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetFolder.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" +#include 
"llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/CBindingWrapping.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DebugCounter.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/Utils/Local.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <memory> +#include <string> +#include <utility> + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "instcombine" + +STATISTIC(NumCombined , "Number of insts combined"); +STATISTIC(NumConstProp, "Number of constant folds"); +STATISTIC(NumDeadInst , "Number of dead inst eliminated"); +STATISTIC(NumSunkInst , "Number of instructions sunk"); +STATISTIC(NumExpand, "Number of expansions"); +STATISTIC(NumFactor , "Number of factorizations"); +STATISTIC(NumReassoc , "Number of reassociations"); +DEBUG_COUNTER(VisitCounter, "instcombine-visit", + "Controls which instructions are visited"); + +static cl::opt<bool> +EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), + cl::init(true)); + +static cl::opt<bool> +EnableExpensiveCombines("expensive-combines", + cl::desc("Enable expensive instruction combines")); + +static cl::opt<unsigned> +MaxArraySize("instcombine-maxarray-size", cl::init(1024), + cl::desc("Maximum array size considered when doing a combine")); + +// FIXME: Remove this flag when it is no longer necessary to convert +// llvm.dbg.declare to avoid inaccurate debug info. Setting this to false +// increases variable availability at the cost of accuracy. Variables that +// cannot be promoted by mem2reg or SROA will be described as living in memory +// for their entire lifetime. However, passes like DSE and instcombine can +// delete stores to the alloca, leading to misleading and inaccurate debug +// information. This flag can be removed when those passes are fixed. +static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare", + cl::Hidden, cl::init(true)); + +Value *InstCombiner::EmitGEPOffset(User *GEP) { + return llvm::EmitGEPOffset(&Builder, DL, GEP); +} + +/// Return true if it is desirable to convert an integer computation from a +/// given bit width to a new bit width. +/// We don't want to convert from a legal to an illegal type or from a smaller +/// to a larger illegal type. A width of '1' is always treated as a legal type +/// because i1 is a fundamental type in IR, and there are many specialized +/// optimizations for i1 types. Widths of 8, 16 or 32 are equally treated as +/// legal to convert to, in order to open up more combining opportunities. +/// NOTE: this treats i8, i16 and i32 specially, due to them being so common +/// from frontend languages. +bool InstCombiner::shouldChangeType(unsigned FromWidth, + unsigned ToWidth) const { + bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth); + bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth); + + // Convert to widths of 8, 16 or 32 even if they are not legal types. Only + // shrink types, to prevent infinite loops. 
+ if (ToWidth < FromWidth && (ToWidth == 8 || ToWidth == 16 || ToWidth == 32)) + return true; + + // If this is a legal integer from type, and the result would be an illegal + // type, don't do the transformation. + if (FromLegal && !ToLegal) + return false; + + // Otherwise, if both are illegal, do not increase the size of the result. We + // do allow things like i160 -> i64, but not i64 -> i160. + if (!FromLegal && !ToLegal && ToWidth > FromWidth) + return false; + + return true; +} + +/// Return true if it is desirable to convert a computation from 'From' to 'To'. +/// We don't want to convert from a legal to an illegal type or from a smaller +/// to a larger illegal type. i1 is always treated as a legal type because it is +/// a fundamental type in IR, and there are many specialized optimizations for +/// i1 types. +bool InstCombiner::shouldChangeType(Type *From, Type *To) const { + // TODO: This could be extended to allow vectors. Datalayout changes might be + // needed to properly support that. + if (!From->isIntegerTy() || !To->isIntegerTy()) + return false; + + unsigned FromWidth = From->getPrimitiveSizeInBits(); + unsigned ToWidth = To->getPrimitiveSizeInBits(); + return shouldChangeType(FromWidth, ToWidth); +} + +// Return true, if No Signed Wrap should be maintained for I. +// The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C", +// where both B and C should be ConstantInts, results in a constant that does +// not overflow. This function only handles the Add and Sub opcodes. For +// all other opcodes, the function conservatively returns false. +static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { + auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I); + if (!OBO || !OBO->hasNoSignedWrap()) + return false; + + // We reason about Add and Sub Only. + Instruction::BinaryOps Opcode = I.getOpcode(); + if (Opcode != Instruction::Add && Opcode != Instruction::Sub) + return false; + + const APInt *BVal, *CVal; + if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal))) + return false; + + bool Overflow = false; + if (Opcode == Instruction::Add) + (void)BVal->sadd_ov(*CVal, Overflow); + else + (void)BVal->ssub_ov(*CVal, Overflow); + + return !Overflow; +} + +static bool hasNoUnsignedWrap(BinaryOperator &I) { + auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I); + return OBO && OBO->hasNoUnsignedWrap(); +} + +static bool hasNoSignedWrap(BinaryOperator &I) { + auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I); + return OBO && OBO->hasNoSignedWrap(); +} + +/// Conservatively clears subclassOptionalData after a reassociation or +/// commutation. We preserve fast-math flags when applicable as they can be +/// preserved. +static void ClearSubclassDataAfterReassociation(BinaryOperator &I) { + FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I); + if (!FPMO) { + I.clearSubclassOptionalData(); + return; + } + + FastMathFlags FMF = I.getFastMathFlags(); + I.clearSubclassOptionalData(); + I.setFastMathFlags(FMF); +} + +/// Combine constant operands of associative operations either before or after a +/// cast to eliminate one of the associative operations: +/// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2))) +/// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2)) +static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1) { + auto *Cast = dyn_cast<CastInst>(BinOp1->getOperand(0)); + if (!Cast || !Cast->hasOneUse()) + return false; + + // TODO: Enhance logic for other casts and remove this check. 
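+  // For example: xor (zext (xor i8 X, 5) to i32), 16
+  //          --> xor (zext i8 X to i32), 21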
+ auto CastOpcode = Cast->getOpcode(); + if (CastOpcode != Instruction::ZExt) + return false; + + // TODO: Enhance logic for other BinOps and remove this check. + if (!BinOp1->isBitwiseLogicOp()) + return false; + + auto AssocOpcode = BinOp1->getOpcode(); + auto *BinOp2 = dyn_cast<BinaryOperator>(Cast->getOperand(0)); + if (!BinOp2 || !BinOp2->hasOneUse() || BinOp2->getOpcode() != AssocOpcode) + return false; + + Constant *C1, *C2; + if (!match(BinOp1->getOperand(1), m_Constant(C1)) || + !match(BinOp2->getOperand(1), m_Constant(C2))) + return false; + + // TODO: This assumes a zext cast. + // Eg, if it was a trunc, we'd cast C1 to the source type because casting C2 + // to the destination type might lose bits. + + // Fold the constants together in the destination type: + // (op (cast (op X, C2)), C1) --> (op (cast X), FoldedC) + Type *DestTy = C1->getType(); + Constant *CastC2 = ConstantExpr::getCast(CastOpcode, C2, DestTy); + Constant *FoldedC = ConstantExpr::get(AssocOpcode, C1, CastC2); + Cast->setOperand(0, BinOp2->getOperand(0)); + BinOp1->setOperand(1, FoldedC); + return true; +} + +/// This performs a few simplifications for operators that are associative or +/// commutative: +/// +/// Commutative operators: +/// +/// 1. Order operands such that they are listed from right (least complex) to +/// left (most complex). This puts constants before unary operators before +/// binary operators. +/// +/// Associative operators: +/// +/// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies. +/// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies. +/// +/// Associative and commutative operators: +/// +/// 4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies. +/// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. +/// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" +/// if C1 and C2 are constants. +bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { + Instruction::BinaryOps Opcode = I.getOpcode(); + bool Changed = false; + + do { + // Order operands such that they are listed from right (least complex) to + // left (most complex). This puts constants before unary operators before + // binary operators. + if (I.isCommutative() && getComplexity(I.getOperand(0)) < + getComplexity(I.getOperand(1))) + Changed = !I.swapOperands(); + + BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0)); + BinaryOperator *Op1 = dyn_cast<BinaryOperator>(I.getOperand(1)); + + if (I.isAssociative()) { + // Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies. + if (Op0 && Op0->getOpcode() == Opcode) { + Value *A = Op0->getOperand(0); + Value *B = Op0->getOperand(1); + Value *C = I.getOperand(1); + + // Does "B op C" simplify? + if (Value *V = SimplifyBinOp(Opcode, B, C, SQ.getWithInstruction(&I))) { + // It simplifies to V. Form "A op V". + I.setOperand(0, A); + I.setOperand(1, V); + bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0); + bool IsNSW = maintainNoSignedWrap(I, B, C) && hasNoSignedWrap(*Op0); + + // Conservatively clear all optional flags since they may not be + // preserved by the reassociation. Reset nsw/nuw based on the above + // analysis. + ClearSubclassDataAfterReassociation(I); + + // Note: this is only valid because SimplifyBinOp doesn't look at + // the operands to Op0. 
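+          // For example, ((X +nuw 1) +nuw 2) becomes X +nuw 3 here: nuw is
+          // kept because both original adds had it, while nsw is only kept
+          // when the folded constants provably do not overflow.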
+ if (IsNUW) + I.setHasNoUnsignedWrap(true); + + if (IsNSW) + I.setHasNoSignedWrap(true); + + Changed = true; + ++NumReassoc; + continue; + } + } + + // Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies. + if (Op1 && Op1->getOpcode() == Opcode) { + Value *A = I.getOperand(0); + Value *B = Op1->getOperand(0); + Value *C = Op1->getOperand(1); + + // Does "A op B" simplify? + if (Value *V = SimplifyBinOp(Opcode, A, B, SQ.getWithInstruction(&I))) { + // It simplifies to V. Form "V op C". + I.setOperand(0, V); + I.setOperand(1, C); + // Conservatively clear the optional flags, since they may not be + // preserved by the reassociation. + ClearSubclassDataAfterReassociation(I); + Changed = true; + ++NumReassoc; + continue; + } + } + } + + if (I.isAssociative() && I.isCommutative()) { + if (simplifyAssocCastAssoc(&I)) { + Changed = true; + ++NumReassoc; + continue; + } + + // Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies. + if (Op0 && Op0->getOpcode() == Opcode) { + Value *A = Op0->getOperand(0); + Value *B = Op0->getOperand(1); + Value *C = I.getOperand(1); + + // Does "C op A" simplify? + if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) { + // It simplifies to V. Form "V op B". + I.setOperand(0, V); + I.setOperand(1, B); + // Conservatively clear the optional flags, since they may not be + // preserved by the reassociation. + ClearSubclassDataAfterReassociation(I); + Changed = true; + ++NumReassoc; + continue; + } + } + + // Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. + if (Op1 && Op1->getOpcode() == Opcode) { + Value *A = I.getOperand(0); + Value *B = Op1->getOperand(0); + Value *C = Op1->getOperand(1); + + // Does "C op A" simplify? + if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) { + // It simplifies to V. Form "B op V". + I.setOperand(0, B); + I.setOperand(1, V); + // Conservatively clear the optional flags, since they may not be + // preserved by the reassociation. + ClearSubclassDataAfterReassociation(I); + Changed = true; + ++NumReassoc; + continue; + } + } + + // Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" + // if C1 and C2 are constants. + Value *A, *B; + Constant *C1, *C2; + if (Op0 && Op1 && + Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode && + match(Op0, m_OneUse(m_BinOp(m_Value(A), m_Constant(C1)))) && + match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2))))) { + bool IsNUW = hasNoUnsignedWrap(I) && + hasNoUnsignedWrap(*Op0) && + hasNoUnsignedWrap(*Op1); + BinaryOperator *NewBO = (IsNUW && Opcode == Instruction::Add) ? + BinaryOperator::CreateNUW(Opcode, A, B) : + BinaryOperator::Create(Opcode, A, B); + + if (isa<FPMathOperator>(NewBO)) { + FastMathFlags Flags = I.getFastMathFlags(); + Flags &= Op0->getFastMathFlags(); + Flags &= Op1->getFastMathFlags(); + NewBO->setFastMathFlags(Flags); + } + InsertNewInstWith(NewBO, I); + NewBO->takeName(Op1); + I.setOperand(0, NewBO); + I.setOperand(1, ConstantExpr::get(Opcode, C1, C2)); + // Conservatively clear the optional flags, since they may not be + // preserved by the reassociation. + ClearSubclassDataAfterReassociation(I); + if (IsNUW) + I.setHasNoUnsignedWrap(true); + + Changed = true; + continue; + } + } + + // No further simplifications. + return Changed; + } while (true); +} + +/// Return whether "X LOp (Y ROp Z)" is always equal to +/// "(X LOp Y) ROp (X LOp Z)". 
+static bool leftDistributesOverRight(Instruction::BinaryOps LOp, + Instruction::BinaryOps ROp) { + // X & (Y | Z) <--> (X & Y) | (X & Z) + // X & (Y ^ Z) <--> (X & Y) ^ (X & Z) + if (LOp == Instruction::And) + return ROp == Instruction::Or || ROp == Instruction::Xor; + + // X | (Y & Z) <--> (X | Y) & (X | Z) + if (LOp == Instruction::Or) + return ROp == Instruction::And; + + // X * (Y + Z) <--> (X * Y) + (X * Z) + // X * (Y - Z) <--> (X * Y) - (X * Z) + if (LOp == Instruction::Mul) + return ROp == Instruction::Add || ROp == Instruction::Sub; + + return false; +} + +/// Return whether "(X LOp Y) ROp Z" is always equal to +/// "(X ROp Z) LOp (Y ROp Z)". +static bool rightDistributesOverLeft(Instruction::BinaryOps LOp, + Instruction::BinaryOps ROp) { + if (Instruction::isCommutative(ROp)) + return leftDistributesOverRight(ROp, LOp); + + // (X {&|^} Y) >> Z <--> (X >> Z) {&|^} (Y >> Z) for all shifts. + return Instruction::isBitwiseLogicOp(LOp) && Instruction::isShift(ROp); + + // TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z", + // but this requires knowing that the addition does not overflow and other + // such subtleties. +} + +/// This function returns identity value for given opcode, which can be used to +/// factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1). +static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) { + if (isa<Constant>(V)) + return nullptr; + + return ConstantExpr::getBinOpIdentity(Opcode, V->getType()); +} + +/// This function predicates factorization using distributive laws. By default, +/// it just returns the 'Op' inputs. But for special-cases like +/// 'add(shl(X, 5), ...)', this function will have TopOpcode == Instruction::Add +/// and Op = shl(X, 5). The 'shl' is treated as the more general 'mul X, 32' to +/// allow more factorization opportunities. +static Instruction::BinaryOps +getBinOpsForFactorization(Instruction::BinaryOps TopOpcode, BinaryOperator *Op, + Value *&LHS, Value *&RHS) { + assert(Op && "Expected a binary operator"); + LHS = Op->getOperand(0); + RHS = Op->getOperand(1); + if (TopOpcode == Instruction::Add || TopOpcode == Instruction::Sub) { + Constant *C; + if (match(Op, m_Shl(m_Value(), m_Constant(C)))) { + // X << C --> X * (1 << C) + RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), C); + return Instruction::Mul; + } + // TODO: We can add other conversions e.g. shr => div etc. + } + return Op->getOpcode(); +} + +/// This tries to simplify binary operations by factorizing out common terms +/// (e. g. "(A*B)+(A*C)" -> "A*(B+C)"). +Value *InstCombiner::tryFactorization(BinaryOperator &I, + Instruction::BinaryOps InnerOpcode, + Value *A, Value *B, Value *C, Value *D) { + assert(A && B && C && D && "All values must be provided"); + + Value *V = nullptr; + Value *SimplifiedInst = nullptr; + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + Instruction::BinaryOps TopLevelOpcode = I.getOpcode(); + + // Does "X op' Y" always equal "Y op' X"? + bool InnerCommutative = Instruction::isCommutative(InnerOpcode); + + // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"? + if (leftDistributesOverRight(InnerOpcode, TopLevelOpcode)) + // Does the instruction have the form "(A op' B) op (A op' D)" or, in the + // commutative case, "(A op' B) op (C op' A)"? + if (A == C || (InnerCommutative && A == D)) { + if (A != C) + std::swap(C, D); + // Consider forming "A op' (B op D)". + // If "B op D" simplifies then it can be formed with no cost. 
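+      // For example, (X*4) + (X*2) factors to X * (4+2); the inner add
+      // simplifies to 6, so we form X*6 at no extra cost.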
+ V = SimplifyBinOp(TopLevelOpcode, B, D, SQ.getWithInstruction(&I)); + // If "B op D" doesn't simplify then only go on if both of the existing + // operations "A op' B" and "C op' D" will be zapped as no longer used. + if (!V && LHS->hasOneUse() && RHS->hasOneUse()) + V = Builder.CreateBinOp(TopLevelOpcode, B, D, RHS->getName()); + if (V) { + SimplifiedInst = Builder.CreateBinOp(InnerOpcode, A, V); + } + } + + // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"? + if (!SimplifiedInst && rightDistributesOverLeft(TopLevelOpcode, InnerOpcode)) + // Does the instruction have the form "(A op' B) op (C op' B)" or, in the + // commutative case, "(A op' B) op (B op' D)"? + if (B == D || (InnerCommutative && B == C)) { + if (B != D) + std::swap(C, D); + // Consider forming "(A op C) op' B". + // If "A op C" simplifies then it can be formed with no cost. + V = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)); + + // If "A op C" doesn't simplify then only go on if both of the existing + // operations "A op' B" and "C op' D" will be zapped as no longer used. + if (!V && LHS->hasOneUse() && RHS->hasOneUse()) + V = Builder.CreateBinOp(TopLevelOpcode, A, C, LHS->getName()); + if (V) { + SimplifiedInst = Builder.CreateBinOp(InnerOpcode, V, B); + } + } + + if (SimplifiedInst) { + ++NumFactor; + SimplifiedInst->takeName(&I); + + // Check if we can add NSW/NUW flags to SimplifiedInst. If so, set them. + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SimplifiedInst)) { + if (isa<OverflowingBinaryOperator>(SimplifiedInst)) { + bool HasNSW = false; + bool HasNUW = false; + if (isa<OverflowingBinaryOperator>(&I)) { + HasNSW = I.hasNoSignedWrap(); + HasNUW = I.hasNoUnsignedWrap(); + } + + if (auto *LOBO = dyn_cast<OverflowingBinaryOperator>(LHS)) { + HasNSW &= LOBO->hasNoSignedWrap(); + HasNUW &= LOBO->hasNoUnsignedWrap(); + } + + if (auto *ROBO = dyn_cast<OverflowingBinaryOperator>(RHS)) { + HasNSW &= ROBO->hasNoSignedWrap(); + HasNUW &= ROBO->hasNoUnsignedWrap(); + } + + if (TopLevelOpcode == Instruction::Add && + InnerOpcode == Instruction::Mul) { + // We can propagate 'nsw' if we know that + // %Y = mul nsw i16 %X, C + // %Z = add nsw i16 %Y, %X + // => + // %Z = mul nsw i16 %X, C+1 + // + // iff C+1 isn't INT_MIN + const APInt *CInt; + if (match(V, m_APInt(CInt))) { + if (!CInt->isMinSignedValue()) + BO->setHasNoSignedWrap(HasNSW); + } + + // nuw can be propagated with any constant or nuw value. + BO->setHasNoUnsignedWrap(HasNUW); + } + } + } + } + return SimplifiedInst; +} + +/// This tries to simplify binary operations which some other binary operation +/// distributes over either by factorizing out common terms +/// (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in +/// simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is a win). +/// Returns the simplified value, or null if it didn't simplify. +Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS); + BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS); + Instruction::BinaryOps TopLevelOpcode = I.getOpcode(); + + { + // Factorization. + Value *A, *B, *C, *D; + Instruction::BinaryOps LHSOpcode, RHSOpcode; + if (Op0) + LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B); + if (Op1) + RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D); + + // The instruction has the form "(A op' B) op (C op' D)". Try to factorize + // a common term. 
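+    // For example, (X << 1) + (X * 3) is treated as (X * 2) + (X * 3) by
+    // getBinOpsForFactorization above, and factors to X * 5.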
+ if (Op0 && Op1 && LHSOpcode == RHSOpcode) + if (Value *V = tryFactorization(I, LHSOpcode, A, B, C, D)) + return V; + + // The instruction has the form "(A op' B) op (C)". Try to factorize common + // term. + if (Op0) + if (Value *Ident = getIdentityValue(LHSOpcode, RHS)) + if (Value *V = tryFactorization(I, LHSOpcode, A, B, RHS, Ident)) + return V; + + // The instruction has the form "(B) op (C op' D)". Try to factorize common + // term. + if (Op1) + if (Value *Ident = getIdentityValue(RHSOpcode, LHS)) + if (Value *V = tryFactorization(I, RHSOpcode, LHS, Ident, C, D)) + return V; + } + + // Expansion. + if (Op0 && rightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) { + // The instruction has the form "(A op' B) op C". See if expanding it out + // to "(A op C) op' (B op C)" results in simplifications. + Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS; + Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op' + + Value *L = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)); + Value *R = SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I)); + + // Do "A op C" and "B op C" both simplify? + if (L && R) { + // They do! Return "L op' R". + ++NumExpand; + C = Builder.CreateBinOp(InnerOpcode, L, R); + C->takeName(&I); + return C; + } + + // Does "A op C" simplify to the identity value for the inner opcode? + if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) { + // They do! Return "B op C". + ++NumExpand; + C = Builder.CreateBinOp(TopLevelOpcode, B, C); + C->takeName(&I); + return C; + } + + // Does "B op C" simplify to the identity value for the inner opcode? + if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) { + // They do! Return "A op C". + ++NumExpand; + C = Builder.CreateBinOp(TopLevelOpcode, A, C); + C->takeName(&I); + return C; + } + } + + if (Op1 && leftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) { + // The instruction has the form "A op (B op' C)". See if expanding it out + // to "(A op B) op' (A op C)" results in simplifications. + Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1); + Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op' + + Value *L = SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I)); + Value *R = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)); + + // Do "A op B" and "A op C" both simplify? + if (L && R) { + // They do! Return "L op' R". + ++NumExpand; + A = Builder.CreateBinOp(InnerOpcode, L, R); + A->takeName(&I); + return A; + } + + // Does "A op B" simplify to the identity value for the inner opcode? + if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) { + // They do! Return "A op C". + ++NumExpand; + A = Builder.CreateBinOp(TopLevelOpcode, A, C); + A->takeName(&I); + return A; + } + + // Does "A op C" simplify to the identity value for the inner opcode? + if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) { + // They do! Return "A op B". 
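+      // For example, X & (Y | ~X): the expanded term X & ~X simplifies to 0,
+      // the identity of 'or', so the whole expression reduces to X & Y.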
+ ++NumExpand; + A = Builder.CreateBinOp(TopLevelOpcode, A, B); + A->takeName(&I); + return A; + } + } + + return SimplifySelectsFeedingBinaryOp(I, LHS, RHS); +} + +Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, + Value *LHS, Value *RHS) { + Instruction::BinaryOps Opcode = I.getOpcode(); + // (op (select (a, b, c)), (select (a, d, e))) -> (select (a, (op b, d), (op + // c, e))) + Value *A, *B, *C, *D, *E; + Value *SI = nullptr; + if (match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C))) && + match(RHS, m_Select(m_Specific(A), m_Value(D), m_Value(E)))) { + bool SelectsHaveOneUse = LHS->hasOneUse() && RHS->hasOneUse(); + + FastMathFlags FMF; + BuilderTy::FastMathFlagGuard Guard(Builder); + if (isa<FPMathOperator>(&I)) { + FMF = I.getFastMathFlags(); + Builder.setFastMathFlags(FMF); + } + + Value *V1 = SimplifyBinOp(Opcode, C, E, FMF, SQ.getWithInstruction(&I)); + Value *V2 = SimplifyBinOp(Opcode, B, D, FMF, SQ.getWithInstruction(&I)); + if (V1 && V2) + SI = Builder.CreateSelect(A, V2, V1); + else if (V2 && SelectsHaveOneUse) + SI = Builder.CreateSelect(A, V2, Builder.CreateBinOp(Opcode, C, E)); + else if (V1 && SelectsHaveOneUse) + SI = Builder.CreateSelect(A, Builder.CreateBinOp(Opcode, B, D), V1); + + if (SI) + SI->takeName(&I); + } + + return SI; +} + +/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a +/// constant zero (which is the 'negate' form). +Value *InstCombiner::dyn_castNegVal(Value *V) const { + Value *NegV; + if (match(V, m_Neg(m_Value(NegV)))) + return NegV; + + // Constants can be considered to be negated values if they can be folded. + if (ConstantInt *C = dyn_cast<ConstantInt>(V)) + return ConstantExpr::getNeg(C); + + if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V)) + if (C->getType()->getElementType()->isIntegerTy()) + return ConstantExpr::getNeg(C); + + if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) { + for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) { + Constant *Elt = CV->getAggregateElement(i); + if (!Elt) + return nullptr; + + if (isa<UndefValue>(Elt)) + continue; + + if (!isa<ConstantInt>(Elt)) + return nullptr; + } + return ConstantExpr::getNeg(CV); + } + + return nullptr; +} + +static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO, + InstCombiner::BuilderTy &Builder) { + if (auto *Cast = dyn_cast<CastInst>(&I)) + return Builder.CreateCast(Cast->getOpcode(), SO, I.getType()); + + assert(I.isBinaryOp() && "Unexpected opcode for select folding"); + + // Figure out if the constant is the left or the right argument. + bool ConstIsRHS = isa<Constant>(I.getOperand(1)); + Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS)); + + if (auto *SOC = dyn_cast<Constant>(SO)) { + if (ConstIsRHS) + return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand); + return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC); + } + + Value *Op0 = SO, *Op1 = ConstOperand; + if (!ConstIsRHS) + std::swap(Op0, Op1); + + auto *BO = cast<BinaryOperator>(&I); + Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1, + SO->getName() + ".op"); + auto *FPInst = dyn_cast<Instruction>(RI); + if (FPInst && isa<FPMathOperator>(FPInst)) + FPInst->copyFastMathFlags(BO); + return RI; +} + +Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { + // Don't modify shared select instructions. 
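+  // For example, "add (select %c, 1, 2), 8" becomes "select %c, 9, 10" by
+  // folding the add into both arms, but only when the select has no other
+  // users.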
+ if (!SI->hasOneUse()) + return nullptr; + + Value *TV = SI->getTrueValue(); + Value *FV = SI->getFalseValue(); + if (!(isa<Constant>(TV) || isa<Constant>(FV))) + return nullptr; + + // Bool selects with constant operands can be folded to logical ops. + if (SI->getType()->isIntOrIntVectorTy(1)) + return nullptr; + + // If it's a bitcast involving vectors, make sure it has the same number of + // elements on both sides. + if (auto *BC = dyn_cast<BitCastInst>(&Op)) { + VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy()); + VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy()); + + // Verify that either both or neither are vectors. + if ((SrcTy == nullptr) != (DestTy == nullptr)) + return nullptr; + + // If vectors, verify that they have the same number of elements. + if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements()) + return nullptr; + } + + // Test if a CmpInst instruction is used exclusively by a select as + // part of a minimum or maximum operation. If so, refrain from doing + // any other folding. This helps out other analyses which understand + // non-obfuscated minimum and maximum idioms, such as ScalarEvolution + // and CodeGen. And in this case, at least one of the comparison + // operands has at least one user besides the compare (the select), + // which would often largely negate the benefit of folding anyway. + if (auto *CI = dyn_cast<CmpInst>(SI->getCondition())) { + if (CI->hasOneUse()) { + Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) || + (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1)) + return nullptr; + } + } + + Value *NewTV = foldOperationIntoSelectOperand(Op, TV, Builder); + Value *NewFV = foldOperationIntoSelectOperand(Op, FV, Builder); + return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI); +} + +static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV, + InstCombiner::BuilderTy &Builder) { + bool ConstIsRHS = isa<Constant>(I->getOperand(1)); + Constant *C = cast<Constant>(I->getOperand(ConstIsRHS)); + + if (auto *InC = dyn_cast<Constant>(InV)) { + if (ConstIsRHS) + return ConstantExpr::get(I->getOpcode(), InC, C); + return ConstantExpr::get(I->getOpcode(), C, InC); + } + + Value *Op0 = InV, *Op1 = C; + if (!ConstIsRHS) + std::swap(Op0, Op1); + + Value *RI = Builder.CreateBinOp(I->getOpcode(), Op0, Op1, "phitmp"); + auto *FPInst = dyn_cast<Instruction>(RI); + if (FPInst && isa<FPMathOperator>(FPInst)) + FPInst->copyFastMathFlags(I); + return RI; +} + +Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) { + unsigned NumPHIValues = PN->getNumIncomingValues(); + if (NumPHIValues == 0) + return nullptr; + + // We normally only transform phis with a single use. However, if a PHI has + // multiple uses and they are all the same operation, we can fold *all* of the + // uses into the PHI. + if (!PN->hasOneUse()) { + // Walk the use list for the instruction, comparing them to I. + for (User *U : PN->users()) { + Instruction *UI = cast<Instruction>(U); + if (UI != &I && !I.isIdenticalTo(UI)) + return nullptr; + } + // Otherwise, we can replace *all* users with the new PHI we form. + } + + // Check to see if all of the operands of the PHI are simple constants + // (constantint/constantfp/undef). If there is one non-constant value, + // remember the BB it is in. If there is more than one or if *it* is a PHI, + // bail out. 
We don't do arbitrary constant expressions here because moving + // their computation can be expensive without a cost model. + BasicBlock *NonConstBB = nullptr; + for (unsigned i = 0; i != NumPHIValues; ++i) { + Value *InVal = PN->getIncomingValue(i); + if (isa<Constant>(InVal) && !isa<ConstantExpr>(InVal)) + continue; + + if (isa<PHINode>(InVal)) return nullptr; // Itself a phi. + if (NonConstBB) return nullptr; // More than one non-const value. + + NonConstBB = PN->getIncomingBlock(i); + + // If the InVal is an invoke at the end of the pred block, then we can't + // insert a computation after it without breaking the edge. + if (isa<InvokeInst>(InVal)) + if (cast<Instruction>(InVal)->getParent() == NonConstBB) + return nullptr; + + // If the incoming non-constant value is in I's block, we will remove one + // instruction, but insert another equivalent one, leading to infinite + // instcombine. + if (isPotentiallyReachable(I.getParent(), NonConstBB, &DT, LI)) + return nullptr; + } + + // If there is exactly one non-constant value, we can insert a copy of the + // operation in that block. However, if this is a critical edge, we would be + // inserting the computation on some other paths (e.g. inside a loop). Only + // do this if the pred block is unconditionally branching into the phi block. + if (NonConstBB != nullptr) { + BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator()); + if (!BI || !BI->isUnconditional()) return nullptr; + } + + // Okay, we can do the transformation: create the new PHI node. + PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues()); + InsertNewInstBefore(NewPN, *PN); + NewPN->takeName(PN); + + // If we are going to have to insert a new computation, do so right before the + // predecessor's terminator. + if (NonConstBB) + Builder.SetInsertPoint(NonConstBB->getTerminator()); + + // Next, add all of the operands to the PHI. + if (SelectInst *SI = dyn_cast<SelectInst>(&I)) { + // We only currently try to fold the condition of a select when it is a phi, + // not the true/false values. + Value *TrueV = SI->getTrueValue(); + Value *FalseV = SI->getFalseValue(); + BasicBlock *PhiTransBB = PN->getParent(); + for (unsigned i = 0; i != NumPHIValues; ++i) { + BasicBlock *ThisBB = PN->getIncomingBlock(i); + Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB); + Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB); + Value *InV = nullptr; + // Beware of ConstantExpr: it may eventually evaluate to getNullValue, + // even if currently isNullValue gives false. + Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)); + // For vector constants, we cannot use isNullValue to fold into + // FalseVInPred versus TrueVInPred. When we have individual nonzero + // elements in the vector, we will incorrectly fold InC to + // `TrueVInPred`. + if (InC && !isa<ConstantExpr>(InC) && isa<ConstantInt>(InC)) + InV = InC->isNullValue() ? FalseVInPred : TrueVInPred; + else { + // Generate the select in the same block as PN's current incoming block. + // Note: ThisBB need not be the NonConstBB because vector constants + // which are constants by definition are handled here. + // FIXME: This can lead to an increase in IR generation because we might + // generate selects for vector constant phi operand, that could not be + // folded to TrueVInPred or FalseVInPred as done for ConstantInt. For + // non-vector phis, this transformation was always profitable because + // the select would be generated exactly once in the NonConstBB. 
+ Builder.SetInsertPoint(ThisBB->getTerminator()); + InV = Builder.CreateSelect(PN->getIncomingValue(i), TrueVInPred, + FalseVInPred, "phitmp"); + } + NewPN->addIncoming(InV, ThisBB); + } + } else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) { + Constant *C = cast<Constant>(I.getOperand(1)); + for (unsigned i = 0; i != NumPHIValues; ++i) { + Value *InV = nullptr; + if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) + InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C); + else if (isa<ICmpInst>(CI)) + InV = Builder.CreateICmp(CI->getPredicate(), PN->getIncomingValue(i), + C, "phitmp"); + else + InV = Builder.CreateFCmp(CI->getPredicate(), PN->getIncomingValue(i), + C, "phitmp"); + NewPN->addIncoming(InV, PN->getIncomingBlock(i)); + } + } else if (auto *BO = dyn_cast<BinaryOperator>(&I)) { + for (unsigned i = 0; i != NumPHIValues; ++i) { + Value *InV = foldOperationIntoPhiValue(BO, PN->getIncomingValue(i), + Builder); + NewPN->addIncoming(InV, PN->getIncomingBlock(i)); + } + } else { + CastInst *CI = cast<CastInst>(&I); + Type *RetTy = CI->getType(); + for (unsigned i = 0; i != NumPHIValues; ++i) { + Value *InV; + if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) + InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy); + else + InV = Builder.CreateCast(CI->getOpcode(), PN->getIncomingValue(i), + I.getType(), "phitmp"); + NewPN->addIncoming(InV, PN->getIncomingBlock(i)); + } + } + + for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) { + Instruction *User = cast<Instruction>(*UI++); + if (User == &I) continue; + replaceInstUsesWith(*User, NewPN); + eraseInstFromFunction(*User); + } + return replaceInstUsesWith(I, NewPN); +} + +Instruction *InstCombiner::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { + if (!isa<Constant>(I.getOperand(1))) + return nullptr; + + if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) { + if (Instruction *NewSel = FoldOpIntoSelect(I, Sel)) + return NewSel; + } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) { + if (Instruction *NewPhi = foldOpIntoPhi(I, PN)) + return NewPhi; + } + return nullptr; +} + +/// Given a pointer type and a constant offset, determine whether or not there +/// is a sequence of GEP indices into the pointed type that will land us at the +/// specified offset. If so, fill them into NewIndices and return the resultant +/// element type, otherwise return null. +Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset, + SmallVectorImpl<Value *> &NewIndices) { + Type *Ty = PtrTy->getElementType(); + if (!Ty->isSized()) + return nullptr; + + // Start with the index over the outer type. Note that the type size + // might be zero (even if the offset isn't zero) if the indexed type + // is something like [0 x {int, int}] + Type *IndexTy = DL.getIndexType(PtrTy); + int64_t FirstIdx = 0; + if (int64_t TySize = DL.getTypeAllocSize(Ty)) { + FirstIdx = Offset/TySize; + Offset -= FirstIdx*TySize; + + // Handle hosts where % returns negative instead of values [0..TySize). + if (Offset < 0) { + --FirstIdx; + Offset += TySize; + assert(Offset >= 0); + } + assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset"); + } + + NewIndices.push_back(ConstantInt::get(IndexTy, FirstIdx)); + + // Index into the types. If we fail, set OrigBase to null. + while (Offset) { + // Indexing into tail padding between struct/array elements. 
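+    // Note that Offset is in bytes while getTypeSizeInBits is in bits, hence
+    // the multiplication by 8.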
+ if (uint64_t(Offset * 8) >= DL.getTypeSizeInBits(Ty)) + return nullptr; + + if (StructType *STy = dyn_cast<StructType>(Ty)) { + const StructLayout *SL = DL.getStructLayout(STy); + assert(Offset < (int64_t)SL->getSizeInBytes() && + "Offset must stay within the indexed type"); + + unsigned Elt = SL->getElementContainingOffset(Offset); + NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()), + Elt)); + + Offset -= SL->getElementOffset(Elt); + Ty = STy->getElementType(Elt); + } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) { + uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType()); + assert(EltSize && "Cannot index into a zero-sized array"); + NewIndices.push_back(ConstantInt::get(IndexTy,Offset/EltSize)); + Offset %= EltSize; + Ty = AT->getElementType(); + } else { + // Otherwise, we can't index into the middle of this atomic type, bail. + return nullptr; + } + } + + return Ty; +} + +static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) { + // If this GEP has only 0 indices, it is the same pointer as + // Src. If Src is not a trivial GEP too, don't combine + // the indices. + if (GEP.hasAllZeroIndices() && !Src.hasAllZeroIndices() && + !Src.hasOneUse()) + return false; + return true; +} + +/// Return a value X such that Val = X * Scale, or null if none. +/// If the multiplication is known not to overflow, then NoSignedWrap is set. +Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { + assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!"); + assert(cast<IntegerType>(Val->getType())->getBitWidth() == + Scale.getBitWidth() && "Scale not compatible with value!"); + + // If Val is zero or Scale is one then Val = Val * Scale. + if (match(Val, m_Zero()) || Scale == 1) { + NoSignedWrap = true; + return Val; + } + + // If Scale is zero then it does not divide Val. + if (Scale.isMinValue()) + return nullptr; + + // Look through chains of multiplications, searching for a constant that is + // divisible by Scale. For example, descaling X*(Y*(Z*4)) by a factor of 4 + // will find the constant factor 4 and produce X*(Y*Z). Descaling X*(Y*8) by + // a factor of 4 will produce X*(Y*2). The principle of operation is to bore + // down from Val: + // + // Val = M1 * X || Analysis starts here and works down + // M1 = M2 * Y || Doesn't descend into terms with more + // M2 = Z * 4 \/ than one use + // + // Then to modify a term at the bottom: + // + // Val = M1 * X + // M1 = Z * Y || Replaced M2 with Z + // + // Then to work back up correcting nsw flags. + + // Op - the term we are currently analyzing. Starts at Val then drills down. + // Replaced with its descaled value before exiting from the drill down loop. + Value *Op = Val; + + // Parent - initially null, but after drilling down notes where Op came from. + // In the example above, Parent is (Val, 0) when Op is M1, because M1 is the + // 0'th operand of Val. + std::pair<Instruction *, unsigned> Parent; + + // Set if the transform requires a descaling at deeper levels that doesn't + // overflow. + bool RequireNoSignedWrap = false; + + // Log base 2 of the scale. Negative if not a power of 2. + int32_t logScale = Scale.exactLogBase2(); + + for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down + if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // If Op is a constant divisible by Scale then descale to the quotient. + APInt Quotient(Scale), Remainder(Scale); // Init ensures right bitwidth. 
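+      // For example, a constant of 12 with a Scale of 4 is replaced by 3;
+      // a constant of 10 is not divisible by 4, so we bail out.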
+ APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder); + if (!Remainder.isMinValue()) + // Not divisible by Scale. + return nullptr; + // Replace with the quotient in the parent. + Op = ConstantInt::get(CI->getType(), Quotient); + NoSignedWrap = true; + break; + } + + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) { + if (BO->getOpcode() == Instruction::Mul) { + // Multiplication. + NoSignedWrap = BO->hasNoSignedWrap(); + if (RequireNoSignedWrap && !NoSignedWrap) + return nullptr; + + // There are three cases for multiplication: multiplication by exactly + // the scale, multiplication by a constant different to the scale, and + // multiplication by something else. + Value *LHS = BO->getOperand(0); + Value *RHS = BO->getOperand(1); + + if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) { + // Multiplication by a constant. + if (CI->getValue() == Scale) { + // Multiplication by exactly the scale, replace the multiplication + // by its left-hand side in the parent. + Op = LHS; + break; + } + + // Otherwise drill down into the constant. + if (!Op->hasOneUse()) + return nullptr; + + Parent = std::make_pair(BO, 1); + continue; + } + + // Multiplication by something else. Drill down into the left-hand side + // since that's where the reassociate pass puts the good stuff. + if (!Op->hasOneUse()) + return nullptr; + + Parent = std::make_pair(BO, 0); + continue; + } + + if (logScale > 0 && BO->getOpcode() == Instruction::Shl && + isa<ConstantInt>(BO->getOperand(1))) { + // Multiplication by a power of 2. + NoSignedWrap = BO->hasNoSignedWrap(); + if (RequireNoSignedWrap && !NoSignedWrap) + return nullptr; + + Value *LHS = BO->getOperand(0); + int32_t Amt = cast<ConstantInt>(BO->getOperand(1))-> + getLimitedValue(Scale.getBitWidth()); + // Op = LHS << Amt. + + if (Amt == logScale) { + // Multiplication by exactly the scale, replace the multiplication + // by its left-hand side in the parent. + Op = LHS; + break; + } + if (Amt < logScale || !Op->hasOneUse()) + return nullptr; + + // Multiplication by more than the scale. Reduce the multiplying amount + // by the scale in the parent. + Parent = std::make_pair(BO, 1); + Op = ConstantInt::get(BO->getType(), Amt - logScale); + break; + } + } + + if (!Op->hasOneUse()) + return nullptr; + + if (CastInst *Cast = dyn_cast<CastInst>(Op)) { + if (Cast->getOpcode() == Instruction::SExt) { + // Op is sign-extended from a smaller type, descale in the smaller type. + unsigned SmallSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); + APInt SmallScale = Scale.trunc(SmallSize); + // Suppose Op = sext X, and we descale X as Y * SmallScale. We want to + // descale Op as (sext Y) * Scale. In order to have + // sext (Y * SmallScale) = (sext Y) * Scale + // some conditions need to hold however: SmallScale must sign-extend to + // Scale and the multiplication Y * SmallScale should not overflow. + if (SmallScale.sext(Scale.getBitWidth()) != Scale) + // SmallScale does not sign-extend to Scale. + return nullptr; + assert(SmallScale.exactLogBase2() == logScale); + // Require that Y * SmallScale must not overflow. + RequireNoSignedWrap = true; + + // Drill down through the cast. + Parent = std::make_pair(Cast, 0); + Scale = SmallScale; + continue; + } + + if (Cast->getOpcode() == Instruction::Trunc) { + // Op is truncated from a larger type, descale in the larger type. + // Suppose Op = trunc X, and we descale X as Y * sext Scale. Then + // trunc (Y * sext Scale) = (trunc Y) * Scale + // always holds. 
However (trunc Y) * Scale may overflow even if + // trunc (Y * sext Scale) does not, so nsw flags need to be cleared + // from this point up in the expression (see later). + if (RequireNoSignedWrap) + return nullptr; + + // Drill down through the cast. + unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); + Parent = std::make_pair(Cast, 0); + Scale = Scale.sext(LargeSize); + if (logScale + 1 == (int32_t)Cast->getType()->getPrimitiveSizeInBits()) + logScale = -1; + assert(Scale.exactLogBase2() == logScale); + continue; + } + } + + // Unsupported expression, bail out. + return nullptr; + } + + // If Op is zero then Val = Op * Scale. + if (match(Op, m_Zero())) { + NoSignedWrap = true; + return Op; + } + + // We know that we can successfully descale, so from here on we can safely + // modify the IR. Op holds the descaled version of the deepest term in the + // expression. NoSignedWrap is 'true' if multiplying Op by Scale is known + // not to overflow. + + if (!Parent.first) + // The expression only had one term. + return Op; + + // Rewrite the parent using the descaled version of its operand. + assert(Parent.first->hasOneUse() && "Drilled down when more than one use!"); + assert(Op != Parent.first->getOperand(Parent.second) && + "Descaling was a no-op?"); + Parent.first->setOperand(Parent.second, Op); + Worklist.Add(Parent.first); + + // Now work back up the expression correcting nsw flags. The logic is based + // on the following observation: if X * Y is known not to overflow as a signed + // multiplication, and Y is replaced by a value Z with smaller absolute value, + // then X * Z will not overflow as a signed multiplication either. As we work + // our way up, having NoSignedWrap 'true' means that the descaled value at the + // current level has strictly smaller absolute value than the original. + Instruction *Ancestor = Parent.first; + do { + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Ancestor)) { + // If the multiplication wasn't nsw then we can't say anything about the + // value of the descaled multiplication, and we have to clear nsw flags + // from this point on up. + bool OpNoSignedWrap = BO->hasNoSignedWrap(); + NoSignedWrap &= OpNoSignedWrap; + if (NoSignedWrap != OpNoSignedWrap) { + BO->setHasNoSignedWrap(NoSignedWrap); + Worklist.Add(Ancestor); + } + } else if (Ancestor->getOpcode() == Instruction::Trunc) { + // The fact that the descaled input to the trunc has smaller absolute + // value than the original input doesn't tell us anything useful about + // the absolute values of the truncations. + NoSignedWrap = false; + } + assert((Ancestor->getOpcode() != Instruction::SExt || NoSignedWrap) && + "Failed to keep proper track of nsw flags while drilling down?"); + + if (Ancestor == Val) + // Got to the top, all done! + return Val; + + // Move up one level in the expression. 
+ assert(Ancestor->hasOneUse() && "Drilled down when more than one use!"); + Ancestor = Ancestor->user_back(); + } while (true); +} + +Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) { + if (!Inst.getType()->isVectorTy()) return nullptr; + + BinaryOperator::BinaryOps Opcode = Inst.getOpcode(); + unsigned NumElts = cast<VectorType>(Inst.getType())->getNumElements(); + Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1); + assert(cast<VectorType>(LHS->getType())->getNumElements() == NumElts); + assert(cast<VectorType>(RHS->getType())->getNumElements() == NumElts); + + // If both operands of the binop are vector concatenations, then perform the + // narrow binop on each pair of the source operands followed by concatenation + // of the results. + Value *L0, *L1, *R0, *R1; + Constant *Mask; + if (match(LHS, m_ShuffleVector(m_Value(L0), m_Value(L1), m_Constant(Mask))) && + match(RHS, m_ShuffleVector(m_Value(R0), m_Value(R1), m_Specific(Mask))) && + LHS->hasOneUse() && RHS->hasOneUse() && + cast<ShuffleVectorInst>(LHS)->isConcat() && + cast<ShuffleVectorInst>(RHS)->isConcat()) { + // This transform does not have the speculative execution constraint as + // below because the shuffle is a concatenation. The new binops are + // operating on exactly the same elements as the existing binop. + // TODO: We could ease the mask requirement to allow different undef lanes, + // but that requires an analysis of the binop-with-undef output value. + Value *NewBO0 = Builder.CreateBinOp(Opcode, L0, R0); + if (auto *BO = dyn_cast<BinaryOperator>(NewBO0)) + BO->copyIRFlags(&Inst); + Value *NewBO1 = Builder.CreateBinOp(Opcode, L1, R1); + if (auto *BO = dyn_cast<BinaryOperator>(NewBO1)) + BO->copyIRFlags(&Inst); + return new ShuffleVectorInst(NewBO0, NewBO1, Mask); + } + + // It may not be safe to reorder shuffles and things like div, urem, etc. + // because we may trap when executing those ops on unknown vector elements. + // See PR20059. + if (!isSafeToSpeculativelyExecute(&Inst)) + return nullptr; + + auto createBinOpShuffle = [&](Value *X, Value *Y, Constant *M) { + Value *XY = Builder.CreateBinOp(Opcode, X, Y); + if (auto *BO = dyn_cast<BinaryOperator>(XY)) + BO->copyIRFlags(&Inst); + return new ShuffleVectorInst(XY, UndefValue::get(XY->getType()), M); + }; + + // If both arguments of the binary operation are shuffles that use the same + // mask and shuffle within a single vector, move the shuffle after the binop. + Value *V1, *V2; + if (match(LHS, m_ShuffleVector(m_Value(V1), m_Undef(), m_Constant(Mask))) && + match(RHS, m_ShuffleVector(m_Value(V2), m_Undef(), m_Specific(Mask))) && + V1->getType() == V2->getType() && + (LHS->hasOneUse() || RHS->hasOneUse() || LHS == RHS)) { + // Op(shuffle(V1, Mask), shuffle(V2, Mask)) -> shuffle(Op(V1, V2), Mask) + return createBinOpShuffle(V1, V2, Mask); + } + + // If both arguments of a commutative binop are select-shuffles that use the + // same mask with commuted operands, the shuffles are unnecessary. + if (Inst.isCommutative() && + match(LHS, m_ShuffleVector(m_Value(V1), m_Value(V2), m_Constant(Mask))) && + match(RHS, m_ShuffleVector(m_Specific(V2), m_Specific(V1), + m_Specific(Mask)))) { + auto *LShuf = cast<ShuffleVectorInst>(LHS); + auto *RShuf = cast<ShuffleVectorInst>(RHS); + // TODO: Allow shuffles that contain undefs in the mask? + // That is legal, but it reduces undef knowledge. + // TODO: Allow arbitrary shuffles by shuffling after binop? + // That might be legal, but we have to deal with poison. 
+ if (LShuf->isSelect() && !LShuf->getMask()->containsUndefElement() && + RShuf->isSelect() && !RShuf->getMask()->containsUndefElement()) { + // Example: + // LHS = shuffle V1, V2, <0, 5, 6, 3> + // RHS = shuffle V2, V1, <0, 5, 6, 3> + // LHS + RHS --> (V10+V20, V21+V11, V22+V12, V13+V23) --> V1 + V2 + Instruction *NewBO = BinaryOperator::Create(Opcode, V1, V2); + NewBO->copyIRFlags(&Inst); + return NewBO; + } + } + + // If one argument is a shuffle within one vector and the other is a constant, + // try moving the shuffle after the binary operation. This canonicalization + // intends to move shuffles closer to other shuffles and binops closer to + // other binops, so they can be folded. It may also enable demanded elements + // transforms. + Constant *C; + if (match(&Inst, m_c_BinOp( + m_OneUse(m_ShuffleVector(m_Value(V1), m_Undef(), m_Constant(Mask))), + m_Constant(C))) && + V1->getType()->getVectorNumElements() <= NumElts) { + assert(Inst.getType()->getScalarType() == V1->getType()->getScalarType() && + "Shuffle should not change scalar type"); + + // Find constant NewC that has property: + // shuffle(NewC, ShMask) = C + // If such constant does not exist (example: ShMask=<0,0> and C=<1,2>) + // reorder is not possible. A 1-to-1 mapping is not required. Example: + // ShMask = <1,1,2,2> and C = <5,5,6,6> --> NewC = <undef,5,6,undef> + bool ConstOp1 = isa<Constant>(RHS); + SmallVector<int, 16> ShMask; + ShuffleVectorInst::getShuffleMask(Mask, ShMask); + unsigned SrcVecNumElts = V1->getType()->getVectorNumElements(); + UndefValue *UndefScalar = UndefValue::get(C->getType()->getScalarType()); + SmallVector<Constant *, 16> NewVecC(SrcVecNumElts, UndefScalar); + bool MayChange = true; + for (unsigned I = 0; I < NumElts; ++I) { + Constant *CElt = C->getAggregateElement(I); + if (ShMask[I] >= 0) { + assert(ShMask[I] < (int)NumElts && "Not expecting narrowing shuffle"); + Constant *NewCElt = NewVecC[ShMask[I]]; + // Bail out if: + // 1. The constant vector contains a constant expression. + // 2. The shuffle needs an element of the constant vector that can't + // be mapped to a new constant vector. + // 3. This is a widening shuffle that copies elements of V1 into the + // extended elements (extending with undef is allowed). + if (!CElt || (!isa<UndefValue>(NewCElt) && NewCElt != CElt) || + I >= SrcVecNumElts) { + MayChange = false; + break; + } + NewVecC[ShMask[I]] = CElt; + } + // If this is a widening shuffle, we must be able to extend with undef + // elements. If the original binop does not produce an undef in the high + // lanes, then this transform is not safe. + // TODO: We could shuffle those non-undef constant values into the + // result by using a constant vector (rather than an undef vector) + // as operand 1 of the new binop, but that might be too aggressive + // for target-independent shuffle creation. + if (I >= SrcVecNumElts) { + Constant *MaybeUndef = + ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt) + : ConstantExpr::get(Opcode, CElt, UndefScalar); + if (!isa<UndefValue>(MaybeUndef)) { + MayChange = false; + break; + } + } + } + if (MayChange) { + Constant *NewC = ConstantVector::get(NewVecC); + // It may not be safe to execute a binop on a vector with undef elements + // because the entire instruction can be folded to undef or create poison + // that did not exist in the original code. 
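+ // For instance (illustrative constant), a udiv whose shuffled divisor
+ // became <i8 2, i8 undef> could divide by zero in the undef lane, so the
+ // undef lanes are replaced with an opcode-safe constant below.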
+ if (Inst.isIntDivRem() || (Inst.isShift() && ConstOp1))
+ NewC = getSafeVectorConstantForBinop(Opcode, NewC, ConstOp1);
+
+ // Op(shuffle(V1, Mask), C) -> shuffle(Op(V1, NewC), Mask)
+ // Op(C, shuffle(V1, Mask)) -> shuffle(Op(NewC, V1), Mask)
+ Value *NewLHS = ConstOp1 ? V1 : NewC;
+ Value *NewRHS = ConstOp1 ? NewC : V1;
+ return createBinOpShuffle(NewLHS, NewRHS, Mask);
+ }
+ }
+
+ return nullptr;
+}
+
+/// Try to narrow the width of a binop if at least 1 operand is an extend of
+/// a value. This requires a potentially expensive known bits check to make
+/// sure the narrow op does not overflow.
+Instruction *InstCombiner::narrowMathIfNoOverflow(BinaryOperator &BO) {
+ // We need at least one extended operand.
+ Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1);
+
+ // If this is a sub, we swap the operands since we always want an extension
+ // on the RHS. The LHS can be an extension or a constant.
+ if (BO.getOpcode() == Instruction::Sub)
+ std::swap(Op0, Op1);
+
+ Value *X;
+ bool IsSext = match(Op0, m_SExt(m_Value(X)));
+ if (!IsSext && !match(Op0, m_ZExt(m_Value(X))))
+ return nullptr;
+
+ // If both operands are the same extension from the same source type and we
+ // can eliminate at least one (hasOneUse), this might work.
+ CastInst::CastOps CastOpc = IsSext ? Instruction::SExt : Instruction::ZExt;
+ Value *Y;
+ if (!(match(Op1, m_ZExtOrSExt(m_Value(Y))) && X->getType() == Y->getType() &&
+ cast<Operator>(Op1)->getOpcode() == CastOpc &&
+ (Op0->hasOneUse() || Op1->hasOneUse()))) {
+ // If that did not match, see if we have a suitable constant operand.
+ // Truncating and extending must produce the same constant.
+ Constant *WideC;
+ if (!Op0->hasOneUse() || !match(Op1, m_Constant(WideC)))
+ return nullptr;
+ Constant *NarrowC = ConstantExpr::getTrunc(WideC, X->getType());
+ if (ConstantExpr::getCast(CastOpc, NarrowC, BO.getType()) != WideC)
+ return nullptr;
+ Y = NarrowC;
+ }
+
+ // Swap back now that we found our operands.
+ if (BO.getOpcode() == Instruction::Sub)
+ std::swap(X, Y);
+
+ // Both operands have narrow versions. Last step: the math must not overflow
+ // in the narrow width.
+ if (!willNotOverflow(BO.getOpcode(), X, Y, BO, IsSext))
+ return nullptr;
+
+ // bo (ext X), (ext Y) --> ext (bo X, Y)
+ // bo (ext X), C --> ext (bo X, C')
+ Value *NarrowBO = Builder.CreateBinOp(BO.getOpcode(), X, Y, "narrow");
+ if (auto *NewBinOp = dyn_cast<BinaryOperator>(NarrowBO)) {
+ if (IsSext)
+ NewBinOp->setHasNoSignedWrap();
+ else
+ NewBinOp->setHasNoUnsignedWrap();
+ }
+ return CastInst::Create(CastOpc, NarrowBO, BO.getType());
+}
+
+Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+ SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
+ Type *GEPType = GEP.getType();
+ Type *GEPEltType = GEP.getSourceElementType();
+ if (Value *V = SimplifyGEPInst(GEPEltType, Ops, SQ.getWithInstruction(&GEP)))
+ return replaceInstUsesWith(GEP, V);
+
+ // For vector geps, use the generic demanded vector support.
+ if (GEP.getType()->isVectorTy()) { + auto VWidth = GEP.getType()->getVectorNumElements(); + APInt UndefElts(VWidth, 0); + APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); + if (Value *V = SimplifyDemandedVectorElts(&GEP, AllOnesEltMask, + UndefElts)) { + if (V != &GEP) + return replaceInstUsesWith(GEP, V); + return &GEP; + } + + // TODO: 1) Scalarize splat operands, 2) scalarize entire instruction if + // possible (decide on canonical form for pointer broadcast), 3) exploit + // undef elements to decrease demanded bits + } + + Value *PtrOp = GEP.getOperand(0); + + // Eliminate unneeded casts for indices, and replace indices which displace + // by multiples of a zero size type with zero. + bool MadeChange = false; + + // Index width may not be the same width as pointer width. + // Data layout chooses the right type based on supported integer types. + Type *NewScalarIndexTy = + DL.getIndexType(GEP.getPointerOperandType()->getScalarType()); + + gep_type_iterator GTI = gep_type_begin(GEP); + for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E; + ++I, ++GTI) { + // Skip indices into struct types. + if (GTI.isStruct()) + continue; + + Type *IndexTy = (*I)->getType(); + Type *NewIndexType = + IndexTy->isVectorTy() + ? VectorType::get(NewScalarIndexTy, IndexTy->getVectorNumElements()) + : NewScalarIndexTy; + + // If the element type has zero size then any index over it is equivalent + // to an index of zero, so replace it with zero if it is not zero already. + Type *EltTy = GTI.getIndexedType(); + if (EltTy->isSized() && DL.getTypeAllocSize(EltTy) == 0) + if (!isa<Constant>(*I) || !match(I->get(), m_Zero())) { + *I = Constant::getNullValue(NewIndexType); + MadeChange = true; + } + + if (IndexTy != NewIndexType) { + // If we are using a wider index than needed for this platform, shrink + // it to what we need. If narrower, sign-extend it to what we need. + // This explicit cast can make subsequent optimizations more obvious. + *I = Builder.CreateIntCast(*I, NewIndexType, true); + MadeChange = true; + } + } + if (MadeChange) + return &GEP; + + // Check to see if the inputs to the PHI node are getelementptr instructions. + if (auto *PN = dyn_cast<PHINode>(PtrOp)) { + auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0)); + if (!Op1) + return nullptr; + + // Don't fold a GEP into itself through a PHI node. This can only happen + // through the back-edge of a loop. Folding a GEP into itself means that + // the value of the previous iteration needs to be stored in the meantime, + // thus requiring an additional register variable to be live, but not + // actually achieving anything (the GEP still needs to be executed once per + // loop iteration). + if (Op1 == &GEP) + return nullptr; + + int DI = -1; + + for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) { + auto *Op2 = dyn_cast<GetElementPtrInst>(*I); + if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands()) + return nullptr; + + // As for Op1 above, don't try to fold a GEP into itself. + if (Op2 == &GEP) + return nullptr; + + // Keep track of the type as we walk the GEP. + Type *CurTy = nullptr; + + for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) { + if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType()) + return nullptr; + + if (Op1->getOperand(J) != Op2->getOperand(J)) { + if (DI == -1) { + // We have not seen any differences yet in the GEPs feeding the + // PHI yet, so we record this one if it is allowed to be a + // variable. 
+ + // The first two arguments can vary for any GEP, the rest have to be + // static for struct slots + if (J > 1 && CurTy->isStructTy()) + return nullptr; + + DI = J; + } else { + // The GEP is different by more than one input. While this could be + // extended to support GEPs that vary by more than one variable it + // doesn't make sense since it greatly increases the complexity and + // would result in an R+R+R addressing mode which no backend + // directly supports and would need to be broken into several + // simpler instructions anyway. + return nullptr; + } + } + + // Sink down a layer of the type for the next iteration. + if (J > 0) { + if (J == 1) { + CurTy = Op1->getSourceElementType(); + } else if (auto *CT = dyn_cast<CompositeType>(CurTy)) { + CurTy = CT->getTypeAtIndex(Op1->getOperand(J)); + } else { + CurTy = nullptr; + } + } + } + } + + // If not all GEPs are identical we'll have to create a new PHI node. + // Check that the old PHI node has only one use so that it will get + // removed. + if (DI != -1 && !PN->hasOneUse()) + return nullptr; + + auto *NewGEP = cast<GetElementPtrInst>(Op1->clone()); + if (DI == -1) { + // All the GEPs feeding the PHI are identical. Clone one down into our + // BB so that it can be merged with the current GEP. + GEP.getParent()->getInstList().insert( + GEP.getParent()->getFirstInsertionPt(), NewGEP); + } else { + // All the GEPs feeding the PHI differ at a single offset. Clone a GEP + // into the current block so it can be merged, and create a new PHI to + // set that index. + PHINode *NewPN; + { + IRBuilderBase::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(PN); + NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(), + PN->getNumOperands()); + } + + for (auto &I : PN->operands()) + NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI), + PN->getIncomingBlock(I)); + + NewGEP->setOperand(DI, NewPN); + GEP.getParent()->getInstList().insert( + GEP.getParent()->getFirstInsertionPt(), NewGEP); + NewGEP->setOperand(DI, NewPN); + } + + GEP.setOperand(0, NewGEP); + PtrOp = NewGEP; + } + + // Combine Indices - If the source pointer to this getelementptr instruction + // is a getelementptr instruction, combine the indices of the two + // getelementptr instructions into a single instruction. + if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) { + if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src)) + return nullptr; + + // Try to reassociate loop invariant GEP chains to enable LICM. + if (LI && Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 && + Src->hasOneUse()) { + if (Loop *L = LI->getLoopFor(GEP.getParent())) { + Value *GO1 = GEP.getOperand(1); + Value *SO1 = Src->getOperand(1); + // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is + // invariant: this breaks the dependence between GEPs and allows LICM + // to hoist the invariant part out of the loop. + if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) { + // We have to be careful here. + // We have something like: + // %src = getelementptr <ty>, <ty>* %base, <ty> %idx + // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2 + // If we just swap idx & idx2 then we could inadvertantly + // change %src from a vector to a scalar, or vice versa. + // Cases: + // 1) %base a scalar & idx a scalar & idx2 a vector + // => Swapping idx & idx2 turns %src into a vector type. 
+ // 2) %base a scalar & idx a vector & idx2 a scalar + // => Swapping idx & idx2 turns %src in a scalar type + // 3) %base, %idx, and %idx2 are scalars + // => %src & %gep are scalars + // => swapping idx & idx2 is safe + // 4) %base a vector + // => %src is a vector + // => swapping idx & idx2 is safe. + auto *SO0 = Src->getOperand(0); + auto *SO0Ty = SO0->getType(); + if (!isa<VectorType>(GEPType) || // case 3 + isa<VectorType>(SO0Ty)) { // case 4 + Src->setOperand(1, GO1); + GEP.setOperand(1, SO1); + return &GEP; + } else { + // Case 1 or 2 + // -- have to recreate %src & %gep + // put NewSrc at same location as %src + Builder.SetInsertPoint(cast<Instruction>(PtrOp)); + auto *NewSrc = cast<GetElementPtrInst>( + Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName())); + NewSrc->setIsInBounds(Src->isInBounds()); + auto *NewGEP = GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1}); + NewGEP->setIsInBounds(GEP.isInBounds()); + return NewGEP; + } + } + } + } + + // Note that if our source is a gep chain itself then we wait for that + // chain to be resolved before we perform this transformation. This + // avoids us creating a TON of code in some cases. + if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0))) + if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP)) + return nullptr; // Wait until our source is folded to completion. + + SmallVector<Value*, 8> Indices; + + // Find out whether the last index in the source GEP is a sequential idx. + bool EndsWithSequential = false; + for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src); + I != E; ++I) + EndsWithSequential = I.isSequential(); + + // Can we combine the two pointer arithmetics offsets? + if (EndsWithSequential) { + // Replace: gep (gep %P, long B), long A, ... + // With: T = long A+B; gep %P, T, ... + Value *SO1 = Src->getOperand(Src->getNumOperands()-1); + Value *GO1 = GEP.getOperand(1); + + // If they aren't the same type, then the input hasn't been processed + // by the loop above yet (which canonicalizes sequential index types to + // intptr_t). Just avoid transforming this until the input has been + // normalized. + if (SO1->getType() != GO1->getType()) + return nullptr; + + Value *Sum = + SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP)); + // Only do the combine when we are sure the cost after the + // merge is never more than that before the merge. + if (Sum == nullptr) + return nullptr; + + // Update the GEP in place if possible. + if (Src->getNumOperands() == 2) { + GEP.setOperand(0, Src->getOperand(0)); + GEP.setOperand(1, Sum); + return &GEP; + } + Indices.append(Src->op_begin()+1, Src->op_end()-1); + Indices.push_back(Sum); + Indices.append(GEP.op_begin()+2, GEP.op_end()); + } else if (isa<Constant>(*GEP.idx_begin()) && + cast<Constant>(*GEP.idx_begin())->isNullValue() && + Src->getNumOperands() != 1) { + // Otherwise we can do the fold if the first index of the GEP is a zero + Indices.append(Src->op_begin()+1, Src->op_end()); + Indices.append(GEP.idx_begin()+1, GEP.idx_end()); + } + + if (!Indices.empty()) + return GEP.isInBounds() && Src->isInBounds() + ? 
GetElementPtrInst::CreateInBounds( + Src->getSourceElementType(), Src->getOperand(0), Indices, + GEP.getName()) + : GetElementPtrInst::Create(Src->getSourceElementType(), + Src->getOperand(0), Indices, + GEP.getName()); + } + + if (GEP.getNumIndices() == 1) { + unsigned AS = GEP.getPointerAddressSpace(); + if (GEP.getOperand(1)->getType()->getScalarSizeInBits() == + DL.getIndexSizeInBits(AS)) { + uint64_t TyAllocSize = DL.getTypeAllocSize(GEPEltType); + + bool Matched = false; + uint64_t C; + Value *V = nullptr; + if (TyAllocSize == 1) { + V = GEP.getOperand(1); + Matched = true; + } else if (match(GEP.getOperand(1), + m_AShr(m_Value(V), m_ConstantInt(C)))) { + if (TyAllocSize == 1ULL << C) + Matched = true; + } else if (match(GEP.getOperand(1), + m_SDiv(m_Value(V), m_ConstantInt(C)))) { + if (TyAllocSize == C) + Matched = true; + } + + if (Matched) { + // Canonicalize (gep i8* X, -(ptrtoint Y)) + // to (inttoptr (sub (ptrtoint X), (ptrtoint Y))) + // The GEP pattern is emitted by the SCEV expander for certain kinds of + // pointer arithmetic. + if (match(V, m_Neg(m_PtrToInt(m_Value())))) { + Operator *Index = cast<Operator>(V); + Value *PtrToInt = Builder.CreatePtrToInt(PtrOp, Index->getType()); + Value *NewSub = Builder.CreateSub(PtrToInt, Index->getOperand(1)); + return CastInst::Create(Instruction::IntToPtr, NewSub, GEPType); + } + // Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X)) + // to (bitcast Y) + Value *Y; + if (match(V, m_Sub(m_PtrToInt(m_Value(Y)), + m_PtrToInt(m_Specific(GEP.getOperand(0)))))) + return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y, GEPType); + } + } + } + + // We do not handle pointer-vector geps here. + if (GEPType->isVectorTy()) + return nullptr; + + // Handle gep(bitcast x) and gep(gep x, 0, 0, 0). + Value *StrippedPtr = PtrOp->stripPointerCasts(); + PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType()); + + if (StrippedPtr != PtrOp) { + bool HasZeroPointerIndex = false; + Type *StrippedPtrEltTy = StrippedPtrTy->getElementType(); + + if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1))) + HasZeroPointerIndex = C->isZero(); + + // Transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... + // into : GEP [10 x i8]* X, i32 0, ... + // + // Likewise, transform: GEP (bitcast i8* X to [0 x i8]*), i32 0, ... + // into : GEP i8* X, ... + // + // This occurs when the program declares an array extern like "int X[];" + if (HasZeroPointerIndex) { + if (auto *CATy = dyn_cast<ArrayType>(GEPEltType)) { + // GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ? + if (CATy->getElementType() == StrippedPtrEltTy) { + // -> GEP i8* X, ... + SmallVector<Value*, 8> Idx(GEP.idx_begin()+1, GEP.idx_end()); + GetElementPtrInst *Res = GetElementPtrInst::Create( + StrippedPtrEltTy, StrippedPtr, Idx, GEP.getName()); + Res->setIsInBounds(GEP.isInBounds()); + if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) + return Res; + // Insert Res, and create an addrspacecast. + // e.g., + // GEP (addrspacecast i8 addrspace(1)* X to [0 x i8]*), i32 0, ... + // -> + // %0 = GEP i8 addrspace(1)* X, ... + // addrspacecast i8 addrspace(1)* %0 to i8* + return new AddrSpaceCastInst(Builder.Insert(Res), GEPType); + } + + if (auto *XATy = dyn_cast<ArrayType>(StrippedPtrEltTy)) { + // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ? + if (CATy->getElementType() == XATy->getElementType()) { + // -> GEP [10 x i8]* X, i32 0, ... + // At this point, we know that the cast source type is a pointer + // to an array of the same type as the destination pointer + // array. 
Because the array type is never stepped over (there + // is a leading zero) we can fold the cast into this GEP. + if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) { + GEP.setOperand(0, StrippedPtr); + GEP.setSourceElementType(XATy); + return &GEP; + } + // Cannot replace the base pointer directly because StrippedPtr's + // address space is different. Instead, create a new GEP followed by + // an addrspacecast. + // e.g., + // GEP (addrspacecast [10 x i8] addrspace(1)* X to [0 x i8]*), + // i32 0, ... + // -> + // %0 = GEP [10 x i8] addrspace(1)* X, ... + // addrspacecast i8 addrspace(1)* %0 to i8* + SmallVector<Value*, 8> Idx(GEP.idx_begin(), GEP.idx_end()); + Value *NewGEP = + GEP.isInBounds() + ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, + Idx, GEP.getName()) + : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx, + GEP.getName()); + return new AddrSpaceCastInst(NewGEP, GEPType); + } + } + } + } else if (GEP.getNumOperands() == 2) { + // Transform things like: + // %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V + // into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast + if (StrippedPtrEltTy->isArrayTy() && + DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType()) == + DL.getTypeAllocSize(GEPEltType)) { + Type *IdxType = DL.getIndexType(GEPType); + Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) }; + Value *NewGEP = + GEP.isInBounds() + ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, Idx, + GEP.getName()) + : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx, + GEP.getName()); + + // V and GEP are both pointer types --> BitCast + return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, GEPType); + } + + // Transform things like: + // %V = mul i64 %N, 4 + // %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V + // into: %t1 = getelementptr i32* %arr, i32 %N; bitcast + if (GEPEltType->isSized() && StrippedPtrEltTy->isSized()) { + // Check that changing the type amounts to dividing the index by a scale + // factor. + uint64_t ResSize = DL.getTypeAllocSize(GEPEltType); + uint64_t SrcSize = DL.getTypeAllocSize(StrippedPtrEltTy); + if (ResSize && SrcSize % ResSize == 0) { + Value *Idx = GEP.getOperand(1); + unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits(); + uint64_t Scale = SrcSize / ResSize; + + // Earlier transforms ensure that the index has the right type + // according to Data Layout, which considerably simplifies the + // logic by eliminating implicit casts. + assert(Idx->getType() == DL.getIndexType(GEPType) && + "Index type does not match the Data Layout preferences"); + + bool NSW; + if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) { + // Successfully decomposed Idx as NewIdx * Scale, form a new GEP. + // If the multiplication NewIdx * Scale may overflow then the new + // GEP may not be "inbounds". + Value *NewGEP = + GEP.isInBounds() && NSW + ? 
Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, + NewIdx, GEP.getName()) + : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, NewIdx, + GEP.getName()); + + // The NewGEP must be pointer typed, so must the old one -> BitCast + return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, + GEPType); + } + } + } + + // Similarly, transform things like: + // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp + // (where tmp = 8*tmp2) into: + // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast + if (GEPEltType->isSized() && StrippedPtrEltTy->isSized() && + StrippedPtrEltTy->isArrayTy()) { + // Check that changing to the array element type amounts to dividing the + // index by a scale factor. + uint64_t ResSize = DL.getTypeAllocSize(GEPEltType); + uint64_t ArrayEltSize = + DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType()); + if (ResSize && ArrayEltSize % ResSize == 0) { + Value *Idx = GEP.getOperand(1); + unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits(); + uint64_t Scale = ArrayEltSize / ResSize; + + // Earlier transforms ensure that the index has the right type + // according to the Data Layout, which considerably simplifies + // the logic by eliminating implicit casts. + assert(Idx->getType() == DL.getIndexType(GEPType) && + "Index type does not match the Data Layout preferences"); + + bool NSW; + if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) { + // Successfully decomposed Idx as NewIdx * Scale, form a new GEP. + // If the multiplication NewIdx * Scale may overflow then the new + // GEP may not be "inbounds". + Type *IndTy = DL.getIndexType(GEPType); + Value *Off[2] = {Constant::getNullValue(IndTy), NewIdx}; + + Value *NewGEP = + GEP.isInBounds() && NSW + ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, + Off, GEP.getName()) + : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Off, + GEP.getName()); + // The NewGEP must be pointer typed, so must the old one -> BitCast + return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, + GEPType); + } + } + } + } + } + + // addrspacecast between types is canonicalized as a bitcast, then an + // addrspacecast. To take advantage of the below bitcast + struct GEP, look + // through the addrspacecast. + Value *ASCStrippedPtrOp = PtrOp; + if (auto *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) { + // X = bitcast A addrspace(1)* to B addrspace(1)* + // Y = addrspacecast A addrspace(1)* to B addrspace(2)* + // Z = gep Y, <...constant indices...> + // Into an addrspacecasted GEP of the struct. 
+ if (auto *BC = dyn_cast<BitCastInst>(ASC->getOperand(0))) + ASCStrippedPtrOp = BC; + } + + if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) { + Value *SrcOp = BCI->getOperand(0); + PointerType *SrcType = cast<PointerType>(BCI->getSrcTy()); + Type *SrcEltType = SrcType->getElementType(); + + // GEP directly using the source operand if this GEP is accessing an element + // of a bitcasted pointer to vector or array of the same dimensions: + // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z + // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z + auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy) { + return ArrTy->getArrayElementType() == VecTy->getVectorElementType() && + ArrTy->getArrayNumElements() == VecTy->getVectorNumElements(); + }; + if (GEP.getNumOperands() == 3 && + ((GEPEltType->isArrayTy() && SrcEltType->isVectorTy() && + areMatchingArrayAndVecTypes(GEPEltType, SrcEltType)) || + (GEPEltType->isVectorTy() && SrcEltType->isArrayTy() && + areMatchingArrayAndVecTypes(SrcEltType, GEPEltType)))) { + + // Create a new GEP here, as using `setOperand()` followed by + // `setSourceElementType()` won't actually update the type of the + // existing GEP Value. Causing issues if this Value is accessed when + // constructing an AddrSpaceCastInst + Value *NGEP = + GEP.isInBounds() + ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]}) + : Builder.CreateGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]}); + NGEP->takeName(&GEP); + + // Preserve GEP address space to satisfy users + if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace()) + return new AddrSpaceCastInst(NGEP, GEPType); + + return replaceInstUsesWith(GEP, NGEP); + } + + // See if we can simplify: + // X = bitcast A* to B* + // Y = gep X, <...constant indices...> + // into a gep of the original struct. This is important for SROA and alias + // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged. + unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType); + APInt Offset(OffsetBits, 0); + if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset)) { + // If this GEP instruction doesn't move the pointer, just replace the GEP + // with a bitcast of the real input to the dest type. + if (!Offset) { + // If the bitcast is of an allocation, and the allocation will be + // converted to match the type of the cast, don't touch this. + if (isa<AllocaInst>(SrcOp) || isAllocationFn(SrcOp, &TLI)) { + // See if the bitcast simplifies, if so, don't nuke this GEP yet. + if (Instruction *I = visitBitCast(*BCI)) { + if (I != BCI) { + I->takeName(BCI); + BCI->getParent()->getInstList().insert(BCI->getIterator(), I); + replaceInstUsesWith(*BCI, I); + } + return &GEP; + } + } + + if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace()) + return new AddrSpaceCastInst(SrcOp, GEPType); + return new BitCastInst(SrcOp, GEPType); + } + + // Otherwise, if the offset is non-zero, we need to find out if there is a + // field at Offset in 'A's type. If so, we can pull the cast through the + // GEP. + SmallVector<Value*, 8> NewIndices; + if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) { + Value *NGEP = + GEP.isInBounds() + ? 
Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices) + : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices); + + if (NGEP->getType() == GEPType) + return replaceInstUsesWith(GEP, NGEP); + NGEP->takeName(&GEP); + + if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace()) + return new AddrSpaceCastInst(NGEP, GEPType); + return new BitCastInst(NGEP, GEPType); + } + } + } + + if (!GEP.isInBounds()) { + unsigned IdxWidth = + DL.getIndexSizeInBits(PtrOp->getType()->getPointerAddressSpace()); + APInt BasePtrOffset(IdxWidth, 0); + Value *UnderlyingPtrOp = + PtrOp->stripAndAccumulateInBoundsConstantOffsets(DL, + BasePtrOffset); + if (auto *AI = dyn_cast<AllocaInst>(UnderlyingPtrOp)) { + if (GEP.accumulateConstantOffset(DL, BasePtrOffset) && + BasePtrOffset.isNonNegative()) { + APInt AllocSize(IdxWidth, DL.getTypeAllocSize(AI->getAllocatedType())); + if (BasePtrOffset.ule(AllocSize)) { + return GetElementPtrInst::CreateInBounds( + GEP.getSourceElementType(), PtrOp, makeArrayRef(Ops).slice(1), + GEP.getName()); + } + } + } + } + + return nullptr; +} + +static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI, + Instruction *AI) { + if (isa<ConstantPointerNull>(V)) + return true; + if (auto *LI = dyn_cast<LoadInst>(V)) + return isa<GlobalVariable>(LI->getPointerOperand()); + // Two distinct allocations will never be equal. + // We rely on LookThroughBitCast in isAllocLikeFn being false, since looking + // through bitcasts of V can cause + // the result statement below to be true, even when AI and V (ex: + // i8* ->i32* ->i8* of AI) are the same allocations. + return isAllocLikeFn(V, TLI) && V != AI; +} + +static bool isAllocSiteRemovable(Instruction *AI, + SmallVectorImpl<WeakTrackingVH> &Users, + const TargetLibraryInfo *TLI) { + SmallVector<Instruction*, 4> Worklist; + Worklist.push_back(AI); + + do { + Instruction *PI = Worklist.pop_back_val(); + for (User *U : PI->users()) { + Instruction *I = cast<Instruction>(U); + switch (I->getOpcode()) { + default: + // Give up the moment we see something we can't handle. + return false; + + case Instruction::AddrSpaceCast: + case Instruction::BitCast: + case Instruction::GetElementPtr: + Users.emplace_back(I); + Worklist.push_back(I); + continue; + + case Instruction::ICmp: { + ICmpInst *ICI = cast<ICmpInst>(I); + // We can fold eq/ne comparisons with null to false/true, respectively. + // We also fold comparisons in some conditions provided the alloc has + // not escaped (see isNeverEqualToUnescapedAlloc). + if (!ICI->isEquality()) + return false; + unsigned OtherIndex = (ICI->getOperand(0) == PI) ? 1 : 0; + if (!isNeverEqualToUnescapedAlloc(ICI->getOperand(OtherIndex), TLI, AI)) + return false; + Users.emplace_back(I); + continue; + } + + case Instruction::Call: + // Ignore no-op and store intrinsics. 
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ case Intrinsic::memset: {
+ MemIntrinsic *MI = cast<MemIntrinsic>(II);
+ if (MI->isVolatile() || MI->getRawDest() != PI)
+ return false;
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::objectsize:
+ Users.emplace_back(I);
+ continue;
+ }
+ }
+
+ if (isFreeCall(I, TLI)) {
+ Users.emplace_back(I);
+ continue;
+ }
+ return false;
+
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(I);
+ if (SI->isVolatile() || SI->getPointerOperand() != PI)
+ return false;
+ Users.emplace_back(I);
+ continue;
+ }
+ }
+ llvm_unreachable("missing a return?");
+ }
+ } while (!Worklist.empty());
+ return true;
+}
+
+Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
+ // If we have a malloc call whose only uses are comparisons against null and
+ // calls to free, delete those calls and replace the comparisons with true or
+ // false as appropriate.
+
+ // This is based on the principle that we can substitute our own allocation
+ // function (which will never return null) rather than relying on knowledge of
+ // the specific function being called. In some sense this can change the
+ // permitted outputs of a program (when we convert a malloc to an alloca, the
+ // fact that the allocation is now on the stack is potentially visible, for
+ // example), but we believe it does so in a permissible manner.
+ SmallVector<WeakTrackingVH, 64> Users;
+
+ // If we are removing an alloca with a dbg.declare, insert dbg.value calls
+ // before each store.
+ TinyPtrVector<DbgVariableIntrinsic *> DIIs;
+ std::unique_ptr<DIBuilder> DIB;
+ if (isa<AllocaInst>(MI)) {
+ DIIs = FindDbgAddrUses(&MI);
+ DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
+ }
+
+ if (isAllocSiteRemovable(&MI, Users, &TLI)) {
+ for (unsigned i = 0, e = Users.size(); i != e; ++i) {
+ // Lower all @llvm.objectsize calls first because they may
+ // use a bitcast/GEP of the alloca we are removing.
+ if (!Users[i])
+ continue;
+
+ Instruction *I = cast<Instruction>(&*Users[i]);
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::objectsize) {
+ Value *Result =
+ lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/true);
+ replaceInstUsesWith(*I, Result);
+ eraseInstFromFunction(*I);
+ Users[i] = nullptr; // Skip examining in the next loop.
+ }
+ }
+ }
+ for (unsigned i = 0, e = Users.size(); i != e; ++i) {
+ if (!Users[i])
+ continue;
+
+ Instruction *I = cast<Instruction>(&*Users[i]);
+
+ if (ICmpInst *C = dyn_cast<ICmpInst>(I)) {
+ replaceInstUsesWith(*C,
+ ConstantInt::get(Type::getInt1Ty(C->getContext()),
+ C->isFalseWhenEqual()));
+ } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I) ||
+ isa<AddrSpaceCastInst>(I)) {
+ replaceInstUsesWith(*I, UndefValue::get(I->getType()));
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ for (auto *DII : DIIs)
+ ConvertDebugDeclareToDebugValue(DII, SI, *DIB);
+ }
+ eraseInstFromFunction(*I);
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) {
+ // Replace invoke with a NOP intrinsic to maintain the original CFG
+ Module *M = II->getModule();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing);
+ InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),
+ None, "", II->getParent());
+ }
+
+ for (auto *DII : DIIs)
+ eraseInstFromFunction(*DII);
+
+ return eraseInstFromFunction(MI);
+ }
+ return nullptr;
+}
+
+/// Move the call to free before a NULL test.
+///
+/// Check if this free is reached only after its argument has been tested
+/// against NULL (property 0).
+/// If yes, it is legal to move this call into its predecessor block.
+///
+/// The move is performed only if the block containing the call to free
+/// will be removed, i.e.:
+/// 1. it has only one predecessor P, and P has two successors
+/// 2. it contains the call, noops, and an unconditional branch
+/// 3. its successor is the same as its predecessor's successor
+///
+/// Profitability is not a concern here; this function should be called only
+/// if the caller knows this transformation would be profitable (e.g., for
+/// code size).
+static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
+ const DataLayout &DL) {
+ Value *Op = FI.getArgOperand(0);
+ BasicBlock *FreeInstrBB = FI.getParent();
+ BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor();
+
+ // Validate part of constraint #1: Only one predecessor
+ // FIXME: We could handle more than one predecessor, but in that case we
+ // would duplicate the call to free in each predecessor, which may not be
+ // profitable even for code size.
+ if (!PredBB)
+ return nullptr;
+
+ // Validate constraint #2: Does this block contain only the call to
+ // free, noops, and an unconditional branch?
+ BasicBlock *SuccBB;
+ Instruction *FreeInstrBBTerminator = FreeInstrBB->getTerminator();
+ if (!match(FreeInstrBBTerminator, m_UnconditionalBr(SuccBB)))
+ return nullptr;
+
+ // If there are only 2 instructions in the block, at this point,
+ // these are the call to free and the unconditional branch.
+ // If there are more than 2 instructions, check that they are noops,
+ // i.e., they won't hurt the performance of the generated code.
+ if (FreeInstrBB->size() != 2) {
+ for (const Instruction &Inst : *FreeInstrBB) {
+ if (&Inst == &FI || &Inst == FreeInstrBBTerminator)
+ continue;
+ auto *Cast = dyn_cast<CastInst>(&Inst);
+ if (!Cast || !Cast->isNoopCast(DL))
+ return nullptr;
+ }
+ }
+ // Validate the rest of constraint #1 by matching on the pred branch.
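+ // The shape being matched looks roughly like this (block and value names
+ // are illustrative only):
+ //   pred:
+ //     %cmp = icmp eq i8* %op, null
+ //     br i1 %cmp, label %succ, label %free.block
+ //   free.block:
+ //     call void @free(i8* %op)
+ //     br label %succ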
+ Instruction *TI = PredBB->getTerminator();
+ BasicBlock *TrueBB, *FalseBB;
+ ICmpInst::Predicate Pred;
+ if (!match(TI, m_Br(m_ICmp(Pred,
+ m_CombineOr(m_Specific(Op),
+ m_Specific(Op->stripPointerCasts())),
+ m_Zero()),
+ TrueBB, FalseBB)))
+ return nullptr;
+ if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+
+ // Validate constraint #3: Ensure the null case just falls through.
+ if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB))
+ return nullptr;
+ assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) &&
+ "Broken CFG: missing edge from predecessor to successor");
+
+ // At this point, we know that everything in FreeInstrBB can be moved
+ // before TI.
+ for (BasicBlock::iterator It = FreeInstrBB->begin(), End = FreeInstrBB->end();
+ It != End;) {
+ Instruction &Instr = *It++;
+ if (&Instr == FreeInstrBBTerminator)
+ break;
+ Instr.moveBefore(TI);
+ }
+ assert(FreeInstrBB->size() == 1 &&
+ "Only the branch instruction should remain");
+ return &FI;
+}
+
+Instruction *InstCombiner::visitFree(CallInst &FI) {
+ Value *Op = FI.getArgOperand(0);
+
+ // free undef -> unreachable.
+ if (isa<UndefValue>(Op)) {
+ // Leave a marker since we can't modify the CFG here.
+ CreateNonTerminatorUnreachable(&FI);
+ return eraseInstFromFunction(FI);
+ }
+
+ // If we have 'free null', delete the instruction. This can happen in STL
+ // code when lots of inlining happens.
+ if (isa<ConstantPointerNull>(Op))
+ return eraseInstFromFunction(FI);
+
+ // If we optimize for code size, try to move the call to free before the null
+ // test so that simplify cfg can remove the empty block and dead code
+ // elimination can remove the branch. I.e., it helps to turn something like:
+ // if (foo) free(foo);
+ // into
+ // free(foo);
+ if (MinimizeSize)
+ if (Instruction *I = tryToMoveFreeBeforeNullTest(FI, DL))
+ return I;
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) {
+ if (RI.getNumOperands() == 0) // ret void
+ return nullptr;
+
+ Value *ResultOp = RI.getOperand(0);
+ Type *VTy = ResultOp->getType();
+ if (!VTy->isIntegerTy())
+ return nullptr;
+
+ // There might be assume intrinsics dominating this return that completely
+ // determine the value. If so, constant fold it.
+ KnownBits Known = computeKnownBits(ResultOp, 0, &RI);
+ if (Known.isConstant())
+ RI.setOperand(0, Constant::getIntegerValue(VTy, Known.getConstant()));
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
+ // Change br (not X), label True, label False to: br X, label False, True
+ Value *X = nullptr;
+ if (match(&BI, m_Br(m_Not(m_Value(X)), m_BasicBlock(), m_BasicBlock())) &&
+ !isa<Constant>(X)) {
+ // Swap destinations and condition...
+ BI.setCondition(X);
+ BI.swapSuccessors();
+ return &BI;
+ }
+
+ // If the condition is irrelevant, remove the use so that other
+ // transforms on the condition become more effective.
+ if (BI.isConditional() && !isa<ConstantInt>(BI.getCondition()) &&
+ BI.getSuccessor(0) == BI.getSuccessor(1)) {
+ BI.setCondition(ConstantInt::getFalse(BI.getCondition()->getType()));
+ return &BI;
+ }
+
+ // Canonicalize, for example, icmp_ne -> icmp_eq or fcmp_une -> fcmp_oeq.
+ CmpInst::Predicate Pred;
+ if (match(&BI, m_Br(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())),
+ m_BasicBlock(), m_BasicBlock())) &&
+ !isCanonicalPredicate(Pred)) {
+ // Swap destinations and condition.
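+ // E.g., br (icmp ne %a, %b), %T, %F becomes br (icmp eq %a, %b), %F, %T
+ // (names illustrative); ne is non-canonical, so the predicate is inverted
+ // and the successors are swapped to compensate.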
+ CmpInst *Cond = cast<CmpInst>(BI.getCondition()); + Cond->setPredicate(CmpInst::getInversePredicate(Pred)); + BI.swapSuccessors(); + Worklist.Add(Cond); + return &BI; + } + + return nullptr; +} + +Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { + Value *Cond = SI.getCondition(); + Value *Op0; + ConstantInt *AddRHS; + if (match(Cond, m_Add(m_Value(Op0), m_ConstantInt(AddRHS)))) { + // Change 'switch (X+4) case 1:' into 'switch (X) case -3'. + for (auto Case : SI.cases()) { + Constant *NewCase = ConstantExpr::getSub(Case.getCaseValue(), AddRHS); + assert(isa<ConstantInt>(NewCase) && + "Result of expression should be constant"); + Case.setValue(cast<ConstantInt>(NewCase)); + } + SI.setCondition(Op0); + return &SI; + } + + KnownBits Known = computeKnownBits(Cond, 0, &SI); + unsigned LeadingKnownZeros = Known.countMinLeadingZeros(); + unsigned LeadingKnownOnes = Known.countMinLeadingOnes(); + + // Compute the number of leading bits we can ignore. + // TODO: A better way to determine this would use ComputeNumSignBits(). + for (auto &C : SI.cases()) { + LeadingKnownZeros = std::min( + LeadingKnownZeros, C.getCaseValue()->getValue().countLeadingZeros()); + LeadingKnownOnes = std::min( + LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes()); + } + + unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes); + + // Shrink the condition operand if the new type is smaller than the old type. + // But do not shrink to a non-standard type, because backend can't generate + // good code for that yet. + // TODO: We can make it aggressive again after fixing PR39569. + if (NewWidth > 0 && NewWidth < Known.getBitWidth() && + shouldChangeType(Known.getBitWidth(), NewWidth)) { + IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth); + Builder.SetInsertPoint(&SI); + Value *NewCond = Builder.CreateTrunc(Cond, Ty, "trunc"); + SI.setCondition(NewCond); + + for (auto Case : SI.cases()) { + APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth); + Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase)); + } + return &SI; + } + + return nullptr; +} + +Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { + Value *Agg = EV.getAggregateOperand(); + + if (!EV.hasIndices()) + return replaceInstUsesWith(EV, Agg); + + if (Value *V = SimplifyExtractValueInst(Agg, EV.getIndices(), + SQ.getWithInstruction(&EV))) + return replaceInstUsesWith(EV, V); + + if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) { + // We're extracting from an insertvalue instruction, compare the indices + const unsigned *exti, *exte, *insi, *inse; + for (exti = EV.idx_begin(), insi = IV->idx_begin(), + exte = EV.idx_end(), inse = IV->idx_end(); + exti != exte && insi != inse; + ++exti, ++insi) { + if (*insi != *exti) + // The insert and extract both reference distinctly different elements. + // This means the extract is not influenced by the insert, and we can + // replace the aggregate operand of the extract with the aggregate + // operand of the insert. i.e., replace + // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1 + // %E = extractvalue { i32, { i32 } } %I, 0 + // with + // %E = extractvalue { i32, { i32 } } %A, 0 + return ExtractValueInst::Create(IV->getAggregateOperand(), + EV.getIndices()); + } + if (exti == exte && insi == inse) + // Both iterators are at the end: Index lists are identical. 
Replace + // %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0 + // %C = extractvalue { i32, { i32 } } %B, 1, 0 + // with "i32 42" + return replaceInstUsesWith(EV, IV->getInsertedValueOperand()); + if (exti == exte) { + // The extract list is a prefix of the insert list. i.e. replace + // %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0 + // %E = extractvalue { i32, { i32 } } %I, 1 + // with + // %X = extractvalue { i32, { i32 } } %A, 1 + // %E = insertvalue { i32 } %X, i32 42, 0 + // by switching the order of the insert and extract (though the + // insertvalue should be left in, since it may have other uses). + Value *NewEV = Builder.CreateExtractValue(IV->getAggregateOperand(), + EV.getIndices()); + return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(), + makeArrayRef(insi, inse)); + } + if (insi == inse) + // The insert list is a prefix of the extract list + // We can simply remove the common indices from the extract and make it + // operate on the inserted value instead of the insertvalue result. + // i.e., replace + // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1 + // %E = extractvalue { i32, { i32 } } %I, 1, 0 + // with + // %E extractvalue { i32 } { i32 42 }, 0 + return ExtractValueInst::Create(IV->getInsertedValueOperand(), + makeArrayRef(exti, exte)); + } + if (WithOverflowInst *WO = dyn_cast<WithOverflowInst>(Agg)) { + // We're extracting from an overflow intrinsic, see if we're the only user, + // which allows us to simplify multiple result intrinsics to simpler + // things that just get one value. + if (WO->hasOneUse()) { + // Check if we're grabbing only the result of a 'with overflow' intrinsic + // and replace it with a traditional binary instruction. + if (*EV.idx_begin() == 0) { + Instruction::BinaryOps BinOp = WO->getBinaryOp(); + Value *LHS = WO->getLHS(), *RHS = WO->getRHS(); + replaceInstUsesWith(*WO, UndefValue::get(WO->getType())); + eraseInstFromFunction(*WO); + return BinaryOperator::Create(BinOp, LHS, RHS); + } + + // If the normal result of the add is dead, and the RHS is a constant, + // we can transform this into a range comparison. + // overflow = uadd a, -4 --> overflow = icmp ugt a, 3 + if (WO->getIntrinsicID() == Intrinsic::uadd_with_overflow) + if (ConstantInt *CI = dyn_cast<ConstantInt>(WO->getRHS())) + return new ICmpInst(ICmpInst::ICMP_UGT, WO->getLHS(), + ConstantExpr::getNot(CI)); + } + } + if (LoadInst *L = dyn_cast<LoadInst>(Agg)) + // If the (non-volatile) load only has one use, we can rewrite this to a + // load from a GEP. This reduces the size of the load. If a load is used + // only by extractvalue instructions then this either must have been + // optimized before, or it is a struct with padding, in which case we + // don't want to do the transformation as it loses padding knowledge. + if (L->isSimple() && L->hasOneUse()) { + // extractvalue has integer indices, getelementptr has Value*s. Convert. + SmallVector<Value*, 4> Indices; + // Prefix an i32 0 since we need the first element. + Indices.push_back(Builder.getInt32(0)); + for (ExtractValueInst::idx_iterator I = EV.idx_begin(), E = EV.idx_end(); + I != E; ++I) + Indices.push_back(Builder.getInt32(*I)); + + // We need to insert these at the location of the old load, not at that of + // the extractvalue. 
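+ // Roughly (types and names invented for illustration):
+ //   %L = load { i32, i32 }, { i32, i32 }* %p
+ //   %E = extractvalue { i32, i32 } %L, 1
+ // becomes
+ //   %gep = getelementptr inbounds { i32, i32 }, { i32, i32 }* %p, i32 0, i32 1
+ //   %E = load i32, i32* %gep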
+ Builder.SetInsertPoint(L); + Value *GEP = Builder.CreateInBoundsGEP(L->getType(), + L->getPointerOperand(), Indices); + Instruction *NL = Builder.CreateLoad(EV.getType(), GEP); + // Whatever aliasing information we had for the orignal load must also + // hold for the smaller load, so propagate the annotations. + AAMDNodes Nodes; + L->getAAMetadata(Nodes); + NL->setAAMetadata(Nodes); + // Returning the load directly will cause the main loop to insert it in + // the wrong spot, so use replaceInstUsesWith(). + return replaceInstUsesWith(EV, NL); + } + // We could simplify extracts from other values. Note that nested extracts may + // already be simplified implicitly by the above: extract (extract (insert) ) + // will be translated into extract ( insert ( extract ) ) first and then just + // the value inserted, if appropriate. Similarly for extracts from single-use + // loads: extract (extract (load)) will be translated to extract (load (gep)) + // and if again single-use then via load (gep (gep)) to load (gep). + // However, double extracts from e.g. function arguments or return values + // aren't handled yet. + return nullptr; +} + +/// Return 'true' if the given typeinfo will match anything. +static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) { + switch (Personality) { + case EHPersonality::GNU_C: + case EHPersonality::GNU_C_SjLj: + case EHPersonality::Rust: + // The GCC C EH and Rust personality only exists to support cleanups, so + // it's not clear what the semantics of catch clauses are. + return false; + case EHPersonality::Unknown: + return false; + case EHPersonality::GNU_Ada: + // While __gnat_all_others_value will match any Ada exception, it doesn't + // match foreign exceptions (or didn't, before gcc-4.7). + return false; + case EHPersonality::GNU_CXX: + case EHPersonality::GNU_CXX_SjLj: + case EHPersonality::GNU_ObjC: + case EHPersonality::MSVC_X86SEH: + case EHPersonality::MSVC_Win64SEH: + case EHPersonality::MSVC_CXX: + case EHPersonality::CoreCLR: + case EHPersonality::Wasm_CXX: + return TypeInfo->isNullValue(); + } + llvm_unreachable("invalid enum"); +} + +static bool shorter_filter(const Value *LHS, const Value *RHS) { + return + cast<ArrayType>(LHS->getType())->getNumElements() + < + cast<ArrayType>(RHS->getType())->getNumElements(); +} + +Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { + // The logic here should be correct for any real-world personality function. + // However if that turns out not to be true, the offending logic can always + // be conditioned on the personality function, like the catch-all logic is. + EHPersonality Personality = + classifyEHPersonality(LI.getParent()->getParent()->getPersonalityFn()); + + // Simplify the list of clauses, eg by removing repeated catch clauses + // (these are often created by inlining). + bool MakeNewInstruction = false; // If true, recreate using the following: + SmallVector<Constant *, 16> NewClauses; // - Clauses for the new instruction; + bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup. + + SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already. + for (unsigned i = 0, e = LI.getNumClauses(); i != e; ++i) { + bool isLastClause = i + 1 == e; + if (LI.isCatch(i)) { + // A catch clause. + Constant *CatchClause = LI.getClause(i); + Constant *TypeInfo = CatchClause->stripPointerCasts(); + + // If we already saw this clause, there is no point in having a second + // copy of it. 
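
Aside, not part of the patch: the AlreadyCaught loop above uses the usual set-insert idiom, where insert(...).second reports whether the element was new, to keep the first copy of each catch clause in order and to remember that a rewrite is needed whenever a repeat was dropped. A standalone sketch of the same bookkeeping with standard containers (all names are made up; std::unordered_set stands in for SmallPtrSet, strings stand in for typeinfo constants):

#include <cassert>
#include <string>
#include <unordered_set>
#include <vector>

static std::vector<std::string>
dedupKeepOrder(const std::vector<std::string> &Clauses, bool &Changed) {
  std::unordered_set<std::string> Seen;
  std::vector<std::string> Kept;
  Changed = false;
  for (const std::string &C : Clauses) {
    if (Seen.insert(C).second)  // first time this clause is seen: keep it
      Kept.push_back(C);
    else
      Changed = true;           // repeated clause: drop it, rebuild later
  }
  return Kept;
}

int main() {
  bool Changed = false;
  std::vector<std::string> Out =
      dedupKeepOrder({"tiA", "tiB", "tiA", "tiC"}, Changed);
  assert(Changed);
  assert((Out == std::vector<std::string>{"tiA", "tiB", "tiC"}));
  return 0;
}
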
+ if (AlreadyCaught.insert(TypeInfo).second) { + // This catch clause was not already seen. + NewClauses.push_back(CatchClause); + } else { + // Repeated catch clause - drop the redundant copy. + MakeNewInstruction = true; + } + + // If this is a catch-all then there is no point in keeping any following + // clauses or marking the landingpad as having a cleanup. + if (isCatchAll(Personality, TypeInfo)) { + if (!isLastClause) + MakeNewInstruction = true; + CleanupFlag = false; + break; + } + } else { + // A filter clause. If any of the filter elements were already caught + // then they can be dropped from the filter. It is tempting to try to + // exploit the filter further by saying that any typeinfo that does not + // occur in the filter can't be caught later (and thus can be dropped). + // However this would be wrong, since typeinfos can match without being + // equal (for example if one represents a C++ class, and the other some + // class derived from it). + assert(LI.isFilter(i) && "Unsupported landingpad clause!"); + Constant *FilterClause = LI.getClause(i); + ArrayType *FilterType = cast<ArrayType>(FilterClause->getType()); + unsigned NumTypeInfos = FilterType->getNumElements(); + + // An empty filter catches everything, so there is no point in keeping any + // following clauses or marking the landingpad as having a cleanup. By + // dealing with this case here the following code is made a bit simpler. + if (!NumTypeInfos) { + NewClauses.push_back(FilterClause); + if (!isLastClause) + MakeNewInstruction = true; + CleanupFlag = false; + break; + } + + bool MakeNewFilter = false; // If true, make a new filter. + SmallVector<Constant *, 16> NewFilterElts; // New elements. + if (isa<ConstantAggregateZero>(FilterClause)) { + // Not an empty filter - it contains at least one null typeinfo. + assert(NumTypeInfos > 0 && "Should have handled empty filter already!"); + Constant *TypeInfo = + Constant::getNullValue(FilterType->getElementType()); + // If this typeinfo is a catch-all then the filter can never match. + if (isCatchAll(Personality, TypeInfo)) { + // Throw the filter away. + MakeNewInstruction = true; + continue; + } + + // There is no point in having multiple copies of this typeinfo, so + // discard all but the first copy if there is more than one. + NewFilterElts.push_back(TypeInfo); + if (NumTypeInfos > 1) + MakeNewFilter = true; + } else { + ConstantArray *Filter = cast<ConstantArray>(FilterClause); + SmallPtrSet<Value *, 16> SeenInFilter; // For uniquing the elements. + NewFilterElts.reserve(NumTypeInfos); + + // Remove any filter elements that were already caught or that already + // occurred in the filter. While there, see if any of the elements are + // catch-alls. If so, the filter can be discarded. + bool SawCatchAll = false; + for (unsigned j = 0; j != NumTypeInfos; ++j) { + Constant *Elt = Filter->getOperand(j); + Constant *TypeInfo = Elt->stripPointerCasts(); + if (isCatchAll(Personality, TypeInfo)) { + // This element is a catch-all. Bail out, noting this fact. + SawCatchAll = true; + break; + } + + // Even if we've seen a type in a catch clause, we don't want to + // remove it from the filter. An unexpected type handler may be + // set up for a call site which throws an exception of the same + // type caught. In order for the exception thrown by the unexpected + // handler to propagate correctly, the filter must be correctly + // described for the call site. 
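
Aside, not part of the patch: a standalone sketch of how a single filter clause is pruned in the code above, with strings standing in for typeinfo constants and every name made up. A catch-all element means the filter can never match and is discarded outright, duplicate elements are dropped, and a new filter is only built when at least one element was actually removed:

#include <cassert>
#include <functional>
#include <string>
#include <unordered_set>
#include <vector>

struct FilterResult {
  bool DiscardFilter = false;        // saw a catch-all element
  bool NeedNewFilter = false;        // dropped at least one element
  std::vector<std::string> Elts;
};

static FilterResult
pruneFilter(const std::vector<std::string> &Filter,
            const std::function<bool(const std::string &)> &IsCatchAll) {
  FilterResult R;
  std::unordered_set<std::string> Seen;
  for (const std::string &TI : Filter) {
    if (IsCatchAll(TI)) {            // the filter can never match: drop it
      R.DiscardFilter = true;
      return R;
    }
    if (Seen.insert(TI).second)      // keep only the first copy
      R.Elts.push_back(TI);
  }
  R.NeedNewFilter = R.Elts.size() < Filter.size();
  return R;
}

int main() {
  auto IsCatchAll = [](const std::string &TI) { return TI == "null"; };
  FilterResult A = pruneFilter({"tiX", "tiX", "tiY"}, IsCatchAll);
  assert(!A.DiscardFilter && A.NeedNewFilter && A.Elts.size() == 2);
  FilterResult B = pruneFilter({"tiX", "null"}, IsCatchAll);
  assert(B.DiscardFilter);
  return 0;
}
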
+ // + // Example: + // + // void unexpected() { throw 1;} + // void foo() throw (int) { + // std::set_unexpected(unexpected); + // try { + // throw 2.0; + // } catch (int i) {} + // } + + // There is no point in having multiple copies of the same typeinfo in + // a filter, so only add it if we didn't already. + if (SeenInFilter.insert(TypeInfo).second) + NewFilterElts.push_back(cast<Constant>(Elt)); + } + // A filter containing a catch-all cannot match anything by definition. + if (SawCatchAll) { + // Throw the filter away. + MakeNewInstruction = true; + continue; + } + + // If we dropped something from the filter, make a new one. + if (NewFilterElts.size() < NumTypeInfos) + MakeNewFilter = true; + } + if (MakeNewFilter) { + FilterType = ArrayType::get(FilterType->getElementType(), + NewFilterElts.size()); + FilterClause = ConstantArray::get(FilterType, NewFilterElts); + MakeNewInstruction = true; + } + + NewClauses.push_back(FilterClause); + + // If the new filter is empty then it will catch everything so there is + // no point in keeping any following clauses or marking the landingpad + // as having a cleanup. The case of the original filter being empty was + // already handled above. + if (MakeNewFilter && !NewFilterElts.size()) { + assert(MakeNewInstruction && "New filter but not a new instruction!"); + CleanupFlag = false; + break; + } + } + } + + // If several filters occur in a row then reorder them so that the shortest + // filters come first (those with the smallest number of elements). This is + // advantageous because shorter filters are more likely to match, speeding up + // unwinding, but mostly because it increases the effectiveness of the other + // filter optimizations below. + for (unsigned i = 0, e = NewClauses.size(); i + 1 < e; ) { + unsigned j; + // Find the maximal 'j' s.t. the range [i, j) consists entirely of filters. + for (j = i; j != e; ++j) + if (!isa<ArrayType>(NewClauses[j]->getType())) + break; + + // Check whether the filters are already sorted by length. We need to know + // if sorting them is actually going to do anything so that we only make a + // new landingpad instruction if it does. + for (unsigned k = i; k + 1 < j; ++k) + if (shorter_filter(NewClauses[k+1], NewClauses[k])) { + // Not sorted, so sort the filters now. Doing an unstable sort would be + // correct too but reordering filters pointlessly might confuse users. + std::stable_sort(NewClauses.begin() + i, NewClauses.begin() + j, + shorter_filter); + MakeNewInstruction = true; + break; + } + + // Look for the next batch of filters. + i = j + 1; + } + + // If typeinfos matched if and only if equal, then the elements of a filter L + // that occurs later than a filter F could be replaced by the intersection of + // the elements of F and L. In reality two typeinfos can match without being + // equal (for example if one represents a C++ class, and the other some class + // derived from it) so it would be wrong to perform this transform in general. + // However the transform is correct and useful if F is a subset of L. In that + // case L can be replaced by F, and thus removed altogether since repeating a + // filter is pointless. So here we look at all pairs of filters F and L where + // L follows F in the list of clauses, and remove L if every element of F is + // an element of L. This can occur when inlining C++ functions with exception + // specifications. + for (unsigned i = 0; i + 1 < NewClauses.size(); ++i) { + // Examine each filter in turn. 
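
Aside, not part of the patch: the filter-sorting loop above only rebuilds the landingpad when sorting actually moves something, so it first checks whether the run is already ordered by length. A standalone sketch of that pattern, where vectors of ints stand in for filter clauses and, unlike the real code, every clause is assumed to be a filter so the run spans the whole list:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

static bool sortFilterRuns(std::vector<std::vector<int>> &Clauses) {
  bool Changed = false;
  auto ShorterFilter = [](const std::vector<int> &L,
                          const std::vector<int> &R) {
    return L.size() < R.size();
  };
  for (std::size_t I = 0, E = Clauses.size(); I + 1 < E;) {
    // The real code first finds the maximal run [I, J) of filter clauses;
    // here everything is a filter, so the run is the whole list.
    std::size_t J = E;
    if (!std::is_sorted(Clauses.begin() + I, Clauses.begin() + J,
                        ShorterFilter)) {
      // Not already sorted: stable-sort so equal-length filters keep order.
      std::stable_sort(Clauses.begin() + I, Clauses.begin() + J,
                       ShorterFilter);
      Changed = true;
    }
    I = J + 1;  // look for the next run
  }
  return Changed;
}

int main() {
  std::vector<std::vector<int>> Clauses = {{1, 2, 3}, {4}, {5, 6}};
  assert(sortFilterRuns(Clauses));     // shortest filter moves to the front
  assert(Clauses[0].size() == 1 && Clauses[2].size() == 3);
  assert(!sortFilterRuns(Clauses));    // already sorted: no change reported
  return 0;
}
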
+ Value *Filter = NewClauses[i]; + ArrayType *FTy = dyn_cast<ArrayType>(Filter->getType()); + if (!FTy) + // Not a filter - skip it. + continue; + unsigned FElts = FTy->getNumElements(); + // Examine each filter following this one. Doing this backwards means that + // we don't have to worry about filters disappearing under us when removed. + for (unsigned j = NewClauses.size() - 1; j != i; --j) { + Value *LFilter = NewClauses[j]; + ArrayType *LTy = dyn_cast<ArrayType>(LFilter->getType()); + if (!LTy) + // Not a filter - skip it. + continue; + // If Filter is a subset of LFilter, i.e. every element of Filter is also + // an element of LFilter, then discard LFilter. + SmallVectorImpl<Constant *>::iterator J = NewClauses.begin() + j; + // If Filter is empty then it is a subset of LFilter. + if (!FElts) { + // Discard LFilter. + NewClauses.erase(J); + MakeNewInstruction = true; + // Move on to the next filter. + continue; + } + unsigned LElts = LTy->getNumElements(); + // If Filter is longer than LFilter then it cannot be a subset of it. + if (FElts > LElts) + // Move on to the next filter. + continue; + // At this point we know that LFilter has at least one element. + if (isa<ConstantAggregateZero>(LFilter)) { // LFilter only contains zeros. + // Filter is a subset of LFilter iff Filter contains only zeros (as we + // already know that Filter is not longer than LFilter). + if (isa<ConstantAggregateZero>(Filter)) { + assert(FElts <= LElts && "Should have handled this case earlier!"); + // Discard LFilter. + NewClauses.erase(J); + MakeNewInstruction = true; + } + // Move on to the next filter. + continue; + } + ConstantArray *LArray = cast<ConstantArray>(LFilter); + if (isa<ConstantAggregateZero>(Filter)) { // Filter only contains zeros. + // Since Filter is non-empty and contains only zeros, it is a subset of + // LFilter iff LFilter contains a zero. + assert(FElts > 0 && "Should have eliminated the empty filter earlier!"); + for (unsigned l = 0; l != LElts; ++l) + if (LArray->getOperand(l)->isNullValue()) { + // LFilter contains a zero - discard it. + NewClauses.erase(J); + MakeNewInstruction = true; + break; + } + // Move on to the next filter. + continue; + } + // At this point we know that both filters are ConstantArrays. Loop over + // operands to see whether every element of Filter is also an element of + // LFilter. Since filters tend to be short this is probably faster than + // using a method that scales nicely. + ConstantArray *FArray = cast<ConstantArray>(Filter); + bool AllFound = true; + for (unsigned f = 0; f != FElts; ++f) { + Value *FTypeInfo = FArray->getOperand(f)->stripPointerCasts(); + AllFound = false; + for (unsigned l = 0; l != LElts; ++l) { + Value *LTypeInfo = LArray->getOperand(l)->stripPointerCasts(); + if (LTypeInfo == FTypeInfo) { + AllFound = true; + break; + } + } + if (!AllFound) + break; + } + if (AllFound) { + // Discard LFilter. + NewClauses.erase(J); + MakeNewInstruction = true; + } + // Move on to the next filter. + } + } + + // If we changed any of the clauses, replace the old landingpad instruction + // with a new one. + if (MakeNewInstruction) { + LandingPadInst *NLI = LandingPadInst::Create(LI.getType(), + NewClauses.size()); + for (unsigned i = 0, e = NewClauses.size(); i != e; ++i) + NLI->addClause(NewClauses[i]); + // A landing pad with no clauses must have the cleanup flag set. It is + // theoretically possible, though highly unlikely, that we eliminated all + // clauses. If so, force the cleanup flag to true. 
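
Aside, not part of the patch: a standalone sketch of the subset-based cleanup above, with vectors of strings standing in for filters and all names made up. If every element of an earlier filter F also appears in a later filter L, then L is redundant and can be dropped; the later filters are walked backwards so that erasing one does not disturb the indices still to be visited:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

using Filter = std::vector<std::string>;

static bool isSubset(const Filter &F, const Filter &L) {
  if (F.size() > L.size())
    return false;  // a longer filter cannot be a subset
  return std::all_of(F.begin(), F.end(), [&](const std::string &TI) {
    return std::find(L.begin(), L.end(), TI) != L.end();
  });
}

static bool removeRedundantFilters(std::vector<Filter> &Filters) {
  bool Changed = false;
  for (std::size_t I = 0; I + 1 < Filters.size(); ++I)
    // Walk later filters backwards so erasing one leaves earlier indices
    // (which we still have to visit) untouched.
    for (std::size_t J = Filters.size() - 1; J != I; --J)
      if (isSubset(Filters[I], Filters[J])) {
        Filters.erase(Filters.begin() + J);
        Changed = true;
      }
  return Changed;
}

int main() {
  std::vector<Filter> Filters = {{"tiA"}, {"tiB", "tiA"}, {"tiC"}};
  assert(removeRedundantFilters(Filters));   // {"tiB","tiA"} is dropped
  assert(Filters.size() == 2 && Filters[1] == Filter{"tiC"});
  return 0;
}
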
+ if (NewClauses.empty()) + CleanupFlag = true; + NLI->setCleanup(CleanupFlag); + return NLI; + } + + // Even if none of the clauses changed, we may nonetheless have understood + // that the cleanup flag is pointless. Clear it if so. + if (LI.isCleanup() != CleanupFlag) { + assert(!CleanupFlag && "Adding a cleanup, not removing one?!"); + LI.setCleanup(CleanupFlag); + return &LI; + } + + return nullptr; +} + +/// Try to move the specified instruction from its current block into the +/// beginning of DestBlock, which can only happen if it's safe to move the +/// instruction past all of the instructions between it and the end of its +/// block. +static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { + assert(I->hasOneUse() && "Invariants didn't hold!"); + BasicBlock *SrcBlock = I->getParent(); + + // Cannot move control-flow-involving, volatile loads, vaarg, etc. + if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() || + I->isTerminator()) + return false; + + // Do not sink static or dynamic alloca instructions. Static allocas must + // remain in the entry block, and dynamic allocas must not be sunk in between + // a stacksave / stackrestore pair, which would incorrectly shorten its + // lifetime. + if (isa<AllocaInst>(I)) + return false; + + // Do not sink into catchswitch blocks. + if (isa<CatchSwitchInst>(DestBlock->getTerminator())) + return false; + + // Do not sink convergent call instructions. + if (auto *CI = dyn_cast<CallInst>(I)) { + if (CI->isConvergent()) + return false; + } + // We can only sink load instructions if there is nothing between the load and + // the end of block that could change the value. + if (I->mayReadFromMemory()) { + for (BasicBlock::iterator Scan = I->getIterator(), + E = I->getParent()->end(); + Scan != E; ++Scan) + if (Scan->mayWriteToMemory()) + return false; + } + BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt(); + I->moveBefore(&*InsertPos); + ++NumSunkInst; + + // Also sink all related debug uses from the source basic block. Otherwise we + // get debug use before the def. Attempt to salvage debug uses first, to + // maximise the range variables have location for. If we cannot salvage, then + // mark the location undef: we know it was supposed to receive a new location + // here, but that computation has been sunk. + SmallVector<DbgVariableIntrinsic *, 2> DbgUsers; + findDbgUsers(DbgUsers, I); + for (auto *DII : reverse(DbgUsers)) { + if (DII->getParent() == SrcBlock) { + if (isa<DbgDeclareInst>(DII)) { + // A dbg.declare instruction should not be cloned, since there can only be + // one per variable fragment. It should be left in the original place since + // sunk instruction is not an alloca(otherwise we could not be here). + // But we need to update arguments of dbg.declare instruction, so that it + // would not point into sunk instruction. + if (!isa<CastInst>(I)) + continue; // dbg.declare points at something it shouldn't + + DII->setOperand( + 0, MetadataAsValue::get(I->getContext(), + ValueAsMetadata::get(I->getOperand(0)))); + continue; + } + + // dbg.value is in the same basic block as the sunk inst, see if we can + // salvage it. Clone a new copy of the instruction: on success we need + // both salvaged and unsalvaged copies. 
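
Aside, not part of the patch: the last legality check above only allows a load to be sunk out of its block when nothing after it in that block may write memory. A schematic standalone restatement of that check over a made-up SimpleInst type:

#include <cassert>
#include <cstddef>
#include <vector>

struct SimpleInst {
  bool MayReadMemory;
  bool MayWriteMemory;
};

// Returns true if Block[Idx] (the sinking candidate, already known to have
// no side effects of its own) can be moved past the rest of its block.
static bool safeToSinkPastRestOfBlock(const std::vector<SimpleInst> &Block,
                                      std::size_t Idx) {
  if (!Block[Idx].MayReadMemory)
    return true;                     // nothing later can change its result
  for (std::size_t I = Idx; I < Block.size(); ++I)
    if (Block[I].MayWriteMemory)
      return false;                  // a later store could clobber the load
  return true;
}

int main() {
  std::vector<SimpleInst> Block = {
      {/*read=*/true, /*write=*/false},   // the load we want to sink
      {false, false},                     // pure arithmetic
      {false, true},                      // a store
  };
  assert(!safeToSinkPastRestOfBlock(Block, 0));  // a store follows the load
  assert(safeToSinkPastRestOfBlock(Block, 1));   // non-memory op is fine
  return 0;
}
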
+ SmallVector<DbgVariableIntrinsic *, 1> TmpUser{ + cast<DbgVariableIntrinsic>(DII->clone())}; + + if (!salvageDebugInfoForDbgValues(*I, TmpUser)) { + // We are unable to salvage: sink the cloned dbg.value, and mark the + // original as undef, terminating any earlier variable location. + LLVM_DEBUG(dbgs() << "SINK: " << *DII << '\n'); + TmpUser[0]->insertBefore(&*InsertPos); + Value *Undef = UndefValue::get(I->getType()); + DII->setOperand(0, MetadataAsValue::get(DII->getContext(), + ValueAsMetadata::get(Undef))); + } else { + // We successfully salvaged: place the salvaged dbg.value in the + // original location, and move the unmodified dbg.value to sink with + // the sunk inst. + TmpUser[0]->insertBefore(DII); + DII->moveBefore(&*InsertPos); + } + } + } + return true; +} + +bool InstCombiner::run() { + while (!Worklist.isEmpty()) { + Instruction *I = Worklist.RemoveOne(); + if (I == nullptr) continue; // skip null values. + + // Check to see if we can DCE the instruction. + if (isInstructionTriviallyDead(I, &TLI)) { + LLVM_DEBUG(dbgs() << "IC: DCE: " << *I << '\n'); + eraseInstFromFunction(*I); + ++NumDeadInst; + MadeIRChange = true; + continue; + } + + if (!DebugCounter::shouldExecute(VisitCounter)) + continue; + + // Instruction isn't dead, see if we can constant propagate it. + if (!I->use_empty() && + (I->getNumOperands() == 0 || isa<Constant>(I->getOperand(0)))) { + if (Constant *C = ConstantFoldInstruction(I, DL, &TLI)) { + LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I + << '\n'); + + // Add operands to the worklist. + replaceInstUsesWith(*I, C); + ++NumConstProp; + if (isInstructionTriviallyDead(I, &TLI)) + eraseInstFromFunction(*I); + MadeIRChange = true; + continue; + } + } + + // In general, it is possible for computeKnownBits to determine all bits in + // a value even when the operands are not all constants. + Type *Ty = I->getType(); + if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy()) { + KnownBits Known = computeKnownBits(I, /*Depth*/0, I); + if (Known.isConstant()) { + Constant *C = ConstantInt::get(Ty, Known.getConstant()); + LLVM_DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C + << " from: " << *I << '\n'); + + // Add operands to the worklist. + replaceInstUsesWith(*I, C); + ++NumConstProp; + if (isInstructionTriviallyDead(I, &TLI)) + eraseInstFromFunction(*I); + MadeIRChange = true; + continue; + } + } + + // See if we can trivially sink this instruction to a successor basic block. + if (EnableCodeSinking && I->hasOneUse()) { + BasicBlock *BB = I->getParent(); + Instruction *UserInst = cast<Instruction>(*I->user_begin()); + BasicBlock *UserParent; + + // Get the block the use occurs in. + if (PHINode *PN = dyn_cast<PHINode>(UserInst)) + UserParent = PN->getIncomingBlock(*I->use_begin()); + else + UserParent = UserInst->getParent(); + + if (UserParent != BB) { + bool UserIsSuccessor = false; + // See if the user is one of our successors. + for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) + if (*SI == UserParent) { + UserIsSuccessor = true; + break; + } + + // If the user is one of our immediate successors, and if that successor + // only has us as a predecessors (we'd have to split the critical edge + // otherwise), we can keep going. + if (UserIsSuccessor && UserParent->getUniquePredecessor()) { + // Okay, the CFG is simple enough, try to sink this instruction. 
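
Aside, not part of the patch: the ExpensiveCombines block above replaces an instruction with a constant whenever computeKnownBits can pin down every bit, even though the operands themselves are not constants. A standalone exhaustive check of one such case, ((x | 0x0f) & 0x0f), whose bits are all known regardless of x:

#include <cassert>

int main() {
  // Every bit of the result is known: the OR forces the low four bits to
  // one, and the AND clears everything else, so the value is always 0x0f.
  for (unsigned X = 0; X <= 0xFF; ++X)
    assert(((X | 0x0Fu) & 0x0Fu) == 0x0Fu);
  return 0;
}
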
+ if (TryToSinkInstruction(I, UserParent)) { + LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n'); + MadeIRChange = true; + // We'll add uses of the sunk instruction below, but since sinking + // can expose opportunities for it's *operands* add them to the + // worklist + for (Use &U : I->operands()) + if (Instruction *OpI = dyn_cast<Instruction>(U.get())) + Worklist.Add(OpI); + } + } + } + } + + // Now that we have an instruction, try combining it to simplify it. + Builder.SetInsertPoint(I); + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + +#ifndef NDEBUG + std::string OrigI; +#endif + LLVM_DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str();); + LLVM_DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n'); + + if (Instruction *Result = visit(*I)) { + ++NumCombined; + // Should we replace the old instruction with a new one? + if (Result != I) { + LLVM_DEBUG(dbgs() << "IC: Old = " << *I << '\n' + << " New = " << *Result << '\n'); + + if (I->getDebugLoc()) + Result->setDebugLoc(I->getDebugLoc()); + // Everything uses the new instruction now. + I->replaceAllUsesWith(Result); + + // Move the name to the new instruction first. + Result->takeName(I); + + // Push the new instruction and any users onto the worklist. + Worklist.AddUsersToWorkList(*Result); + Worklist.Add(Result); + + // Insert the new instruction into the basic block... + BasicBlock *InstParent = I->getParent(); + BasicBlock::iterator InsertPos = I->getIterator(); + + // If we replace a PHI with something that isn't a PHI, fix up the + // insertion point. + if (!isa<PHINode>(Result) && isa<PHINode>(InsertPos)) + InsertPos = InstParent->getFirstInsertionPt(); + + InstParent->getInstList().insert(InsertPos, Result); + + eraseInstFromFunction(*I); + } else { + LLVM_DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n' + << " New = " << *I << '\n'); + + // If the instruction was modified, it's possible that it is now dead. + // if so, remove it. + if (isInstructionTriviallyDead(I, &TLI)) { + eraseInstFromFunction(*I); + } else { + Worklist.AddUsersToWorkList(*I); + Worklist.Add(I); + } + } + MadeIRChange = true; + } + } + + Worklist.Zap(); + return MadeIRChange; +} + +/// Walk the function in depth-first order, adding all reachable code to the +/// worklist. +/// +/// This has a couple of tricks to make the code faster and more powerful. In +/// particular, we constant fold and DCE instructions as we go, to avoid adding +/// them to the worklist (this significantly speeds up instcombine on code where +/// many instructions are dead or constant). Additionally, if we find a branch +/// whose condition is a known constant, we only visit the reachable successors. +static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, + SmallPtrSetImpl<BasicBlock *> &Visited, + InstCombineWorklist &ICWorklist, + const TargetLibraryInfo *TLI) { + bool MadeIRChange = false; + SmallVector<BasicBlock*, 256> Worklist; + Worklist.push_back(BB); + + SmallVector<Instruction*, 128> InstrsForInstCombineWorklist; + DenseMap<Constant *, Constant *> FoldedConstants; + + do { + BB = Worklist.pop_back_val(); + + // We have now visited this block! If we've already been here, ignore it. + if (!Visited.insert(BB).second) + continue; + + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { + Instruction *Inst = &*BBI++; + + // DCE instruction if trivially dead. 
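
Aside, not part of the patch: a standalone sketch of the traversal being set up above, with a made-up adjacency-list CFG. Reachability is computed with an explicit worklist and a visited set, and a block that branches on a known-constant condition only contributes its taken successor, so statically dead blocks are never queued:

#include <cassert>
#include <set>
#include <vector>

struct Block {
  std::vector<int> Succs;  // successor block ids; Succs[0] is the true edge
  int ConstCond;           // -1: not a constant condition; 0/1: known value
};

static std::set<int> reachable(const std::vector<Block> &CFG, int Entry) {
  std::set<int> Visited;
  std::vector<int> Worklist{Entry};
  while (!Worklist.empty()) {
    int Id = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(Id).second)
      continue;                                 // already visited this block
    const Block &B = CFG[Id];
    if (B.ConstCond >= 0 && B.Succs.size() == 2) {
      // Conditional branch on a constant: only the taken edge is reachable.
      Worklist.push_back(B.Succs[B.ConstCond ? 0 : 1]);
      continue;
    }
    for (int S : B.Succs)
      Worklist.push_back(S);
  }
  return Visited;
}

int main() {
  // 0 -> {1, 2} with a known-true condition; 1 -> 3; 2 -> 3.
  std::vector<Block> CFG = {{{1, 2}, 1}, {{3}, -1}, {{3}, -1}, {{}, -1}};
  std::set<int> R = reachable(CFG, 0);
  assert(R.count(1) && R.count(3) && !R.count(2));  // block 2 is dead
  return 0;
}
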
+ if (isInstructionTriviallyDead(Inst, TLI)) { + ++NumDeadInst; + LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n'); + if (!salvageDebugInfo(*Inst)) + replaceDbgUsesWithUndef(Inst); + Inst->eraseFromParent(); + MadeIRChange = true; + continue; + } + + // ConstantProp instruction if trivially constant. + if (!Inst->use_empty() && + (Inst->getNumOperands() == 0 || isa<Constant>(Inst->getOperand(0)))) + if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) { + LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst + << '\n'); + Inst->replaceAllUsesWith(C); + ++NumConstProp; + if (isInstructionTriviallyDead(Inst, TLI)) + Inst->eraseFromParent(); + MadeIRChange = true; + continue; + } + + // See if we can constant fold its operands. + for (Use &U : Inst->operands()) { + if (!isa<ConstantVector>(U) && !isa<ConstantExpr>(U)) + continue; + + auto *C = cast<Constant>(U); + Constant *&FoldRes = FoldedConstants[C]; + if (!FoldRes) + FoldRes = ConstantFoldConstant(C, DL, TLI); + if (!FoldRes) + FoldRes = C; + + if (FoldRes != C) { + LLVM_DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst + << "\n Old = " << *C + << "\n New = " << *FoldRes << '\n'); + U = FoldRes; + MadeIRChange = true; + } + } + + // Skip processing debug intrinsics in InstCombine. Processing these call instructions + // consumes non-trivial amount of time and provides no value for the optimization. + if (!isa<DbgInfoIntrinsic>(Inst)) + InstrsForInstCombineWorklist.push_back(Inst); + } + + // Recursively visit successors. If this is a branch or switch on a + // constant, only visit the reachable successor. + Instruction *TI = BB->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) { + bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue(); + BasicBlock *ReachableBB = BI->getSuccessor(!CondVal); + Worklist.push_back(ReachableBB); + continue; + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) { + Worklist.push_back(SI->findCaseValue(Cond)->getCaseSuccessor()); + continue; + } + } + + for (BasicBlock *SuccBB : successors(TI)) + Worklist.push_back(SuccBB); + } while (!Worklist.empty()); + + // Once we've found all of the instructions to add to instcombine's worklist, + // add them in reverse order. This way instcombine will visit from the top + // of the function down. This jives well with the way that it adds all uses + // of instructions to the worklist after doing a transformation, thus avoiding + // some N^2 behavior in pathological cases. + ICWorklist.AddInitialGroup(InstrsForInstCombineWorklist); + + return MadeIRChange; +} + +/// Populate the IC worklist from a function, and prune any dead basic +/// blocks discovered in the process. +/// +/// This also does basic constant propagation and other forward fixing to make +/// the combiner itself run much faster. +static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL, + TargetLibraryInfo *TLI, + InstCombineWorklist &ICWorklist) { + bool MadeIRChange = false; + + // Do a depth-first traversal of the function, populate the worklist with + // the reachable instructions. Ignore blocks that are not reachable. Keep + // track of which blocks we visit. + SmallPtrSet<BasicBlock *, 32> Visited; + MadeIRChange |= + AddReachableCodeToWorklist(&F.front(), DL, Visited, ICWorklist, TLI); + + // Do a quick scan over the function. 
If we find any blocks that are + // unreachable, remove any instructions inside of them. This prevents + // the instcombine code from having to deal with some bad special cases. + for (BasicBlock &BB : F) { + if (Visited.count(&BB)) + continue; + + unsigned NumDeadInstInBB = removeAllNonTerminatorAndEHPadInstructions(&BB); + MadeIRChange |= NumDeadInstInBB > 0; + NumDeadInst += NumDeadInstInBB; + } + + return MadeIRChange; +} + +static bool combineInstructionsOverFunction( + Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, + AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, + OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, bool ExpensiveCombines = true, + LoopInfo *LI = nullptr) { + auto &DL = F.getParent()->getDataLayout(); + ExpensiveCombines |= EnableExpensiveCombines; + + /// Builder - This is an IRBuilder that automatically inserts new + /// instructions into the worklist when they are created. + IRBuilder<TargetFolder, IRBuilderCallbackInserter> Builder( + F.getContext(), TargetFolder(DL), + IRBuilderCallbackInserter([&Worklist, &AC](Instruction *I) { + Worklist.Add(I); + if (match(I, m_Intrinsic<Intrinsic::assume>())) + AC.registerAssumption(cast<CallInst>(I)); + })); + + // Lower dbg.declare intrinsics otherwise their value may be clobbered + // by instcombiner. + bool MadeIRChange = false; + if (ShouldLowerDbgDeclare) + MadeIRChange = LowerDbgDeclare(F); + + // Iterate while there is work to do. + int Iteration = 0; + while (true) { + ++Iteration; + LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " + << F.getName() << "\n"); + + MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist); + + InstCombiner IC(Worklist, Builder, F.hasMinSize(), ExpensiveCombines, AA, + AC, TLI, DT, ORE, BFI, PSI, DL, LI); + IC.MaxArraySizeForCombine = MaxArraySize; + + if (!IC.run()) + break; + } + + return MadeIRChange || Iteration > 1; +} + +PreservedAnalyses InstCombinePass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &AC = AM.getResult<AssumptionAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + + auto *LI = AM.getCachedResult<LoopAnalysis>(F); + + auto *AA = &AM.getResult<AAManager>(F); + const ModuleAnalysisManager &MAM = + AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); + ProfileSummaryInfo *PSI = + MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); + auto *BFI = (PSI && PSI->hasProfileSummary()) ? + &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr; + + if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, + BFI, PSI, ExpensiveCombines, LI)) + // No changes, all analyses are preserved. + return PreservedAnalyses::all(); + + // Mark all the analyses that instcombine updates as preserved. 
+ PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + PA.preserve<AAManager>(); + PA.preserve<BasicAA>(); + PA.preserve<GlobalsAA>(); + return PA; +} + +void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); +} + +bool InstructionCombiningPass::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + // Required analyses. + auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + + // Optional analyses. + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + ProfileSummaryInfo *PSI = + &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + BlockFrequencyInfo *BFI = + (PSI && PSI->hasProfileSummary()) ? + &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() : + nullptr; + + return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, + BFI, PSI, ExpensiveCombines, LI); +} + +char InstructionCombiningPass::ID = 0; + +INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine", + "Combine redundant instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine", + "Combine redundant instructions", false, false) + +// Initialization Routines +void llvm::initializeInstCombine(PassRegistry &Registry) { + initializeInstructionCombiningPassPass(Registry); +} + +void LLVMInitializeInstCombine(LLVMPassRegistryRef R) { + initializeInstructionCombiningPassPass(*unwrap(R)); +} + +FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines) { + return new InstructionCombiningPass(ExpensiveCombines); +} + +void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createInstructionCombiningPass()); +} |
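
Aside, not part of the patch: a standalone sketch of the driver shape used in combineInstructionsOverFunction above, with a toy combine step standing in for InstCombiner::run. One iteration is re-run until it reports that it made no change, and the overall return value mirrors MadeIRChange || Iteration > 1:

#include <cassert>
#include <cstddef>
#include <vector>

// Toy "combine": fold one pair of adjacent equal elements per call.
// Returns true if it changed anything, like one InstCombine iteration.
static bool combineOnce(std::vector<int> &Values) {
  for (std::size_t I = 0; I + 1 < Values.size(); ++I)
    if (Values[I] == Values[I + 1]) {
      Values[I] *= 2;
      Values.erase(Values.begin() + I + 1);
      return true;
    }
  return false;
}

static bool runToFixpoint(std::vector<int> &Values) {
  int Iteration = 0;
  bool Changed = false;
  while (true) {
    ++Iteration;
    if (!combineOnce(Values))
      break;                          // this iteration did nothing: fixpoint
    Changed = true;
  }
  return Changed || Iteration > 1;    // mirrors MadeIRChange || Iteration > 1
}

int main() {
  std::vector<int> V = {1, 1, 2, 3};  // 1+1 -> 2, then 2+2 -> 4
  assert(runToFixpoint(V));
  assert((V == std::vector<int>{4, 3}));
  std::vector<int> W = {5, 6};
  assert(!runToFixpoint(W));          // nothing to do: no change reported
  return 0;
}
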