author     Ed Schouten <ed@FreeBSD.org>    2009-06-23 14:50:01 +0000
committer  Ed Schouten <ed@FreeBSD.org>    2009-06-23 14:50:01 +0000
commit     0408e1d309a743aca4ed4592cf2c712a71537901 (patch)
tree       6be075b410677415707e0987e3a49123130cef22
parent     b2f21fb044b6b5c52cff6227f9f79ca4ed42b18f (diff)
Import LLVM r73954. (tag: vendor/llvm/llvm-r73954)
Notes:
    svn path=/vendor/llvm/dist/; revision=194710
    svn path=/vendor/llvm/llvm-r73954/; revision=194712; tag=vendor/llvm/llvm-r73954
-rw-r--r--  include/llvm/Analysis/IVUsers.h | 16
-rw-r--r--  include/llvm/Analysis/LoopVR.h | 4
-rw-r--r--  include/llvm/Analysis/ScalarEvolution.h | 283
-rw-r--r--  include/llvm/Analysis/ScalarEvolutionExpander.h | 9
-rw-r--r--  include/llvm/Analysis/ScalarEvolutionExpressions.h | 145
-rw-r--r--  include/llvm/CodeGen/BinaryObject.h | 6
-rw-r--r--  include/llvm/Intrinsics.td | 1
-rw-r--r--  include/llvm/IntrinsicsARM.td | 295
-rw-r--r--  include/llvm/Support/Timer.h | 1
-rw-r--r--  include/llvm/Target/TargetELFWriterInfo.h | 23
-rw-r--r--  lib/Analysis/BasicAliasAnalysis.cpp | 1
-rw-r--r--  lib/Analysis/IVUsers.cpp | 28
-rw-r--r--  lib/Analysis/LoopVR.cpp | 14
-rw-r--r--  lib/Analysis/ScalarEvolution.cpp | 703
-rw-r--r--  lib/Analysis/ScalarEvolutionExpander.cpp | 70
-rw-r--r--  lib/Analysis/ValueTracking.cpp | 6
-rw-r--r--  lib/CodeGen/ELF.h | 48
-rw-r--r--  lib/CodeGen/ELFCodeEmitter.cpp | 30
-rw-r--r--  lib/CodeGen/ELFWriter.cpp | 362
-rw-r--r--  lib/CodeGen/ELFWriter.h | 70
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 31
-rw-r--r--  lib/CodeGen/SimpleRegisterCoalescing.cpp | 10
-rw-r--r--  lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp | 13
-rw-r--r--  lib/Support/Annotation.cpp | 38
-rw-r--r--  lib/Support/PluginLoader.cpp | 5
-rw-r--r--  lib/Support/Statistic.cpp | 4
-rw-r--r--  lib/Support/Timer.cpp | 28
-rw-r--r--  lib/Target/ARM/ARMCallingConv.td | 41
-rw-r--r--  lib/Target/ARM/ARMISelDAGToDAG.cpp | 81
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 1077
-rw-r--r--  lib/Target/ARM/ARMISelLowering.h | 72
-rw-r--r--  lib/Target/ARM/ARMInstrFormats.td | 119
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.cpp | 8
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.h | 6
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.td | 169
-rw-r--r--  lib/Target/ARM/ARMInstrNEON.td | 1665
-rw-r--r--  lib/Target/ARM/ARMInstrThumb.td | 18
-rw-r--r--  lib/Target/ARM/ARMInstrThumb2.td | 2
-rw-r--r--  lib/Target/ARM/ARMRegisterInfo.cpp | 37
-rw-r--r--  lib/Target/ARM/ARMRegisterInfo.td | 146
-rw-r--r--  lib/Target/ARM/ARMSubtarget.cpp | 21
-rw-r--r--  lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp | 18
-rw-r--r--  lib/Target/ARM/README.txt | 20
-rw-r--r--  lib/Target/PIC16/PIC16ISelLowering.cpp | 6
-rw-r--r--  lib/Target/TargetData.cpp | 5
-rw-r--r--  lib/Target/X86/X86ELFWriterInfo.cpp | 46
-rw-r--r--  lib/Target/X86/X86ELFWriterInfo.h | 32
-rw-r--r--  lib/Transforms/Scalar/IndVarSimplify.cpp | 25
-rw-r--r--  lib/Transforms/Scalar/LoopDeletion.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp | 206
-rw-r--r--  test/Analysis/ScalarEvolution/pointer-sign-bits.ll | 220
-rw-r--r--  test/Analysis/ScalarEvolution/trip-count7.ll | 150
-rw-r--r--  test/CodeGen/ARM/2007-03-13-InstrSched.ll | 5
-rw-r--r--  test/CodeGen/ARM/2008-09-14-CoalescerBug.ll (renamed from test/CodeGen/ARM/2008-09-14-CoaleserBug.ll) | 0
-rw-r--r--  test/CodeGen/ARM/2009-06-02-ISelCrash.ll | 62
-rw-r--r--  test/CodeGen/ARM/2009-06-22-CoalescerBug.ll | 43
-rw-r--r--  test/CodeGen/ARM/arm-frameaddr.ll | 12
-rw-r--r--  test/CodeGen/ARM/neon_arith1.ll | 7
-rw-r--r--  test/CodeGen/ARM/neon_ld1.ll | 22
-rw-r--r--  test/CodeGen/ARM/neon_ld2.ll | 23
-rw-r--r--  test/CodeGen/ARM/vaba.ll | 119
-rw-r--r--  test/CodeGen/ARM/vabal.ll | 63
-rw-r--r--  test/CodeGen/ARM/vabd.ll | 126
-rw-r--r--  test/CodeGen/ARM/vabdl.ll | 57
-rw-r--r--  test/CodeGen/ARM/vabs.ll | 64
-rw-r--r--  test/CodeGen/ARM/vacge.ll | 19
-rw-r--r--  test/CodeGen/ARM/vacgt.ll | 19
-rw-r--r--  test/CodeGen/ARM/vadd.ll | 76
-rw-r--r--  test/CodeGen/ARM/vaddhn.ll | 29
-rw-r--r--  test/CodeGen/ARM/vaddl.ll | 57
-rw-r--r--  test/CodeGen/ARM/vaddw.ll | 57
-rw-r--r--  test/CodeGen/ARM/vand.ll | 59
-rw-r--r--  test/CodeGen/ARM/vbic.ll | 67
-rw-r--r--  test/CodeGen/ARM/vbsl.ll | 91
-rw-r--r--  test/CodeGen/ARM/vceq.ll | 61
-rw-r--r--  test/CodeGen/ARM/vcge.ll | 106
-rw-r--r--  test/CodeGen/ARM/vcgt.ll | 106
-rw-r--r--  test/CodeGen/ARM/vcls.ll | 48
-rw-r--r--  test/CodeGen/ARM/vclz.ll | 48
-rw-r--r--  test/CodeGen/ARM/vcnt.ll | 17
-rw-r--r--  test/CodeGen/ARM/vcvt.ll | 53
-rw-r--r--  test/CodeGen/ARM/vcvt_n.ll | 64
-rw-r--r--  test/CodeGen/ARM/vdup.ll | 134
-rw-r--r--  test/CodeGen/ARM/vdup_lane.ll | 52
-rw-r--r--  test/CodeGen/ARM/veor.ll | 59
-rw-r--r--  test/CodeGen/ARM/vfcmp.ll | 96
-rw-r--r--  test/CodeGen/ARM/vget_lane.ll | 78
-rw-r--r--  test/CodeGen/ARM/vhadd.ll | 107
-rw-r--r--  test/CodeGen/ARM/vhsub.ll | 107
-rw-r--r--  test/CodeGen/ARM/vicmp.ll | 85
-rw-r--r--  test/CodeGen/ARM/vmax.ll | 126
-rw-r--r--  test/CodeGen/ARM/vmin.ll | 126
-rw-r--r--  test/CodeGen/ARM/vmla.ll | 77
-rw-r--r--  test/CodeGen/ARM/vmlal.ll | 63
-rw-r--r--  test/CodeGen/ARM/vmls.ll | 77
-rw-r--r--  test/CodeGen/ARM/vmlsl.ll | 63
-rw-r--r--  test/CodeGen/ARM/vmov.ll | 101
-rw-r--r--  test/CodeGen/ARM/vmovl.ll | 51
-rw-r--r--  test/CodeGen/ARM/vmovn.ll | 26
-rw-r--r--  test/CodeGen/ARM/vmul.ll | 79
-rw-r--r--  test/CodeGen/ARM/vmull.ll | 67
-rw-r--r--  test/CodeGen/ARM/vmvn.ll | 51
-rw-r--r--  test/CodeGen/ARM/vneg.ll | 53
-rw-r--r--  test/CodeGen/ARM/vorn.ll | 67
-rw-r--r--  test/CodeGen/ARM/vorr.ll | 59
-rw-r--r--  test/CodeGen/ARM/vpadal.ll | 107
-rw-r--r--  test/CodeGen/ARM/vpadd.ll | 39
-rw-r--r--  test/CodeGen/ARM/vpaddl.ll | 95
-rw-r--r--  test/CodeGen/ARM/vpmax.ll | 67
-rw-r--r--  test/CodeGen/ARM/vpmin.ll | 67
-rw-r--r--  test/CodeGen/ARM/vqabs.ll | 48
-rw-r--r--  test/CodeGen/ARM/vqadd.ll | 141
-rw-r--r--  test/CodeGen/ARM/vqdmlal.ll | 22
-rw-r--r--  test/CodeGen/ARM/vqdmlsl.ll | 22
-rw-r--r--  test/CodeGen/ARM/vqdmulh.ll | 73
-rw-r--r--  test/CodeGen/ARM/vqdmull.ll | 20
-rw-r--r--  test/CodeGen/ARM/vqmovn.ll | 76
-rw-r--r--  test/CodeGen/ARM/vqneg.ll | 48
-rw-r--r--  test/CodeGen/ARM/vqrshl.ll | 141
-rw-r--r--  test/CodeGen/ARM/vqrshrn.ll | 76
-rw-r--r--  test/CodeGen/ARM/vqshl.ll | 307
-rw-r--r--  test/CodeGen/ARM/vqshrn.ll | 76
-rw-r--r--  test/CodeGen/ARM/vqsub.ll | 141
-rw-r--r--  test/CodeGen/ARM/vraddhn.ll | 29
-rw-r--r--  test/CodeGen/ARM/vrecpe.ll | 33
-rw-r--r--  test/CodeGen/ARM/vrecps.ll | 19
-rw-r--r--  test/CodeGen/ARM/vrhadd.ll | 107
-rw-r--r--  test/CodeGen/ARM/vrshl.ll | 245
-rw-r--r--  test/CodeGen/ARM/vrshrn.ll | 26
-rw-r--r--  test/CodeGen/ARM/vrsqrte.ll | 33
-rw-r--r--  test/CodeGen/ARM/vrsqrts.ll | 19
-rw-r--r--  test/CodeGen/ARM/vrsubhn.ll | 29
-rw-r--r--  test/CodeGen/ARM/vset_lane.ll | 40
-rw-r--r--  test/CodeGen/ARM/vshift.ll | 337
-rw-r--r--  test/CodeGen/ARM/vshiftins.ll | 131
-rw-r--r--  test/CodeGen/ARM/vshl.ll | 302
-rw-r--r--  test/CodeGen/ARM/vshll.ll | 74
-rw-r--r--  test/CodeGen/ARM/vshrn.ll | 26
-rw-r--r--  test/CodeGen/ARM/vsra.ll | 293
-rw-r--r--  test/CodeGen/ARM/vsub.ll | 76
-rw-r--r--  test/CodeGen/ARM/vsubhn.ll | 29
-rw-r--r--  test/CodeGen/ARM/vsubl.ll | 57
-rw-r--r--  test/CodeGen/ARM/vsubw.ll | 57
-rw-r--r--  test/CodeGen/ARM/vtst.ll | 52
-rw-r--r--  tools/llvm-mc/AsmLexer.cpp | 26
-rw-r--r--  tools/llvm-mc/AsmLexer.h | 5
-rw-r--r--  tools/llvm-mc/AsmParser.cpp | 60
-rw-r--r--  tools/llvm-mc/AsmParser.h | 1
-rw-r--r--  tools/lto/LTOCodeGenerator.cpp | 5
149 files changed, 12263 insertions, 1234 deletions
diff --git a/include/llvm/Analysis/IVUsers.h b/include/llvm/Analysis/IVUsers.h
index ac785d5c54a9..30a457ace811 100644
--- a/include/llvm/Analysis/IVUsers.h
+++ b/include/llvm/Analysis/IVUsers.h
@@ -34,7 +34,7 @@ class IVUsersOfOneStride;
class IVStrideUse : public CallbackVH, public ilist_node<IVStrideUse> {
public:
IVStrideUse(IVUsersOfOneStride *parent,
- const SCEVHandle &offset,
+ const SCEV* offset,
Instruction* U, Value *O)
: CallbackVH(U), Parent(parent), Offset(offset),
OperandValToReplace(O),
@@ -58,10 +58,10 @@ public:
/// getOffset - Return the offset to add to a theoeretical induction
/// variable that starts at zero and counts up by the stride to compute
/// the value for the use. This always has the same type as the stride.
- SCEVHandle getOffset() const { return Offset; }
+ const SCEV* getOffset() const { return Offset; }
/// setOffset - Assign a new offset to this use.
- void setOffset(SCEVHandle Val) {
+ void setOffset(const SCEV* Val) {
Offset = Val;
}
@@ -96,7 +96,7 @@ private:
IVUsersOfOneStride *Parent;
/// Offset - The offset to add to the base induction expression.
- SCEVHandle Offset;
+ const SCEV* Offset;
/// OperandValToReplace - The Value of the operand in the user instruction
/// that this IVStrideUse is representing.
@@ -158,7 +158,7 @@ public:
/// initial value and the operand that uses the IV.
ilist<IVStrideUse> Users;
- void addUser(const SCEVHandle &Offset, Instruction *User, Value *Operand) {
+ void addUser(const SCEV* Offset, Instruction *User, Value *Operand) {
Users.push_back(new IVStrideUse(this, Offset, User, Operand));
}
};
@@ -178,12 +178,12 @@ public:
/// IVUsesByStride - A mapping from the strides in StrideOrder to the
/// uses in IVUses.
- std::map<SCEVHandle, IVUsersOfOneStride*> IVUsesByStride;
+ std::map<const SCEV*, IVUsersOfOneStride*> IVUsesByStride;
/// StrideOrder - An ordering of the keys in IVUsesByStride that is stable:
/// We use this to iterate over the IVUsesByStride collection without being
/// dependent on random ordering of pointers in the process.
- SmallVector<SCEVHandle, 16> StrideOrder;
+ SmallVector<const SCEV*, 16> StrideOrder;
private:
virtual void getAnalysisUsage(AnalysisUsage &AU) const;
@@ -203,7 +203,7 @@ public:
/// getReplacementExpr - Return a SCEV expression which computes the
/// value of the OperandValToReplace of the given IVStrideUse.
- SCEVHandle getReplacementExpr(const IVStrideUse &U) const;
+ const SCEV* getReplacementExpr(const IVStrideUse &U) const;
void print(raw_ostream &OS, const Module* = 0) const;
virtual void print(std::ostream &OS, const Module* = 0) const;
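
Editor's note: the IVUsesByStride map above is keyed by SCEV pointers, and the comment explains why the separate StrideOrder vector exists; iterating a pointer-keyed map directly would make the pass's behavior depend on allocation addresses. A small self-contained sketch of that record-insertion-order pattern, with generic names rather than the actual IVUsers types:

#include <cstdio>
#include <map>
#include <vector>

struct Stride { int Value; };                  // stand-in for a const SCEV* key

int main() {
  std::map<const Stride *, int> UsesByStride;  // keyed by pointer: the map's
                                               // order follows the addresses
  std::vector<const Stride *> StrideOrder;     // remembers insertion order

  Stride A = {4}, B = {8}, C = {1};
  const Stride *Keys[] = { &A, &B, &C };
  for (unsigned i = 0; i != 3; ++i) {          // insert in a known order
    UsesByStride[Keys[i]] = Keys[i]->Value * 10;
    StrideOrder.push_back(Keys[i]);
  }

  // Iterate StrideOrder rather than the map itself, so the output does not
  // depend on where A, B and C happen to live in memory.
  for (size_t i = 0, e = StrideOrder.size(); i != e; ++i)
    std::printf("stride %d -> %d\n", StrideOrder[i]->Value,
                UsesByStride[StrideOrder[i]]);
  return 0;
}
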
diff --git a/include/llvm/Analysis/LoopVR.h b/include/llvm/Analysis/LoopVR.h
index 1d806f83aa92..36b62152f86b 100644
--- a/include/llvm/Analysis/LoopVR.h
+++ b/include/llvm/Analysis/LoopVR.h
@@ -78,9 +78,9 @@ public:
private:
ConstantRange compute(Value *V);
- ConstantRange getRange(SCEVHandle S, Loop *L, ScalarEvolution &SE);
+ ConstantRange getRange(const SCEV* S, Loop *L, ScalarEvolution &SE);
- ConstantRange getRange(SCEVHandle S, SCEVHandle T, ScalarEvolution &SE);
+ ConstantRange getRange(const SCEV* S, const SCEV* T, ScalarEvolution &SE);
std::map<Value *, ConstantRange *> Map;
};
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 8d5136cea2c3..1c1298a9a15a 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -32,10 +32,16 @@ namespace llvm {
class APInt;
class ConstantInt;
class Type;
- class SCEVHandle;
class ScalarEvolution;
class TargetData;
- template<> struct DenseMapInfo<SCEVHandle>;
+ class SCEVConstant;
+ class SCEVTruncateExpr;
+ class SCEVZeroExtendExpr;
+ class SCEVCommutativeExpr;
+ class SCEVUDivExpr;
+ class SCEVSignExtendExpr;
+ class SCEVAddRecExpr;
+ class SCEVUnknown;
/// SCEV - This class represents an analyzed expression in the program. These
/// are reference-counted opaque objects that the client is not allowed to
@@ -43,25 +49,14 @@ namespace llvm {
///
class SCEV {
const unsigned SCEVType; // The SCEV baseclass this node corresponds to
- mutable unsigned RefCount;
-
- friend class SCEVHandle;
- friend class DenseMapInfo<SCEVHandle>;
- void addRef() const { ++RefCount; }
- void dropRef() const {
- if (--RefCount == 0)
- delete this;
- }
-
- const ScalarEvolution* parent;
SCEV(const SCEV &); // DO NOT IMPLEMENT
void operator=(const SCEV &); // DO NOT IMPLEMENT
protected:
virtual ~SCEV();
public:
- explicit SCEV(unsigned SCEVTy, const ScalarEvolution* p) :
- SCEVType(SCEVTy), RefCount(0), parent(p) {}
+ explicit SCEV(unsigned SCEVTy) :
+ SCEVType(SCEVTy) {}
unsigned getSCEVType() const { return SCEVType; }
@@ -92,9 +87,9 @@ namespace llvm {
/// the same value, but which uses the concrete value Conc instead of the
/// symbolic value. If this SCEV does not use the symbolic value, it
/// returns itself.
- virtual SCEVHandle
- replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+ virtual const SCEV*
+ replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const = 0;
/// dominates - Return true if elements that makes up this SCEV dominates
@@ -129,17 +124,16 @@ namespace llvm {
/// None of the standard SCEV operations are valid on this class, it is just a
/// marker.
struct SCEVCouldNotCompute : public SCEV {
- SCEVCouldNotCompute(const ScalarEvolution* p);
- ~SCEVCouldNotCompute();
+ SCEVCouldNotCompute();
// None of these methods are valid for this object.
virtual bool isLoopInvariant(const Loop *L) const;
virtual const Type *getType() const;
virtual bool hasComputableLoopEvolution(const Loop *L) const;
virtual void print(raw_ostream &OS) const;
- virtual SCEVHandle
- replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+ virtual const SCEV*
+ replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const;
virtual bool dominates(BasicBlock *BB, DominatorTree *DT) const {
@@ -151,83 +145,6 @@ namespace llvm {
static bool classof(const SCEV *S);
};
- /// SCEVHandle - This class is used to maintain the SCEV object's refcounts,
- /// freeing the objects when the last reference is dropped.
- class SCEVHandle {
- const SCEV *S;
- SCEVHandle(); // DO NOT IMPLEMENT
- public:
- SCEVHandle(const SCEV *s) : S(s) {
- assert(S && "Cannot create a handle to a null SCEV!");
- S->addRef();
- }
- SCEVHandle(const SCEVHandle &RHS) : S(RHS.S) {
- S->addRef();
- }
- ~SCEVHandle() { S->dropRef(); }
-
- operator const SCEV*() const { return S; }
-
- const SCEV &operator*() const { return *S; }
- const SCEV *operator->() const { return S; }
-
- bool operator==(const SCEV *RHS) const { return S == RHS; }
- bool operator!=(const SCEV *RHS) const { return S != RHS; }
-
- const SCEVHandle &operator=(SCEV *RHS) {
- if (S != RHS) {
- S->dropRef();
- S = RHS;
- S->addRef();
- }
- return *this;
- }
-
- const SCEVHandle &operator=(const SCEVHandle &RHS) {
- if (S != RHS.S) {
- S->dropRef();
- S = RHS.S;
- S->addRef();
- }
- return *this;
- }
- };
-
- template<typename From> struct simplify_type;
- template<> struct simplify_type<const SCEVHandle> {
- typedef const SCEV* SimpleType;
- static SimpleType getSimplifiedValue(const SCEVHandle &Node) {
- return Node;
- }
- };
- template<> struct simplify_type<SCEVHandle>
- : public simplify_type<const SCEVHandle> {};
-
- // Specialize DenseMapInfo for SCEVHandle so that SCEVHandle may be used
- // as a key in DenseMaps.
- template<>
- struct DenseMapInfo<SCEVHandle> {
- static inline SCEVHandle getEmptyKey() {
- static SCEVCouldNotCompute Empty(0);
- if (Empty.RefCount == 0)
- Empty.addRef();
- return &Empty;
- }
- static inline SCEVHandle getTombstoneKey() {
- static SCEVCouldNotCompute Tombstone(0);
- if (Tombstone.RefCount == 0)
- Tombstone.addRef();
- return &Tombstone;
- }
- static unsigned getHashValue(const SCEVHandle &Val) {
- return DenseMapInfo<const SCEV *>::getHashValue(Val);
- }
- static bool isEqual(const SCEVHandle &LHS, const SCEVHandle &RHS) {
- return LHS == RHS;
- }
- static bool isPod() { return false; }
- };
-
/// ScalarEvolution - This class is the main scalar evolution driver. Because
/// client code (intentionally) can't do much with the SCEV objects directly,
/// they must ask this class for services.
@@ -260,11 +177,11 @@ namespace llvm {
/// CouldNotCompute - This SCEV is used to represent unknown trip
/// counts and things.
- SCEVHandle CouldNotCompute;
+ const SCEV* CouldNotCompute;
/// Scalars - This is a cache of the scalars we have analyzed so far.
///
- std::map<SCEVCallbackVH, SCEVHandle> Scalars;
+ std::map<SCEVCallbackVH, const SCEV*> Scalars;
/// BackedgeTakenInfo - Information about the backedge-taken count
/// of a loop. This currently inclues an exact count and a maximum count.
@@ -272,19 +189,16 @@ namespace llvm {
struct BackedgeTakenInfo {
/// Exact - An expression indicating the exact backedge-taken count of
/// the loop if it is known, or a SCEVCouldNotCompute otherwise.
- SCEVHandle Exact;
+ const SCEV* Exact;
/// Exact - An expression indicating the least maximum backedge-taken
/// count of the loop that is known, or a SCEVCouldNotCompute.
- SCEVHandle Max;
+ const SCEV* Max;
- /*implicit*/ BackedgeTakenInfo(SCEVHandle exact) :
+ /*implicit*/ BackedgeTakenInfo(const SCEV* exact) :
Exact(exact), Max(exact) {}
- /*implicit*/ BackedgeTakenInfo(const SCEV *exact) :
- Exact(exact), Max(exact) {}
-
- BackedgeTakenInfo(SCEVHandle exact, SCEVHandle max) :
+ BackedgeTakenInfo(const SCEV* exact, const SCEV* max) :
Exact(exact), Max(max) {}
/// hasAnyInfo - Test whether this BackedgeTakenInfo contains any
@@ -314,30 +228,30 @@ namespace llvm {
/// createSCEV - We know that there is no SCEV for the specified value.
/// Analyze the expression.
- SCEVHandle createSCEV(Value *V);
+ const SCEV* createSCEV(Value *V);
/// createNodeForPHI - Provide the special handling we need to analyze PHI
/// SCEVs.
- SCEVHandle createNodeForPHI(PHINode *PN);
+ const SCEV* createNodeForPHI(PHINode *PN);
/// createNodeForGEP - Provide the special handling we need to analyze GEP
/// SCEVs.
- SCEVHandle createNodeForGEP(User *GEP);
+ const SCEV* createNodeForGEP(User *GEP);
/// ReplaceSymbolicValueWithConcrete - This looks up the computed SCEV value
/// for the specified instruction and replaces any references to the
/// symbolic value SymName with the specified value. This is used during
/// PHI resolution.
void ReplaceSymbolicValueWithConcrete(Instruction *I,
- const SCEVHandle &SymName,
- const SCEVHandle &NewVal);
+ const SCEV* SymName,
+ const SCEV* NewVal);
/// getBECount - Subtract the end and start values and divide by the step,
/// rounding up, to get the number of times the backedge is executed. Return
/// CouldNotCompute if an intermediate computation overflows.
- SCEVHandle getBECount(const SCEVHandle &Start,
- const SCEVHandle &End,
- const SCEVHandle &Step);
+ const SCEV* getBECount(const SCEV* Start,
+ const SCEV* End,
+ const SCEV* Step);
/// getBackedgeTakenInfo - Return the BackedgeTakenInfo for the given
/// loop, lazily computing new values if the loop hasn't been analyzed
@@ -375,7 +289,7 @@ namespace llvm {
/// ComputeLoadConstantCompareBackedgeTakenCount - Given an exit condition
/// of 'icmp op load X, cst', try to see if we can compute the trip count.
- SCEVHandle
+ const SCEV*
ComputeLoadConstantCompareBackedgeTakenCount(LoadInst *LI,
Constant *RHS,
const Loop *L,
@@ -386,18 +300,18 @@ namespace llvm {
/// try to evaluate a few iterations of the loop until we get the exit
/// condition gets a value of ExitWhen (true or false). If we cannot
/// evaluate the trip count of the loop, return CouldNotCompute.
- SCEVHandle ComputeBackedgeTakenCountExhaustively(const Loop *L, Value *Cond,
+ const SCEV* ComputeBackedgeTakenCountExhaustively(const Loop *L, Value *Cond,
bool ExitWhen);
/// HowFarToZero - Return the number of times a backedge comparing the
/// specified value to zero will execute. If not computable, return
/// CouldNotCompute.
- SCEVHandle HowFarToZero(const SCEV *V, const Loop *L);
+ const SCEV* HowFarToZero(const SCEV *V, const Loop *L);
/// HowFarToNonZero - Return the number of times a backedge checking the
/// specified value for nonzero will execute. If not computable, return
/// CouldNotCompute.
- SCEVHandle HowFarToNonZero(const SCEV *V, const Loop *L);
+ const SCEV* HowFarToNonZero(const SCEV *V, const Loop *L);
/// HowManyLessThans - Return the number of times a backedge containing the
/// specified less-than comparison will execute. If not computable, return
@@ -449,109 +363,115 @@ namespace llvm {
/// getSCEV - Return a SCEV expression handle for the full generality of the
/// specified expression.
- SCEVHandle getSCEV(Value *V);
-
- SCEVHandle getConstant(ConstantInt *V);
- SCEVHandle getConstant(const APInt& Val);
- SCEVHandle getConstant(const Type *Ty, uint64_t V, bool isSigned = false);
- SCEVHandle getTruncateExpr(const SCEVHandle &Op, const Type *Ty);
- SCEVHandle getZeroExtendExpr(const SCEVHandle &Op, const Type *Ty);
- SCEVHandle getSignExtendExpr(const SCEVHandle &Op, const Type *Ty);
- SCEVHandle getAnyExtendExpr(const SCEVHandle &Op, const Type *Ty);
- SCEVHandle getAddExpr(SmallVectorImpl<SCEVHandle> &Ops);
- SCEVHandle getAddExpr(const SCEVHandle &LHS, const SCEVHandle &RHS) {
- SmallVector<SCEVHandle, 2> Ops;
+ const SCEV* getSCEV(Value *V);
+
+ const SCEV* getConstant(ConstantInt *V);
+ const SCEV* getConstant(const APInt& Val);
+ const SCEV* getConstant(const Type *Ty, uint64_t V, bool isSigned = false);
+ const SCEV* getTruncateExpr(const SCEV* Op, const Type *Ty);
+ const SCEV* getZeroExtendExpr(const SCEV* Op, const Type *Ty);
+ const SCEV* getSignExtendExpr(const SCEV* Op, const Type *Ty);
+ const SCEV* getAnyExtendExpr(const SCEV* Op, const Type *Ty);
+ const SCEV* getAddExpr(SmallVectorImpl<const SCEV*> &Ops);
+ const SCEV* getAddExpr(const SCEV* LHS, const SCEV* RHS) {
+ SmallVector<const SCEV*, 2> Ops;
Ops.push_back(LHS);
Ops.push_back(RHS);
return getAddExpr(Ops);
}
- SCEVHandle getAddExpr(const SCEVHandle &Op0, const SCEVHandle &Op1,
- const SCEVHandle &Op2) {
- SmallVector<SCEVHandle, 3> Ops;
+ const SCEV* getAddExpr(const SCEV* Op0, const SCEV* Op1,
+ const SCEV* Op2) {
+ SmallVector<const SCEV*, 3> Ops;
Ops.push_back(Op0);
Ops.push_back(Op1);
Ops.push_back(Op2);
return getAddExpr(Ops);
}
- SCEVHandle getMulExpr(SmallVectorImpl<SCEVHandle> &Ops);
- SCEVHandle getMulExpr(const SCEVHandle &LHS, const SCEVHandle &RHS) {
- SmallVector<SCEVHandle, 2> Ops;
+ const SCEV* getMulExpr(SmallVectorImpl<const SCEV*> &Ops);
+ const SCEV* getMulExpr(const SCEV* LHS, const SCEV* RHS) {
+ SmallVector<const SCEV*, 2> Ops;
Ops.push_back(LHS);
Ops.push_back(RHS);
return getMulExpr(Ops);
}
- SCEVHandle getUDivExpr(const SCEVHandle &LHS, const SCEVHandle &RHS);
- SCEVHandle getAddRecExpr(const SCEVHandle &Start, const SCEVHandle &Step,
+ const SCEV* getUDivExpr(const SCEV* LHS, const SCEV* RHS);
+ const SCEV* getAddRecExpr(const SCEV* Start, const SCEV* Step,
const Loop *L);
- SCEVHandle getAddRecExpr(SmallVectorImpl<SCEVHandle> &Operands,
+ const SCEV* getAddRecExpr(SmallVectorImpl<const SCEV*> &Operands,
const Loop *L);
- SCEVHandle getAddRecExpr(const SmallVectorImpl<SCEVHandle> &Operands,
+ const SCEV* getAddRecExpr(const SmallVectorImpl<const SCEV*> &Operands,
const Loop *L) {
- SmallVector<SCEVHandle, 4> NewOp(Operands.begin(), Operands.end());
+ SmallVector<const SCEV*, 4> NewOp(Operands.begin(), Operands.end());
return getAddRecExpr(NewOp, L);
}
- SCEVHandle getSMaxExpr(const SCEVHandle &LHS, const SCEVHandle &RHS);
- SCEVHandle getSMaxExpr(SmallVectorImpl<SCEVHandle> &Operands);
- SCEVHandle getUMaxExpr(const SCEVHandle &LHS, const SCEVHandle &RHS);
- SCEVHandle getUMaxExpr(SmallVectorImpl<SCEVHandle> &Operands);
- SCEVHandle getSMinExpr(const SCEVHandle &LHS, const SCEVHandle &RHS);
- SCEVHandle getUMinExpr(const SCEVHandle &LHS, const SCEVHandle &RHS);
- SCEVHandle getUnknown(Value *V);
- SCEVHandle getCouldNotCompute();
+ const SCEV* getSMaxExpr(const SCEV* LHS, const SCEV* RHS);
+ const SCEV* getSMaxExpr(SmallVectorImpl<const SCEV*> &Operands);
+ const SCEV* getUMaxExpr(const SCEV* LHS, const SCEV* RHS);
+ const SCEV* getUMaxExpr(SmallVectorImpl<const SCEV*> &Operands);
+ const SCEV* getSMinExpr(const SCEV* LHS, const SCEV* RHS);
+ const SCEV* getUMinExpr(const SCEV* LHS, const SCEV* RHS);
+ const SCEV* getUnknown(Value *V);
+ const SCEV* getCouldNotCompute();
/// getNegativeSCEV - Return the SCEV object corresponding to -V.
///
- SCEVHandle getNegativeSCEV(const SCEVHandle &V);
+ const SCEV* getNegativeSCEV(const SCEV* V);
/// getNotSCEV - Return the SCEV object corresponding to ~V.
///
- SCEVHandle getNotSCEV(const SCEVHandle &V);
+ const SCEV* getNotSCEV(const SCEV* V);
/// getMinusSCEV - Return LHS-RHS.
///
- SCEVHandle getMinusSCEV(const SCEVHandle &LHS,
- const SCEVHandle &RHS);
+ const SCEV* getMinusSCEV(const SCEV* LHS,
+ const SCEV* RHS);
/// getTruncateOrZeroExtend - Return a SCEV corresponding to a conversion
/// of the input value to the specified type. If the type must be
/// extended, it is zero extended.
- SCEVHandle getTruncateOrZeroExtend(const SCEVHandle &V, const Type *Ty);
+ const SCEV* getTruncateOrZeroExtend(const SCEV* V, const Type *Ty);
/// getTruncateOrSignExtend - Return a SCEV corresponding to a conversion
/// of the input value to the specified type. If the type must be
/// extended, it is sign extended.
- SCEVHandle getTruncateOrSignExtend(const SCEVHandle &V, const Type *Ty);
+ const SCEV* getTruncateOrSignExtend(const SCEV* V, const Type *Ty);
/// getNoopOrZeroExtend - Return a SCEV corresponding to a conversion of
/// the input value to the specified type. If the type must be extended,
/// it is zero extended. The conversion must not be narrowing.
- SCEVHandle getNoopOrZeroExtend(const SCEVHandle &V, const Type *Ty);
+ const SCEV* getNoopOrZeroExtend(const SCEV* V, const Type *Ty);
/// getNoopOrSignExtend - Return a SCEV corresponding to a conversion of
/// the input value to the specified type. If the type must be extended,
/// it is sign extended. The conversion must not be narrowing.
- SCEVHandle getNoopOrSignExtend(const SCEVHandle &V, const Type *Ty);
+ const SCEV* getNoopOrSignExtend(const SCEV* V, const Type *Ty);
/// getNoopOrAnyExtend - Return a SCEV corresponding to a conversion of
/// the input value to the specified type. If the type must be extended,
/// it is extended with unspecified bits. The conversion must not be
/// narrowing.
- SCEVHandle getNoopOrAnyExtend(const SCEVHandle &V, const Type *Ty);
+ const SCEV* getNoopOrAnyExtend(const SCEV* V, const Type *Ty);
/// getTruncateOrNoop - Return a SCEV corresponding to a conversion of the
/// input value to the specified type. The conversion must not be
/// widening.
- SCEVHandle getTruncateOrNoop(const SCEVHandle &V, const Type *Ty);
+ const SCEV* getTruncateOrNoop(const SCEV* V, const Type *Ty);
/// getIntegerSCEV - Given an integer or FP type, create a constant for the
/// specified signed integer value and return a SCEV for the constant.
- SCEVHandle getIntegerSCEV(int Val, const Type *Ty);
+ const SCEV* getIntegerSCEV(int Val, const Type *Ty);
/// getUMaxFromMismatchedTypes - Promote the operands to the wider of
/// the types using zero-extension, and then perform a umax operation
/// with them.
- SCEVHandle getUMaxFromMismatchedTypes(const SCEVHandle &LHS,
- const SCEVHandle &RHS);
+ const SCEV* getUMaxFromMismatchedTypes(const SCEV* LHS,
+ const SCEV* RHS);
+
+ /// getUMinFromMismatchedTypes - Promote the operands to the wider of
+ /// the types using zero-extension, and then perform a umin operation
+ /// with them.
+ const SCEV* getUMinFromMismatchedTypes(const SCEV* LHS,
+ const SCEV* RHS);
/// hasSCEV - Return true if the SCEV for this value has already been
/// computed.
@@ -559,7 +479,7 @@ namespace llvm {
/// setSCEV - Insert the specified SCEV into the map of current SCEVs for
/// the specified value.
- void setSCEV(Value *V, const SCEVHandle &H);
+ void setSCEV(Value *V, const SCEV* H);
/// getSCEVAtScope - Return a SCEV expression handle for the specified value
/// at the specified scope in the program. The L value specifies a loop
@@ -571,11 +491,11 @@ namespace llvm {
///
/// In the case that a relevant loop exit value cannot be computed, the
/// original value V is returned.
- SCEVHandle getSCEVAtScope(const SCEV *S, const Loop *L);
+ const SCEV* getSCEVAtScope(const SCEV *S, const Loop *L);
/// getSCEVAtScope - This is a convenience function which does
/// getSCEVAtScope(getSCEV(V), L).
- SCEVHandle getSCEVAtScope(Value *V, const Loop *L);
+ const SCEV* getSCEVAtScope(Value *V, const Loop *L);
/// isLoopGuardedByCond - Test whether entry to the loop is protected by
/// a conditional between LHS and RHS. This is used to help avoid max
@@ -594,12 +514,12 @@ namespace llvm {
/// loop-invariant backedge-taken count (see
/// hasLoopInvariantBackedgeTakenCount).
///
- SCEVHandle getBackedgeTakenCount(const Loop *L);
+ const SCEV* getBackedgeTakenCount(const Loop *L);
/// getMaxBackedgeTakenCount - Similar to getBackedgeTakenCount, except
/// return the least SCEV value that is known never to be less than the
/// actual backedge taken count.
- SCEVHandle getMaxBackedgeTakenCount(const Loop *L);
+ const SCEV* getMaxBackedgeTakenCount(const Loop *L);
/// hasLoopInvariantBackedgeTakenCount - Return true if the specified loop
/// has an analyzable loop-invariant backedge-taken count.
@@ -615,15 +535,15 @@ namespace llvm {
/// guaranteed to end in (at every loop iteration). It is, at the same time,
/// the minimum number of times S is divisible by 2. For example, given {4,+,8}
/// it returns 2. If S is guaranteed to be 0, it returns the bitwidth of S.
- uint32_t GetMinTrailingZeros(const SCEVHandle &S);
+ uint32_t GetMinTrailingZeros(const SCEV* S);
/// GetMinLeadingZeros - Determine the minimum number of zero bits that S is
/// guaranteed to begin with (at every loop iteration).
- uint32_t GetMinLeadingZeros(const SCEVHandle &S);
+ uint32_t GetMinLeadingZeros(const SCEV* S);
/// GetMinSignBits - Determine the minimum number of sign bits that S is
/// guaranteed to begin with.
- uint32_t GetMinSignBits(const SCEVHandle &S);
+ uint32_t GetMinSignBits(const SCEV* S);
virtual bool runOnFunction(Function &F);
virtual void releaseMemory();
@@ -633,6 +553,23 @@ namespace llvm {
void print(std::ostream *OS, const Module* M = 0) const {
if (OS) print(*OS, M);
}
+
+ private:
+ // Uniquing tables.
+ std::map<ConstantInt*, SCEVConstant*> SCEVConstants;
+ std::map<std::pair<const SCEV*, const Type*>,
+ SCEVTruncateExpr*> SCEVTruncates;
+ std::map<std::pair<const SCEV*, const Type*>,
+ SCEVZeroExtendExpr*> SCEVZeroExtends;
+ std::map<std::pair<unsigned, std::vector<const SCEV*> >,
+ SCEVCommutativeExpr*> SCEVCommExprs;
+ std::map<std::pair<const SCEV*, const SCEV*>,
+ SCEVUDivExpr*> SCEVUDivs;
+ std::map<std::pair<const SCEV*, const Type*>,
+ SCEVSignExtendExpr*> SCEVSignExtends;
+ std::map<std::pair<const Loop *, std::vector<const SCEV*> >,
+ SCEVAddRecExpr*> SCEVAddRecExprs;
+ std::map<Value*, SCEVUnknown*> SCEVUnknowns;
};
}
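
Editor's note: the private uniquing tables added at the end of ScalarEvolution above are what make the removal of SCEVHandle's reference counting possible. Each expression kind is created once per distinct set of operands, cached, and handed out as a plain const SCEV*, so structural equality becomes pointer equality. A minimal sketch of that get-or-create pattern, using a hypothetical UniqueConstant node rather than the real SCEVConstant:

#include <cstdio>
#include <map>

// Hypothetical immutable expression node, standing in for SCEVConstant.
struct UniqueConstant {
  int Value;
  explicit UniqueConstant(int V) : Value(V) {}
};

// One table per expression kind; it owns the nodes, so clients only ever
// hold plain const pointers and no reference counting is needed.
static std::map<int, UniqueConstant *> ConstantTable;

static const UniqueConstant *getConstant(int V) {
  UniqueConstant *&Slot = ConstantTable[V];    // get-or-create slot
  if (!Slot)
    Slot = new UniqueConstant(V);              // created once, reused later
  return Slot;
}

int main() {
  const UniqueConstant *A = getConstant(42);
  const UniqueConstant *B = getConstant(42);
  // Uniquing turns structural equality into pointer equality.
  std::printf("same node: %s\n", A == B ? "yes" : "no");   // prints "yes"
  // Nodes are leaked here for brevity; the owning analysis would delete
  // them when it is torn down.
  return 0;
}
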
diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h
index b40fbf06f9c8..730c97fff4d7 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpander.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpander.h
@@ -28,7 +28,7 @@ namespace llvm {
/// memory.
struct SCEVExpander : public SCEVVisitor<SCEVExpander, Value*> {
ScalarEvolution &SE;
- std::map<SCEVHandle, AssertingVH<Value> > InsertedExpressions;
+ std::map<const SCEV*, AssertingVH<Value> > InsertedExpressions;
std::set<Value*> InsertedValues;
BasicBlock::iterator InsertPt;
@@ -77,12 +77,12 @@ namespace llvm {
/// expression into the program. The inserted code is inserted into the
/// SCEVExpander's current insertion point. If a type is specified, the
/// result will be expanded to have that type, with a cast if necessary.
- Value *expandCodeFor(SCEVHandle SH, const Type *Ty = 0);
+ Value *expandCodeFor(const SCEV* SH, const Type *Ty = 0);
/// expandCodeFor - Insert code to directly compute the specified SCEV
/// expression into the program. The inserted code is inserted into the
/// specified block.
- Value *expandCodeFor(SCEVHandle SH, const Type *Ty,
+ Value *expandCodeFor(const SCEV* SH, const Type *Ty,
BasicBlock::iterator IP) {
setInsertionPoint(IP);
return expandCodeFor(SH, Ty);
@@ -105,7 +105,8 @@ namespace llvm {
private:
/// expandAddToGEP - Expand a SCEVAddExpr with a pointer type into a GEP
/// instead of using ptrtoint+arithmetic+inttoptr.
- Value *expandAddToGEP(const SCEVHandle *op_begin, const SCEVHandle *op_end,
+ Value *expandAddToGEP(const SCEV* const *op_begin,
+ const SCEV* const *op_end,
const PointerType *PTy, const Type *Ty, Value *V);
Value *expand(const SCEV *S);
diff --git a/include/llvm/Analysis/ScalarEvolutionExpressions.h b/include/llvm/Analysis/ScalarEvolutionExpressions.h
index 28423569d2e3..8be1a934bc13 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpressions.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpressions.h
@@ -36,10 +36,8 @@ namespace llvm {
friend class ScalarEvolution;
ConstantInt *V;
- explicit SCEVConstant(ConstantInt *v, const ScalarEvolution* p) :
- SCEV(scConstant, p), V(v) {}
-
- virtual ~SCEVConstant();
+ explicit SCEVConstant(ConstantInt *v) :
+ SCEV(scConstant), V(v) {}
public:
ConstantInt *getValue() const { return V; }
@@ -53,8 +51,8 @@ namespace llvm {
virtual const Type *getType() const;
- SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+ const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const {
return this;
}
@@ -77,15 +75,13 @@ namespace llvm {
///
class SCEVCastExpr : public SCEV {
protected:
- SCEVHandle Op;
+ const SCEV* Op;
const Type *Ty;
- SCEVCastExpr(unsigned SCEVTy, const SCEVHandle &op, const Type *ty,
- const ScalarEvolution* p);
- virtual ~SCEVCastExpr();
+ SCEVCastExpr(unsigned SCEVTy, const SCEV* op, const Type *ty);
public:
- const SCEVHandle &getOperand() const { return Op; }
+ const SCEV* getOperand() const { return Op; }
virtual const Type *getType() const { return Ty; }
virtual bool isLoopInvariant(const Loop *L) const {
@@ -114,15 +110,13 @@ namespace llvm {
class SCEVTruncateExpr : public SCEVCastExpr {
friend class ScalarEvolution;
- SCEVTruncateExpr(const SCEVHandle &op, const Type *ty,
- const ScalarEvolution* p);
- virtual ~SCEVTruncateExpr();
+ SCEVTruncateExpr(const SCEV* op, const Type *ty);
public:
- SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+ const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const {
- SCEVHandle H = Op->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
+ const SCEV* H = Op->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
if (H == Op)
return this;
return SE.getTruncateExpr(H, Ty);
@@ -144,15 +138,13 @@ namespace llvm {
class SCEVZeroExtendExpr : public SCEVCastExpr {
friend class ScalarEvolution;
- SCEVZeroExtendExpr(const SCEVHandle &op, const Type *ty,
- const ScalarEvolution* p);
- virtual ~SCEVZeroExtendExpr();
+ SCEVZeroExtendExpr(const SCEV* op, const Type *ty);
public:
- SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+ const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const {
- SCEVHandle H = Op->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
+ const SCEV* H = Op->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
if (H == Op)
return this;
return SE.getZeroExtendExpr(H, Ty);
@@ -174,15 +166,13 @@ namespace llvm {
class SCEVSignExtendExpr : public SCEVCastExpr {
friend class ScalarEvolution;
- SCEVSignExtendExpr(const SCEVHandle &op, const Type *ty,
- const ScalarEvolution* p);
- virtual ~SCEVSignExtendExpr();
+ SCEVSignExtendExpr(const SCEV* op, const Type *ty);
public:
- SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+ const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const {
- SCEVHandle H = Op->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
+ const SCEV* H = Op->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
if (H == Op)
return this;
return SE.getSignExtendExpr(H, Ty);
@@ -204,22 +194,20 @@ namespace llvm {
///
class SCEVNAryExpr : public SCEV {
protected:
- SmallVector<SCEVHandle, 8> Operands;
+ SmallVector<const SCEV*, 8> Operands;
- SCEVNAryExpr(enum SCEVTypes T, const SmallVectorImpl<SCEVHandle> &ops,
- const ScalarEvolution* p)
- : SCEV(T, p), Operands(ops.begin(), ops.end()) {}
- virtual ~SCEVNAryExpr() {}
+ SCEVNAryExpr(enum SCEVTypes T, const SmallVectorImpl<const SCEV*> &ops)
+ : SCEV(T), Operands(ops.begin(), ops.end()) {}
public:
unsigned getNumOperands() const { return (unsigned)Operands.size(); }
- const SCEVHandle &getOperand(unsigned i) const {
+ const SCEV* getOperand(unsigned i) const {
assert(i < Operands.size() && "Operand index out of range!");
return Operands[i];
}
- const SmallVectorImpl<SCEVHandle> &getOperands() const { return Operands; }
- typedef SmallVectorImpl<SCEVHandle>::const_iterator op_iterator;
+ const SmallVectorImpl<const SCEV*> &getOperands() const { return Operands; }
+ typedef SmallVectorImpl<const SCEV*>::const_iterator op_iterator;
op_iterator op_begin() const { return Operands.begin(); }
op_iterator op_end() const { return Operands.end(); }
@@ -266,14 +254,12 @@ namespace llvm {
class SCEVCommutativeExpr : public SCEVNAryExpr {
protected:
SCEVCommutativeExpr(enum SCEVTypes T,
- const SmallVectorImpl<SCEVHandle> &ops,
- const ScalarEvolution* p)
- : SCEVNAryExpr(T, ops, p) {}
- ~SCEVCommutativeExpr();
+ const SmallVectorImpl<const SCEV*> &ops)
+ : SCEVNAryExpr(T, ops) {}
public:
- SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+ const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const;
virtual const char *getOperationStr() const = 0;
@@ -297,9 +283,8 @@ namespace llvm {
class SCEVAddExpr : public SCEVCommutativeExpr {
friend class ScalarEvolution;
- explicit SCEVAddExpr(const SmallVectorImpl<SCEVHandle> &ops,
- const ScalarEvolution* p)
- : SCEVCommutativeExpr(scAddExpr, ops, p) {
+ explicit SCEVAddExpr(const SmallVectorImpl<const SCEV*> &ops)
+ : SCEVCommutativeExpr(scAddExpr, ops) {
}
public:
@@ -318,9 +303,8 @@ namespace llvm {
class SCEVMulExpr : public SCEVCommutativeExpr {
friend class ScalarEvolution;
- explicit SCEVMulExpr(const SmallVectorImpl<SCEVHandle> &ops,
- const ScalarEvolution* p)
- : SCEVCommutativeExpr(scMulExpr, ops, p) {
+ explicit SCEVMulExpr(const SmallVectorImpl<const SCEV*> &ops)
+ : SCEVCommutativeExpr(scMulExpr, ops) {
}
public:
@@ -340,15 +324,14 @@ namespace llvm {
class SCEVUDivExpr : public SCEV {
friend class ScalarEvolution;
- SCEVHandle LHS, RHS;
- SCEVUDivExpr(const SCEVHandle &lhs, const SCEVHandle &rhs,
- const ScalarEvolution* p)
- : SCEV(scUDivExpr, p), LHS(lhs), RHS(rhs) {}
+ const SCEV* LHS;
+ const SCEV* RHS;
+ SCEVUDivExpr(const SCEV* lhs, const SCEV* rhs)
+ : SCEV(scUDivExpr), LHS(lhs), RHS(rhs) {}
- virtual ~SCEVUDivExpr();
public:
- const SCEVHandle &getLHS() const { return LHS; }
- const SCEVHandle &getRHS() const { return RHS; }
+ const SCEV* getLHS() const { return LHS; }
+ const SCEV* getRHS() const { return RHS; }
virtual bool isLoopInvariant(const Loop *L) const {
return LHS->isLoopInvariant(L) && RHS->isLoopInvariant(L);
@@ -359,11 +342,11 @@ namespace llvm {
RHS->hasComputableLoopEvolution(L);
}
- SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+ const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const {
- SCEVHandle L = LHS->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
- SCEVHandle R = RHS->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
+ const SCEV* L = LHS->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
+ const SCEV* R = RHS->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
if (L == LHS && R == RHS)
return this;
else
@@ -398,25 +381,23 @@ namespace llvm {
const Loop *L;
- SCEVAddRecExpr(const SmallVectorImpl<SCEVHandle> &ops, const Loop *l,
- const ScalarEvolution* p)
- : SCEVNAryExpr(scAddRecExpr, ops, p), L(l) {
+ SCEVAddRecExpr(const SmallVectorImpl<const SCEV*> &ops, const Loop *l)
+ : SCEVNAryExpr(scAddRecExpr, ops), L(l) {
for (size_t i = 0, e = Operands.size(); i != e; ++i)
assert(Operands[i]->isLoopInvariant(l) &&
"Operands of AddRec must be loop-invariant!");
}
- ~SCEVAddRecExpr();
public:
- const SCEVHandle &getStart() const { return Operands[0]; }
+ const SCEV* getStart() const { return Operands[0]; }
const Loop *getLoop() const { return L; }
/// getStepRecurrence - This method constructs and returns the recurrence
/// indicating how much this expression steps by. If this is a polynomial
/// of degree N, it returns a chrec of degree N-1.
- SCEVHandle getStepRecurrence(ScalarEvolution &SE) const {
+ const SCEV* getStepRecurrence(ScalarEvolution &SE) const {
if (isAffine()) return getOperand(1);
- return SE.getAddRecExpr(SmallVector<SCEVHandle, 3>(op_begin()+1,op_end()),
+ return SE.getAddRecExpr(SmallVector<const SCEV*, 3>(op_begin()+1,op_end()),
getLoop());
}
@@ -444,7 +425,7 @@ namespace llvm {
/// evaluateAtIteration - Return the value of this chain of recurrences at
/// the specified iteration number.
- SCEVHandle evaluateAtIteration(SCEVHandle It, ScalarEvolution &SE) const;
+ const SCEV* evaluateAtIteration(const SCEV* It, ScalarEvolution &SE) const;
/// getNumIterationsInRange - Return the number of iterations of this loop
/// that produce values in the specified constant range. Another way of
@@ -452,11 +433,11 @@ namespace llvm {
/// value is not in the condition, thus computing the exit count. If the
/// iteration count can't be computed, an instance of SCEVCouldNotCompute is
/// returned.
- SCEVHandle getNumIterationsInRange(ConstantRange Range,
+ const SCEV* getNumIterationsInRange(ConstantRange Range,
ScalarEvolution &SE) const;
- SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+ const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const;
virtual void print(raw_ostream &OS) const;
@@ -475,9 +456,8 @@ namespace llvm {
class SCEVSMaxExpr : public SCEVCommutativeExpr {
friend class ScalarEvolution;
- explicit SCEVSMaxExpr(const SmallVectorImpl<SCEVHandle> &ops,
- const ScalarEvolution* p)
- : SCEVCommutativeExpr(scSMaxExpr, ops, p) {
+ explicit SCEVSMaxExpr(const SmallVectorImpl<const SCEV*> &ops)
+ : SCEVCommutativeExpr(scSMaxExpr, ops) {
}
public:
@@ -497,9 +477,8 @@ namespace llvm {
class SCEVUMaxExpr : public SCEVCommutativeExpr {
friend class ScalarEvolution;
- explicit SCEVUMaxExpr(const SmallVectorImpl<SCEVHandle> &ops,
- const ScalarEvolution* p)
- : SCEVCommutativeExpr(scUMaxExpr, ops, p) {
+ explicit SCEVUMaxExpr(const SmallVectorImpl<const SCEV*> &ops)
+ : SCEVCommutativeExpr(scUMaxExpr, ops) {
}
public:
@@ -522,11 +501,9 @@ namespace llvm {
friend class ScalarEvolution;
Value *V;
- explicit SCEVUnknown(Value *v, const ScalarEvolution* p) :
- SCEV(scUnknown, p), V(v) {}
-
- protected:
- ~SCEVUnknown();
+ explicit SCEVUnknown(Value *v) :
+ SCEV(scUnknown), V(v) {}
+
public:
Value *getValue() const { return V; }
@@ -535,8 +512,8 @@ namespace llvm {
return false; // not computable
}
- SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+ const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const {
if (&*Sym == this) return Conc;
return this;
diff --git a/include/llvm/CodeGen/BinaryObject.h b/include/llvm/CodeGen/BinaryObject.h
index 0780cd6ab4f4..4b66fe85678c 100644
--- a/include/llvm/CodeGen/BinaryObject.h
+++ b/include/llvm/CodeGen/BinaryObject.h
@@ -61,6 +61,11 @@ public:
return Relocations;
}
+ /// hasRelocations - Return true if 'Relocations' is not empty
+ bool hasRelocations() const {
+ return !Relocations.empty();
+ }
+
/// emitByte - This callback is invoked when a byte needs to be
/// written to the data stream.
inline void emitByte(uint8_t B) {
@@ -317,6 +322,7 @@ public:
void addRelocation(const MachineRelocation& relocation) {
Relocations.push_back(relocation);
}
+
};
} // end namespace llvm
diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td
index bce3ce098f1b..5ed2f7734a75 100644
--- a/include/llvm/Intrinsics.td
+++ b/include/llvm/Intrinsics.td
@@ -116,6 +116,7 @@ def llvm_v2i64_ty : LLVMType<v2i64>; // 2 x i64
def llvm_v2i32_ty : LLVMType<v2i32>; // 2 x i32
def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64
def llvm_v4i32_ty : LLVMType<v4i32>; // 4 x i32
+def llvm_v2f32_ty : LLVMType<v2f32>; // 2 x float
def llvm_v4f32_ty : LLVMType<v4f32>; // 4 x float
def llvm_v2f64_ty : LLVMType<v2f64>; // 2 x double
diff --git a/include/llvm/IntrinsicsARM.td b/include/llvm/IntrinsicsARM.td
index e574938c72e3..a73dc458025b 100644
--- a/include/llvm/IntrinsicsARM.td
+++ b/include/llvm/IntrinsicsARM.td
@@ -19,3 +19,298 @@ let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
def int_arm_thread_pointer : GCCBuiltin<"__builtin_thread_pointer">,
Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
}
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD (NEON)
+
+let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
+
+ // The following classes do not correspond directly to GCC builtins.
+ class Neon_1Arg_Intrinsic
+ : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ class Neon_1Arg_Float_Intrinsic
+ : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ class Neon_1Arg_Narrow_Intrinsic
+ : Intrinsic<[llvm_anyint_ty],
+ [LLVMExtendedElementVectorType<0>], [IntrNoMem]>;
+ class Neon_1Arg_Long_Intrinsic
+ : Intrinsic<[llvm_anyint_ty],
+ [LLVMTruncatedElementVectorType<0>], [IntrNoMem]>;
+ class Neon_2Arg_Intrinsic
+ : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+ class Neon_2Arg_Float_Intrinsic
+ : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+ class Neon_2Arg_Narrow_Intrinsic
+ : Intrinsic<[llvm_anyint_ty],
+ [LLVMExtendedElementVectorType<0>,
+ LLVMExtendedElementVectorType<0>],
+ [IntrNoMem]>;
+ class Neon_2Arg_Long_Intrinsic
+ : Intrinsic<[llvm_anyint_ty],
+ [LLVMTruncatedElementVectorType<0>,
+ LLVMTruncatedElementVectorType<0>],
+ [IntrNoMem]>;
+ class Neon_2Arg_Wide_Intrinsic
+ : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMTruncatedElementVectorType<0>],
+ [IntrNoMem]>;
+ class Neon_3Arg_Intrinsic
+ : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+ class Neon_3Arg_Long_Intrinsic
+ : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>,
+ LLVMTruncatedElementVectorType<0>,
+ LLVMTruncatedElementVectorType<0>],
+ [IntrNoMem]>;
+ class Neon_CvtFxToFP_Intrinsic
+ : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+ class Neon_CvtFPToFx_Intrinsic
+ : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>;
+}
+
+// Arithmetic ops
+
+let Properties = [IntrNoMem, Commutative] in {
+
+ // Vector Add.
+ def int_arm_neon_vhadds : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vhaddu : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vrhadds : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vrhaddu : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vqadds : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vaddhn : Neon_2Arg_Narrow_Intrinsic;
+ def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic;
+ def int_arm_neon_vaddls : Neon_2Arg_Long_Intrinsic;
+ def int_arm_neon_vaddlu : Neon_2Arg_Long_Intrinsic;
+ def int_arm_neon_vaddws : Neon_2Arg_Wide_Intrinsic;
+ def int_arm_neon_vaddwu : Neon_2Arg_Wide_Intrinsic;
+
+ // Vector Multiply.
+ def int_arm_neon_vmulp : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic;
+ def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic;
+ def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic;
+ def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic;
+
+ // Vector Multiply and Accumulate/Subtract.
+ def int_arm_neon_vmlals : Neon_3Arg_Long_Intrinsic;
+ def int_arm_neon_vmlalu : Neon_3Arg_Long_Intrinsic;
+ def int_arm_neon_vmlsls : Neon_3Arg_Long_Intrinsic;
+ def int_arm_neon_vmlslu : Neon_3Arg_Long_Intrinsic;
+ def int_arm_neon_vqdmlal : Neon_3Arg_Long_Intrinsic;
+ def int_arm_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;
+
+ // Vector Maximum.
+ def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vmaxf : Neon_2Arg_Float_Intrinsic;
+
+ // Vector Minimum.
+ def int_arm_neon_vmins : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vminu : Neon_2Arg_Intrinsic;
+ def int_arm_neon_vminf : Neon_2Arg_Float_Intrinsic;
+
+ // Vector Reciprocal Step.
+ def int_arm_neon_vrecps : Neon_2Arg_Float_Intrinsic;
+
+ // Vector Reciprocal Square Root Step.
+ def int_arm_neon_vrsqrts : Neon_2Arg_Float_Intrinsic;
+}
+
+// Vector Subtract.
+def int_arm_neon_vhsubs : Neon_2Arg_Intrinsic;
+def int_arm_neon_vhsubu : Neon_2Arg_Intrinsic;
+def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic;
+def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic;
+def int_arm_neon_vsubhn : Neon_2Arg_Narrow_Intrinsic;
+def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic;
+def int_arm_neon_vsubls : Neon_2Arg_Long_Intrinsic;
+def int_arm_neon_vsublu : Neon_2Arg_Long_Intrinsic;
+def int_arm_neon_vsubws : Neon_2Arg_Wide_Intrinsic;
+def int_arm_neon_vsubwu : Neon_2Arg_Wide_Intrinsic;
+
+// Vector Absolute Compare.
+let TargetPrefix = "arm" in {
+ def int_arm_neon_vacged : Intrinsic<[llvm_v2i32_ty],
+ [llvm_v2f32_ty, llvm_v2f32_ty],
+ [IntrNoMem]>;
+ def int_arm_neon_vacgeq : Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+ def int_arm_neon_vacgtd : Intrinsic<[llvm_v2i32_ty],
+ [llvm_v2f32_ty, llvm_v2f32_ty],
+ [IntrNoMem]>;
+ def int_arm_neon_vacgtq : Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+}
+
+// Vector Absolute Differences.
+def int_arm_neon_vabds : Neon_2Arg_Intrinsic;
+def int_arm_neon_vabdu : Neon_2Arg_Intrinsic;
+def int_arm_neon_vabdf : Neon_2Arg_Float_Intrinsic;
+def int_arm_neon_vabdls : Neon_2Arg_Long_Intrinsic;
+def int_arm_neon_vabdlu : Neon_2Arg_Long_Intrinsic;
+
+// Vector Absolute Difference and Accumulate.
+def int_arm_neon_vabas : Neon_3Arg_Intrinsic;
+def int_arm_neon_vabau : Neon_3Arg_Intrinsic;
+def int_arm_neon_vabals : Neon_3Arg_Long_Intrinsic;
+def int_arm_neon_vabalu : Neon_3Arg_Long_Intrinsic;
+
+// Vector Pairwise Add.
+def int_arm_neon_vpaddi : Neon_2Arg_Intrinsic;
+def int_arm_neon_vpaddf : Neon_2Arg_Float_Intrinsic;
+
+// Vector Pairwise Add Long.
+// Note: This is different than the other "long" NEON intrinsics because
+// the result vector has half as many elements as the source vector.
+// The source and destination vector types must be specified separately.
+let TargetPrefix = "arm" in {
+ def int_arm_neon_vpaddls : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
+ [IntrNoMem]>;
+ def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
+ [IntrNoMem]>;
+}
+
+// Vector Pairwise Add and Accumulate Long.
+// Note: This is similar to vpaddl but the destination vector also appears
+// as the first argument.
+let TargetPrefix = "arm" in {
+ def int_arm_neon_vpadals : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, llvm_anyint_ty],
+ [IntrNoMem]>;
+ def int_arm_neon_vpadalu : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, llvm_anyint_ty],
+ [IntrNoMem]>;
+}
+
+// Vector Pairwise Maximum and Minimum.
+def int_arm_neon_vpmaxs : Neon_2Arg_Intrinsic;
+def int_arm_neon_vpmaxu : Neon_2Arg_Intrinsic;
+def int_arm_neon_vpmaxf : Neon_2Arg_Float_Intrinsic;
+def int_arm_neon_vpmins : Neon_2Arg_Intrinsic;
+def int_arm_neon_vpminu : Neon_2Arg_Intrinsic;
+def int_arm_neon_vpminf : Neon_2Arg_Float_Intrinsic;
+
+// Vector Shifts:
+//
+// The various saturating and rounding vector shift operations need to be
+// represented by intrinsics in LLVM, and even the basic VSHL variable shift
+// operation cannot be safely translated to LLVM's shift operators. VSHL can
+// be used for both left and right shifts, or even combinations of the two,
+// depending on the signs of the shift amounts. It also has well-defined
+// behavior for shift amounts that LLVM leaves undefined. Only basic shifts
+// by constants can be represented with LLVM's shift operators.
+//
+// The shift counts for these intrinsics are always vectors, even for constant
+// shifts, where the constant is replicated. For consistency with VSHL (and
+// other variable shift instructions), left shifts have positive shift counts
+// and right shifts have negative shift counts. This convention is also used
+// for constant right shift intrinsics, and to help preserve sanity, the
+// intrinsic names use "shift" instead of either "shl" or "shr". Where
+// applicable, signed and unsigned versions of the intrinsics are
+// distinguished with "s" and "u" suffixes. A few NEON shift instructions,
+// such as VQSHLU, take signed operands but produce unsigned results; these
+// use a "su" suffix.
+
+// Vector Shift.
+def int_arm_neon_vshifts : Neon_2Arg_Intrinsic;
+def int_arm_neon_vshiftu : Neon_2Arg_Intrinsic;
+def int_arm_neon_vshiftls : Neon_2Arg_Long_Intrinsic;
+def int_arm_neon_vshiftlu : Neon_2Arg_Long_Intrinsic;
+def int_arm_neon_vshiftn : Neon_2Arg_Narrow_Intrinsic;
+
+// Vector Rounding Shift.
+def int_arm_neon_vrshifts : Neon_2Arg_Intrinsic;
+def int_arm_neon_vrshiftu : Neon_2Arg_Intrinsic;
+def int_arm_neon_vrshiftn : Neon_2Arg_Narrow_Intrinsic;
+
+// Vector Saturating Shift.
+def int_arm_neon_vqshifts : Neon_2Arg_Intrinsic;
+def int_arm_neon_vqshiftu : Neon_2Arg_Intrinsic;
+def int_arm_neon_vqshiftsu : Neon_2Arg_Intrinsic;
+def int_arm_neon_vqshiftns : Neon_2Arg_Narrow_Intrinsic;
+def int_arm_neon_vqshiftnu : Neon_2Arg_Narrow_Intrinsic;
+def int_arm_neon_vqshiftnsu : Neon_2Arg_Narrow_Intrinsic;
+
+// Vector Saturating Rounding Shift.
+def int_arm_neon_vqrshifts : Neon_2Arg_Intrinsic;
+def int_arm_neon_vqrshiftu : Neon_2Arg_Intrinsic;
+def int_arm_neon_vqrshiftns : Neon_2Arg_Narrow_Intrinsic;
+def int_arm_neon_vqrshiftnu : Neon_2Arg_Narrow_Intrinsic;
+def int_arm_neon_vqrshiftnsu : Neon_2Arg_Narrow_Intrinsic;
+
+// Vector Shift and Insert.
+def int_arm_neon_vshiftins : Neon_3Arg_Intrinsic;
+
+// Vector Absolute Value and Saturating Absolute Value.
+def int_arm_neon_vabs : Neon_1Arg_Intrinsic;
+def int_arm_neon_vabsf : Neon_1Arg_Float_Intrinsic;
+def int_arm_neon_vqabs : Neon_1Arg_Intrinsic;
+
+// Vector Saturating Negate.
+def int_arm_neon_vqneg : Neon_1Arg_Intrinsic;
+
+// Vector Count Leading Sign/Zero Bits.
+def int_arm_neon_vcls : Neon_1Arg_Intrinsic;
+def int_arm_neon_vclz : Neon_1Arg_Intrinsic;
+
+// Vector Count One Bits.
+def int_arm_neon_vcnt : Neon_1Arg_Intrinsic;
+
+// Vector Reciprocal Estimate.
+def int_arm_neon_vrecpe : Neon_1Arg_Intrinsic;
+def int_arm_neon_vrecpef : Neon_1Arg_Float_Intrinsic;
+
+// Vector Reciprocal Square Root Estimate.
+def int_arm_neon_vrsqrte : Neon_1Arg_Intrinsic;
+def int_arm_neon_vrsqrtef : Neon_1Arg_Float_Intrinsic;
+
+// Vector Conversions Between Floating-point and Fixed-point.
+def int_arm_neon_vcvtfp2fxs : Neon_CvtFPToFx_Intrinsic;
+def int_arm_neon_vcvtfp2fxu : Neon_CvtFPToFx_Intrinsic;
+def int_arm_neon_vcvtfxs2fp : Neon_CvtFxToFP_Intrinsic;
+def int_arm_neon_vcvtfxu2fp : Neon_CvtFxToFP_Intrinsic;
+
+// Narrowing and Lengthening Vector Moves.
+def int_arm_neon_vmovn : Neon_1Arg_Narrow_Intrinsic;
+def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic;
+def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic;
+def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic;
+def int_arm_neon_vmovls : Neon_1Arg_Long_Intrinsic;
+def int_arm_neon_vmovlu : Neon_1Arg_Long_Intrinsic;
+
+let TargetPrefix = "arm" in {
+
+ // De-interleaving vector loads from N-element structures.
+ def int_arm_neon_vld3i : Intrinsic<[llvm_anyint_ty],
+ [llvm_ptr_ty], [IntrReadArgMem]>;
+ def int_arm_neon_vld3f : Intrinsic<[llvm_anyfloat_ty],
+ [llvm_ptr_ty], [IntrReadArgMem]>;
+ def int_arm_neon_vld4i : Intrinsic<[llvm_anyint_ty],
+ [llvm_ptr_ty], [IntrReadArgMem]>;
+ def int_arm_neon_vld4f : Intrinsic<[llvm_anyfloat_ty],
+ [llvm_ptr_ty], [IntrReadArgMem]>;
+
+ // Interleaving vector stores from N-element structures.
+ def int_arm_neon_vst3i : Intrinsic<[llvm_void_ty],
+ [llvm_anyint_ty, llvm_ptr_ty],
+ [IntrWriteArgMem]>;
+ def int_arm_neon_vst3f : Intrinsic<[llvm_void_ty],
+ [llvm_anyfloat_ty, llvm_ptr_ty],
+ [IntrWriteArgMem]>;
+ def int_arm_neon_vst4i : Intrinsic<[llvm_void_ty],
+ [llvm_anyint_ty, llvm_ptr_ty],
+ [IntrWriteArgMem]>;
+ def int_arm_neon_vst4f : Intrinsic<[llvm_void_ty],
+ [llvm_anyfloat_ty, llvm_ptr_ty],
+ [IntrWriteArgMem]>;
+}
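
Editor's note: the shift-count convention described in the Vector Shifts comment above (per-lane counts, positive for left shifts and negative for right shifts) can be modeled for a single unsigned lane as follows. This is only an illustration of the convention for in-range counts, not of the full VSHL semantics or its saturating and rounding variants:

#include <cstdint>
#include <cstdio>

// One unsigned 8-bit lane under the convention above: a positive count
// shifts left, a negative count shifts right. Out-of-range counts and the
// saturating/rounding variants are deliberately not modeled here.
static uint8_t shift_lane_u8(uint8_t value, int8_t count) {
  if (count >= 0)
    return static_cast<uint8_t>(value << count);   // left shift
  return static_cast<uint8_t>(value >> -count);    // right shift
}

int main() {
  std::printf("0x40 shifted by  1 -> 0x%02x\n", shift_lane_u8(0x40, 1));   // 0x80
  std::printf("0x40 shifted by -2 -> 0x%02x\n", shift_lane_u8(0x40, -2));  // 0x10
  return 0;
}
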
diff --git a/include/llvm/Support/Timer.h b/include/llvm/Support/Timer.h
index 584199f44007..9a8247071c34 100644
--- a/include/llvm/Support/Timer.h
+++ b/include/llvm/Support/Timer.h
@@ -152,6 +152,7 @@ class TimerGroup {
unsigned NumTimers;
std::vector<Timer> TimersToPrint;
public:
+ TimerGroup() : Name("Miscellaneous Ungrouped Timers"), NumTimers(0) {}
explicit TimerGroup(const std::string &name) : Name(name), NumTimers(0) {}
~TimerGroup() {
assert(NumTimers == 0 &&
diff --git a/include/llvm/Target/TargetELFWriterInfo.h b/include/llvm/Target/TargetELFWriterInfo.h
index f7e3392577d9..c1f54d201a0e 100644
--- a/include/llvm/Target/TargetELFWriterInfo.h
+++ b/include/llvm/Target/TargetELFWriterInfo.h
@@ -78,11 +78,32 @@ namespace llvm {
/// Symbol Table Info
unsigned getSymTabEntrySize() const { return is64Bit ? 24 : 16; }
- unsigned getSymTabAlignment() const { return is64Bit ? 8 : 4; }
+
+ /// getPrefELFAlignment - Returns the preferred alignment for ELF. This
+ /// is used to align some sections.
+ unsigned getPrefELFAlignment() const { return is64Bit ? 8 : 4; }
+
+ /// getRelocationEntrySize - Entry size used in the relocation section
+ unsigned getRelocationEntrySize() const {
+ return is64Bit ? (hasRelocationAddend() ? 24 : 16)
+ : (hasRelocationAddend() ? 12 : 8);
+ }
    /// getFunctionAlignment - Returns the alignment for function 'F'; targets
    /// with different alignment constraints should overload this method.
virtual unsigned getFunctionAlignment(const Function *F) const;
+
+ /// getRelocationType - Returns the target specific ELF Relocation type.
+ /// 'MachineRelTy' contains the object code independent relocation type
+ virtual unsigned getRelocationType(unsigned MachineRelTy) const = 0;
+
+ /// hasRelocationAddend - True if the target uses an addend in the
+ /// ELF relocation entry.
+ virtual bool hasRelocationAddend() const = 0;
+
+ /// getAddendForRelTy - Gets the addend value for an ELF relocation entry
+  /// based on the target relocation type. If the addend is not used, returns 0.
+ virtual long int getAddendForRelTy(unsigned RelTy) const = 0;
};
} // end llvm namespace
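// Illustrative sketch (not from the patch above): a standalone mirror of the
// contract behind the three hooks added above. ELFRelocInfo and
// HypotheticalRelocInfo are stand-in names, not the real TargetELFWriterInfo
// class, and the relocation numbers are placeholders rather than values from
// any real ELF ABI.
#include <cassert>

struct ELFRelocInfo {
  virtual ~ELFRelocInfo() {}
  // Maps an object-code-independent relocation kind to a target ELF type.
  virtual unsigned getRelocationType(unsigned MachineRelTy) const = 0;
  // Whether entries carry an explicit addend (RELA) or not (REL).
  virtual bool hasRelocationAddend() const = 0;
  // Addend to emit for a given relocation type; 0 when addends are unused.
  virtual long getAddendForRelTy(unsigned RelTy) const = 0;

  // Mirrors getRelocationEntrySize() above: Elf64_Rela/Elf64_Rel entries are
  // 24/16 bytes, Elf32_Rela/Elf32_Rel entries are 12/8 bytes.
  unsigned relocationEntrySize(bool is64Bit) const {
    return is64Bit ? (hasRelocationAddend() ? 24 : 16)
                   : (hasRelocationAddend() ? 12 : 8);
  }
};

// A hypothetical 32-bit REL-style target.
struct HypotheticalRelocInfo : ELFRelocInfo {
  virtual unsigned getRelocationType(unsigned MachineRelTy) const {
    return MachineRelTy == 0 ? 1 /* absolute */ : 2 /* pc-relative */;
  }
  virtual bool hasRelocationAddend() const { return false; }
  virtual long getAddendForRelTy(unsigned) const { return 0; }
};

int main() {
  HypotheticalRelocInfo RI;
  assert(RI.relocationEntrySize(false) == 8);   // Elf32_Rel
  assert(RI.relocationEntrySize(true) == 16);   // Elf64_Rel
  return 0;
}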
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index d0620456399b..f689dcac305a 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -28,7 +28,6 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/ManagedStatic.h"
#include <algorithm>
using namespace llvm;
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index 6a53a83665c2..caeb14bef373 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -39,7 +39,7 @@ Pass *llvm::createIVUsersPass() {
/// containsAddRecFromDifferentLoop - Determine whether expression S involves a
/// subexpression that is an AddRec from a loop other than L. An outer loop
/// of L is OK, but not an inner loop nor a disjoint loop.
-static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) {
+static bool containsAddRecFromDifferentLoop(const SCEV* S, Loop *L) {
// This is very common, put it first.
if (isa<SCEVConstant>(S))
return false;
@@ -80,10 +80,10 @@ static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) {
/// a mix of loop invariant and loop variant expressions. The start cannot,
/// however, contain an AddRec from a different loop, unless that loop is an
/// outer loop of the current loop.
-static bool getSCEVStartAndStride(const SCEVHandle &SH, Loop *L, Loop *UseLoop,
- SCEVHandle &Start, SCEVHandle &Stride,
+static bool getSCEVStartAndStride(const SCEV* &SH, Loop *L, Loop *UseLoop,
+ const SCEV* &Start, const SCEV* &Stride,
ScalarEvolution *SE, DominatorTree *DT) {
- SCEVHandle TheAddRec = Start; // Initialize to zero.
+ const SCEV* TheAddRec = Start; // Initialize to zero.
// If the outer level is an AddExpr, the operands are all start values except
// for a nested AddRecExpr.
@@ -109,9 +109,9 @@ static bool getSCEVStartAndStride(const SCEVHandle &SH, Loop *L, Loop *UseLoop,
// Use getSCEVAtScope to attempt to simplify other loops out of
// the picture.
- SCEVHandle AddRecStart = AddRec->getStart();
+ const SCEV* AddRecStart = AddRec->getStart();
AddRecStart = SE->getSCEVAtScope(AddRecStart, UseLoop);
- SCEVHandle AddRecStride = AddRec->getStepRecurrence(*SE);
+ const SCEV* AddRecStride = AddRec->getStepRecurrence(*SE);
// FIXME: If Start contains an SCEVAddRecExpr from a different loop, other
// than an outer loop of the current loop, reject it. LSR has no concept of
@@ -196,13 +196,13 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
return true; // Instruction already handled.
// Get the symbolic expression for this instruction.
- SCEVHandle ISE = SE->getSCEV(I);
+ const SCEV* ISE = SE->getSCEV(I);
if (isa<SCEVCouldNotCompute>(ISE)) return false;
// Get the start and stride for this expression.
Loop *UseLoop = LI->getLoopFor(I->getParent());
- SCEVHandle Start = SE->getIntegerSCEV(0, ISE->getType());
- SCEVHandle Stride = Start;
+ const SCEV* Start = SE->getIntegerSCEV(0, ISE->getType());
+ const SCEV* Stride = Start;
if (!getSCEVStartAndStride(ISE, L, UseLoop, Start, Stride, SE, DT))
return false; // Non-reducible symbolic expression, bail out.
@@ -254,7 +254,7 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
if (IVUseShouldUsePostIncValue(User, I, L, LI, DT, this)) {
// The value used will be incremented by the stride more than we are
// expecting, so subtract this off.
- SCEVHandle NewStart = SE->getMinusSCEV(Start, Stride);
+ const SCEV* NewStart = SE->getMinusSCEV(Start, Stride);
StrideUses->addUser(NewStart, User, I);
StrideUses->Users.back().setIsUseOfPostIncrementedValue(true);
DOUT << " USING POSTINC SCEV, START=" << *NewStart<< "\n";
@@ -295,9 +295,9 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
/// getReplacementExpr - Return a SCEV expression which computes the
/// value of the OperandValToReplace of the given IVStrideUse.
-SCEVHandle IVUsers::getReplacementExpr(const IVStrideUse &U) const {
+const SCEV* IVUsers::getReplacementExpr(const IVStrideUse &U) const {
// Start with zero.
- SCEVHandle RetVal = SE->getIntegerSCEV(0, U.getParent()->Stride->getType());
+ const SCEV* RetVal = SE->getIntegerSCEV(0, U.getParent()->Stride->getType());
// Create the basic add recurrence.
RetVal = SE->getAddRecExpr(RetVal, U.getParent()->Stride, L);
// Add the offset in a separate step, because it may be loop-variant.
@@ -308,7 +308,7 @@ SCEVHandle IVUsers::getReplacementExpr(const IVStrideUse &U) const {
RetVal = SE->getAddExpr(RetVal, U.getParent()->Stride);
// Evaluate the expression out of the loop, if possible.
if (!L->contains(U.getUser()->getParent())) {
- SCEVHandle ExitVal = SE->getSCEVAtScope(RetVal, L->getParentLoop());
+ const SCEV* ExitVal = SE->getSCEVAtScope(RetVal, L->getParentLoop());
if (ExitVal->isLoopInvariant(L))
RetVal = ExitVal;
}
@@ -325,7 +325,7 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const {
OS << ":\n";
for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e; ++Stride) {
- std::map<SCEVHandle, IVUsersOfOneStride*>::const_iterator SI =
+ std::map<const SCEV*, IVUsersOfOneStride*>::const_iterator SI =
IVUsesByStride.find(StrideOrder[Stride]);
assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
OS << " Stride " << *SI->first->getType() << " " << *SI->first << ":\n";
diff --git a/lib/Analysis/LoopVR.cpp b/lib/Analysis/LoopVR.cpp
index 0a3d06bed7e8..ae715ac58632 100644
--- a/lib/Analysis/LoopVR.cpp
+++ b/lib/Analysis/LoopVR.cpp
@@ -26,8 +26,8 @@ char LoopVR::ID = 0;
static RegisterPass<LoopVR> X("loopvr", "Loop Value Ranges", false, true);
/// getRange - determine the range for a particular SCEV within a given Loop
-ConstantRange LoopVR::getRange(SCEVHandle S, Loop *L, ScalarEvolution &SE) {
- SCEVHandle T = SE.getBackedgeTakenCount(L);
+ConstantRange LoopVR::getRange(const SCEV* S, Loop *L, ScalarEvolution &SE) {
+ const SCEV* T = SE.getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(T))
return ConstantRange(cast<IntegerType>(S->getType())->getBitWidth(), true);
@@ -36,7 +36,7 @@ ConstantRange LoopVR::getRange(SCEVHandle S, Loop *L, ScalarEvolution &SE) {
}
/// getRange - determine the range for a particular SCEV with a given trip count
-ConstantRange LoopVR::getRange(SCEVHandle S, SCEVHandle T, ScalarEvolution &SE){
+ConstantRange LoopVR::getRange(const SCEV* S, const SCEV* T, ScalarEvolution &SE){
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
return ConstantRange(C->getValue()->getValue());
@@ -182,8 +182,8 @@ ConstantRange LoopVR::getRange(SCEVHandle S, SCEVHandle T, ScalarEvolution &SE){
if (!Trip) return FullSet;
if (AddRec->isAffine()) {
- SCEVHandle StartHandle = AddRec->getStart();
- SCEVHandle StepHandle = AddRec->getOperand(1);
+ const SCEV* StartHandle = AddRec->getStart();
+ const SCEV* StepHandle = AddRec->getOperand(1);
const SCEVConstant *Step = dyn_cast<SCEVConstant>(StepHandle);
if (!Step) return FullSet;
@@ -194,7 +194,7 @@ ConstantRange LoopVR::getRange(SCEVHandle S, SCEVHandle T, ScalarEvolution &SE){
if ((TripExt * StepExt).ugt(APInt::getLowBitsSet(ExWidth, ExWidth >> 1)))
return FullSet;
- SCEVHandle EndHandle = SE.getAddExpr(StartHandle,
+ const SCEV* EndHandle = SE.getAddExpr(StartHandle,
SE.getMulExpr(T, StepHandle));
const SCEVConstant *Start = dyn_cast<SCEVConstant>(StartHandle);
const SCEVConstant *End = dyn_cast<SCEVConstant>(EndHandle);
@@ -254,7 +254,7 @@ ConstantRange LoopVR::compute(Value *V) {
ScalarEvolution &SE = getAnalysis<ScalarEvolution>();
- SCEVHandle S = SE.getSCEV(I);
+ const SCEV* S = SE.getSCEV(I);
if (isa<SCEVUnknown>(S) || isa<SCEVCouldNotCompute>(S))
return ConstantRange(cast<IntegerType>(V->getType())->getBitWidth(), false);
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 68aa595aa8dd..5cbb5fac8ac8 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -14,7 +14,7 @@
// There are several aspects to this library. First is the representation of
// scalar expressions, which are represented as subclasses of the SCEV class.
// These classes are used to represent certain types of subexpressions that we
-// can handle. These classes are reference counted, managed by the SCEVHandle
+// can handle.  These objects are uniqued by tables held in the ScalarEvolution
// class. We only create one SCEV of a particular shape, so pointer-comparisons
// for equality are legal.
//
@@ -76,7 +76,6 @@
#include "llvm/Support/ConstantRange.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/InstIterator.h"
-#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/Statistic.h"
@@ -133,9 +132,8 @@ bool SCEV::isOne() const {
return false;
}
-SCEVCouldNotCompute::SCEVCouldNotCompute(const ScalarEvolution* p) :
- SCEV(scCouldNotCompute, p) {}
-SCEVCouldNotCompute::~SCEVCouldNotCompute() {}
+SCEVCouldNotCompute::SCEVCouldNotCompute() :
+ SCEV(scCouldNotCompute) {}
bool SCEVCouldNotCompute::isLoopInvariant(const Loop *L) const {
assert(0 && "Attempt to use a SCEVCouldNotCompute object!");
@@ -152,9 +150,9 @@ bool SCEVCouldNotCompute::hasComputableLoopEvolution(const Loop *L) const {
return false;
}
-SCEVHandle SCEVCouldNotCompute::
-replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+const SCEV* SCEVCouldNotCompute::
+replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const {
return this;
}
@@ -169,26 +167,20 @@ bool SCEVCouldNotCompute::classof(const SCEV *S) {
// SCEVConstants - Only allow the creation of one SCEVConstant for any
-// particular value. Don't use a SCEVHandle here, or else the object will
+// particular value. Don't use a const SCEV* here, or else the object will
// never be deleted!
-static ManagedStatic<std::map<ConstantInt*, SCEVConstant*> > SCEVConstants;
-
-SCEVConstant::~SCEVConstant() {
- SCEVConstants->erase(V);
-}
-
-SCEVHandle ScalarEvolution::getConstant(ConstantInt *V) {
- SCEVConstant *&R = (*SCEVConstants)[V];
- if (R == 0) R = new SCEVConstant(V, this);
+const SCEV* ScalarEvolution::getConstant(ConstantInt *V) {
+ SCEVConstant *&R = SCEVConstants[V];
+ if (R == 0) R = new SCEVConstant(V);
return R;
}
-SCEVHandle ScalarEvolution::getConstant(const APInt& Val) {
+const SCEV* ScalarEvolution::getConstant(const APInt& Val) {
return getConstant(ConstantInt::get(Val));
}
-SCEVHandle
+const SCEV*
ScalarEvolution::getConstant(const Type *Ty, uint64_t V, bool isSigned) {
return getConstant(ConstantInt::get(cast<IntegerType>(Ty), V, isSigned));
}
@@ -200,92 +192,62 @@ void SCEVConstant::print(raw_ostream &OS) const {
}
SCEVCastExpr::SCEVCastExpr(unsigned SCEVTy,
- const SCEVHandle &op, const Type *ty,
- const ScalarEvolution* p)
- : SCEV(SCEVTy, p), Op(op), Ty(ty) {}
-
-SCEVCastExpr::~SCEVCastExpr() {}
+ const SCEV* op, const Type *ty)
+ : SCEV(SCEVTy), Op(op), Ty(ty) {}
bool SCEVCastExpr::dominates(BasicBlock *BB, DominatorTree *DT) const {
return Op->dominates(BB, DT);
}
// SCEVTruncates - Only allow the creation of one SCEVTruncateExpr for any
-// particular input. Don't use a SCEVHandle here, or else the object will
+// particular input. Don't use a const SCEV* here, or else the object will
// never be deleted!
-static ManagedStatic<std::map<std::pair<const SCEV*, const Type*>,
- SCEVTruncateExpr*> > SCEVTruncates;
-SCEVTruncateExpr::SCEVTruncateExpr(const SCEVHandle &op, const Type *ty,
- const ScalarEvolution* p)
- : SCEVCastExpr(scTruncate, op, ty, p) {
+SCEVTruncateExpr::SCEVTruncateExpr(const SCEV* op, const Type *ty)
+ : SCEVCastExpr(scTruncate, op, ty) {
assert((Op->getType()->isInteger() || isa<PointerType>(Op->getType())) &&
(Ty->isInteger() || isa<PointerType>(Ty)) &&
"Cannot truncate non-integer value!");
}
-SCEVTruncateExpr::~SCEVTruncateExpr() {
- SCEVTruncates->erase(std::make_pair(Op, Ty));
-}
void SCEVTruncateExpr::print(raw_ostream &OS) const {
OS << "(trunc " << *Op->getType() << " " << *Op << " to " << *Ty << ")";
}
// SCEVZeroExtends - Only allow the creation of one SCEVZeroExtendExpr for any
-// particular input. Don't use a SCEVHandle here, or else the object will never
+// particular input. Don't use a const SCEV* here, or else the object will never
// be deleted!
-static ManagedStatic<std::map<std::pair<const SCEV*, const Type*>,
- SCEVZeroExtendExpr*> > SCEVZeroExtends;
-SCEVZeroExtendExpr::SCEVZeroExtendExpr(const SCEVHandle &op, const Type *ty,
- const ScalarEvolution* p)
- : SCEVCastExpr(scZeroExtend, op, ty, p) {
+SCEVZeroExtendExpr::SCEVZeroExtendExpr(const SCEV* op, const Type *ty)
+ : SCEVCastExpr(scZeroExtend, op, ty) {
assert((Op->getType()->isInteger() || isa<PointerType>(Op->getType())) &&
(Ty->isInteger() || isa<PointerType>(Ty)) &&
"Cannot zero extend non-integer value!");
}
-SCEVZeroExtendExpr::~SCEVZeroExtendExpr() {
- SCEVZeroExtends->erase(std::make_pair(Op, Ty));
-}
-
void SCEVZeroExtendExpr::print(raw_ostream &OS) const {
OS << "(zext " << *Op->getType() << " " << *Op << " to " << *Ty << ")";
}
// SCEVSignExtends - Only allow the creation of one SCEVSignExtendExpr for any
-// particular input. Don't use a SCEVHandle here, or else the object will never
+// particular input. Don't use a const SCEV* here, or else the object will never
// be deleted!
-static ManagedStatic<std::map<std::pair<const SCEV*, const Type*>,
- SCEVSignExtendExpr*> > SCEVSignExtends;
-SCEVSignExtendExpr::SCEVSignExtendExpr(const SCEVHandle &op, const Type *ty,
- const ScalarEvolution* p)
- : SCEVCastExpr(scSignExtend, op, ty, p) {
+SCEVSignExtendExpr::SCEVSignExtendExpr(const SCEV* op, const Type *ty)
+ : SCEVCastExpr(scSignExtend, op, ty) {
assert((Op->getType()->isInteger() || isa<PointerType>(Op->getType())) &&
(Ty->isInteger() || isa<PointerType>(Ty)) &&
"Cannot sign extend non-integer value!");
}
-SCEVSignExtendExpr::~SCEVSignExtendExpr() {
- SCEVSignExtends->erase(std::make_pair(Op, Ty));
-}
-
void SCEVSignExtendExpr::print(raw_ostream &OS) const {
OS << "(sext " << *Op->getType() << " " << *Op << " to " << *Ty << ")";
}
// SCEVCommExprs - Only allow the creation of one SCEVCommutativeExpr for any
-// particular input. Don't use a SCEVHandle here, or else the object will never
+// particular input. Don't use a const SCEV* here, or else the object will never
// be deleted!
-static ManagedStatic<std::map<std::pair<unsigned, std::vector<const SCEV*> >,
- SCEVCommutativeExpr*> > SCEVCommExprs;
-
-SCEVCommutativeExpr::~SCEVCommutativeExpr() {
- std::vector<const SCEV*> SCEVOps(Operands.begin(), Operands.end());
- SCEVCommExprs->erase(std::make_pair(getSCEVType(), SCEVOps));
-}
void SCEVCommutativeExpr::print(raw_ostream &OS) const {
assert(Operands.size() > 1 && "This plus expr shouldn't exist!");
@@ -296,15 +258,15 @@ void SCEVCommutativeExpr::print(raw_ostream &OS) const {
OS << ")";
}
-SCEVHandle SCEVCommutativeExpr::
-replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+const SCEV* SCEVCommutativeExpr::
+replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const {
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
- SCEVHandle H =
+ const SCEV* H =
getOperand(i)->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
if (H != getOperand(i)) {
- SmallVector<SCEVHandle, 8> NewOps;
+ SmallVector<const SCEV*, 8> NewOps;
NewOps.reserve(getNumOperands());
for (unsigned j = 0; j != i; ++j)
NewOps.push_back(getOperand(j));
@@ -338,14 +300,8 @@ bool SCEVNAryExpr::dominates(BasicBlock *BB, DominatorTree *DT) const {
// SCEVUDivs - Only allow the creation of one SCEVUDivExpr for any particular
-// input. Don't use a SCEVHandle here, or else the object will never be
+// input. Don't use a const SCEV* here, or else the object will never be
// deleted!
-static ManagedStatic<std::map<std::pair<const SCEV*, const SCEV*>,
- SCEVUDivExpr*> > SCEVUDivs;
-
-SCEVUDivExpr::~SCEVUDivExpr() {
- SCEVUDivs->erase(std::make_pair(LHS, RHS));
-}
bool SCEVUDivExpr::dominates(BasicBlock *BB, DominatorTree *DT) const {
return LHS->dominates(BB, DT) && RHS->dominates(BB, DT);
@@ -365,26 +321,18 @@ const Type *SCEVUDivExpr::getType() const {
}
// SCEVAddRecExprs - Only allow the creation of one SCEVAddRecExpr for any
-// particular input. Don't use a SCEVHandle here, or else the object will never
+// particular input. Don't use a const SCEV* here, or else the object will never
// be deleted!
-static ManagedStatic<std::map<std::pair<const Loop *,
- std::vector<const SCEV*> >,
- SCEVAddRecExpr*> > SCEVAddRecExprs;
-
-SCEVAddRecExpr::~SCEVAddRecExpr() {
- std::vector<const SCEV*> SCEVOps(Operands.begin(), Operands.end());
- SCEVAddRecExprs->erase(std::make_pair(L, SCEVOps));
-}
-SCEVHandle SCEVAddRecExpr::
-replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
- const SCEVHandle &Conc,
+const SCEV* SCEVAddRecExpr::
+replaceSymbolicValuesWithConcrete(const SCEV* Sym,
+ const SCEV* Conc,
ScalarEvolution &SE) const {
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
- SCEVHandle H =
+ const SCEV* H =
getOperand(i)->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
if (H != getOperand(i)) {
- SmallVector<SCEVHandle, 8> NewOps;
+ SmallVector<const SCEV*, 8> NewOps;
NewOps.reserve(getNumOperands());
for (unsigned j = 0; j != i; ++j)
NewOps.push_back(getOperand(j));
@@ -418,11 +366,8 @@ void SCEVAddRecExpr::print(raw_ostream &OS) const {
}
// SCEVUnknowns - Only allow the creation of one SCEVUnknown for any particular
-// value. Don't use a SCEVHandle here, or else the object will never be
+// value. Don't use a const SCEV* here, or else the object will never be
// deleted!
-static ManagedStatic<std::map<Value*, SCEVUnknown*> > SCEVUnknowns;
-
-SCEVUnknown::~SCEVUnknown() { SCEVUnknowns->erase(V); }
bool SCEVUnknown::isLoopInvariant(const Loop *L) const {
// All non-instruction values are loop invariant. All instructions are loop
@@ -578,7 +523,7 @@ namespace {
/// this to depend on where the addresses of various SCEV objects happened to
/// land in memory.
///
-static void GroupByComplexity(SmallVectorImpl<SCEVHandle> &Ops,
+static void GroupByComplexity(SmallVectorImpl<const SCEV*> &Ops,
LoopInfo *LI) {
if (Ops.size() < 2) return; // Noop
if (Ops.size() == 2) {
@@ -621,7 +566,7 @@ static void GroupByComplexity(SmallVectorImpl<SCEVHandle> &Ops,
/// BinomialCoefficient - Compute BC(It, K). The result has width W.
/// Assume, K > 0.
-static SCEVHandle BinomialCoefficient(SCEVHandle It, unsigned K,
+static const SCEV* BinomialCoefficient(const SCEV* It, unsigned K,
ScalarEvolution &SE,
const Type* ResultTy) {
// Handle the simplest case efficiently.
@@ -714,15 +659,15 @@ static SCEVHandle BinomialCoefficient(SCEVHandle It, unsigned K,
// Calculate the product, at width T+W
const IntegerType *CalculationTy = IntegerType::get(CalculationBits);
- SCEVHandle Dividend = SE.getTruncateOrZeroExtend(It, CalculationTy);
+ const SCEV* Dividend = SE.getTruncateOrZeroExtend(It, CalculationTy);
for (unsigned i = 1; i != K; ++i) {
- SCEVHandle S = SE.getMinusSCEV(It, SE.getIntegerSCEV(i, It->getType()));
+ const SCEV* S = SE.getMinusSCEV(It, SE.getIntegerSCEV(i, It->getType()));
Dividend = SE.getMulExpr(Dividend,
SE.getTruncateOrZeroExtend(S, CalculationTy));
}
// Divide by 2^T
- SCEVHandle DivResult = SE.getUDivExpr(Dividend, SE.getConstant(DivFactor));
+ const SCEV* DivResult = SE.getUDivExpr(Dividend, SE.getConstant(DivFactor));
// Truncate the result, and divide by K! / 2^T.
@@ -739,14 +684,14 @@ static SCEVHandle BinomialCoefficient(SCEVHandle It, unsigned K,
///
/// where BC(It, k) stands for binomial coefficient.
///
-SCEVHandle SCEVAddRecExpr::evaluateAtIteration(SCEVHandle It,
+const SCEV* SCEVAddRecExpr::evaluateAtIteration(const SCEV* It,
ScalarEvolution &SE) const {
- SCEVHandle Result = getStart();
+ const SCEV* Result = getStart();
for (unsigned i = 1, e = getNumOperands(); i != e; ++i) {
// The computation is correct in the face of overflow provided that the
// multiplication is performed _after_ the evaluation of the binomial
// coefficient.
- SCEVHandle Coeff = BinomialCoefficient(It, i, SE, getType());
+ const SCEV* Coeff = BinomialCoefficient(It, i, SE, getType());
if (isa<SCEVCouldNotCompute>(Coeff))
return Coeff;
@@ -759,7 +704,7 @@ SCEVHandle SCEVAddRecExpr::evaluateAtIteration(SCEVHandle It,
// SCEV Expression folder implementations
//===----------------------------------------------------------------------===//
-SCEVHandle ScalarEvolution::getTruncateExpr(const SCEVHandle &Op,
+const SCEV* ScalarEvolution::getTruncateExpr(const SCEV* Op,
const Type *Ty) {
assert(getTypeSizeInBits(Op->getType()) > getTypeSizeInBits(Ty) &&
"This is not a truncating conversion!");
@@ -785,18 +730,18 @@ SCEVHandle ScalarEvolution::getTruncateExpr(const SCEVHandle &Op,
// If the input value is a chrec scev, truncate the chrec's operands.
if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Op)) {
- SmallVector<SCEVHandle, 4> Operands;
+ SmallVector<const SCEV*, 4> Operands;
for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i)
Operands.push_back(getTruncateExpr(AddRec->getOperand(i), Ty));
return getAddRecExpr(Operands, AddRec->getLoop());
}
- SCEVTruncateExpr *&Result = (*SCEVTruncates)[std::make_pair(Op, Ty)];
- if (Result == 0) Result = new SCEVTruncateExpr(Op, Ty, this);
+ SCEVTruncateExpr *&Result = SCEVTruncates[std::make_pair(Op, Ty)];
+ if (Result == 0) Result = new SCEVTruncateExpr(Op, Ty);
return Result;
}
-SCEVHandle ScalarEvolution::getZeroExtendExpr(const SCEVHandle &Op,
+const SCEV* ScalarEvolution::getZeroExtendExpr(const SCEV* Op,
const Type *Ty) {
assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
"This is not an extending conversion!");
@@ -829,28 +774,28 @@ SCEVHandle ScalarEvolution::getZeroExtendExpr(const SCEVHandle &Op,
      // in infinite recursion.  In the latter case, the analysis code will
// cope with a conservative value, and it will take care to purge
// that value once it has finished.
- SCEVHandle MaxBECount = getMaxBackedgeTakenCount(AR->getLoop());
+ const SCEV* MaxBECount = getMaxBackedgeTakenCount(AR->getLoop());
if (!isa<SCEVCouldNotCompute>(MaxBECount)) {
// Manually compute the final value for AR, checking for
// overflow.
- SCEVHandle Start = AR->getStart();
- SCEVHandle Step = AR->getStepRecurrence(*this);
+ const SCEV* Start = AR->getStart();
+ const SCEV* Step = AR->getStepRecurrence(*this);
// Check whether the backedge-taken count can be losslessly casted to
// the addrec's type. The count is always unsigned.
- SCEVHandle CastedMaxBECount =
+ const SCEV* CastedMaxBECount =
getTruncateOrZeroExtend(MaxBECount, Start->getType());
- SCEVHandle RecastedMaxBECount =
+ const SCEV* RecastedMaxBECount =
getTruncateOrZeroExtend(CastedMaxBECount, MaxBECount->getType());
if (MaxBECount == RecastedMaxBECount) {
const Type *WideTy =
IntegerType::get(getTypeSizeInBits(Start->getType()) * 2);
// Check whether Start+Step*MaxBECount has no unsigned overflow.
- SCEVHandle ZMul =
+ const SCEV* ZMul =
getMulExpr(CastedMaxBECount,
getTruncateOrZeroExtend(Step, Start->getType()));
- SCEVHandle Add = getAddExpr(Start, ZMul);
- SCEVHandle OperandExtendedAdd =
+ const SCEV* Add = getAddExpr(Start, ZMul);
+ const SCEV* OperandExtendedAdd =
getAddExpr(getZeroExtendExpr(Start, WideTy),
getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy),
getZeroExtendExpr(Step, WideTy)));
@@ -862,7 +807,7 @@ SCEVHandle ScalarEvolution::getZeroExtendExpr(const SCEVHandle &Op,
// Similar to above, only this time treat the step value as signed.
// This covers loops that count down.
- SCEVHandle SMul =
+ const SCEV* SMul =
getMulExpr(CastedMaxBECount,
getTruncateOrSignExtend(Step, Start->getType()));
Add = getAddExpr(Start, SMul);
@@ -879,12 +824,12 @@ SCEVHandle ScalarEvolution::getZeroExtendExpr(const SCEVHandle &Op,
}
}
- SCEVZeroExtendExpr *&Result = (*SCEVZeroExtends)[std::make_pair(Op, Ty)];
- if (Result == 0) Result = new SCEVZeroExtendExpr(Op, Ty, this);
+ SCEVZeroExtendExpr *&Result = SCEVZeroExtends[std::make_pair(Op, Ty)];
+ if (Result == 0) Result = new SCEVZeroExtendExpr(Op, Ty);
return Result;
}
-SCEVHandle ScalarEvolution::getSignExtendExpr(const SCEVHandle &Op,
+const SCEV* ScalarEvolution::getSignExtendExpr(const SCEV* Op,
const Type *Ty) {
assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
"This is not an extending conversion!");
@@ -917,28 +862,28 @@ SCEVHandle ScalarEvolution::getSignExtendExpr(const SCEVHandle &Op,
      // in infinite recursion.  In the latter case, the analysis code will
// cope with a conservative value, and it will take care to purge
// that value once it has finished.
- SCEVHandle MaxBECount = getMaxBackedgeTakenCount(AR->getLoop());
+ const SCEV* MaxBECount = getMaxBackedgeTakenCount(AR->getLoop());
if (!isa<SCEVCouldNotCompute>(MaxBECount)) {
// Manually compute the final value for AR, checking for
// overflow.
- SCEVHandle Start = AR->getStart();
- SCEVHandle Step = AR->getStepRecurrence(*this);
+ const SCEV* Start = AR->getStart();
+ const SCEV* Step = AR->getStepRecurrence(*this);
// Check whether the backedge-taken count can be losslessly casted to
// the addrec's type. The count is always unsigned.
- SCEVHandle CastedMaxBECount =
+ const SCEV* CastedMaxBECount =
getTruncateOrZeroExtend(MaxBECount, Start->getType());
- SCEVHandle RecastedMaxBECount =
+ const SCEV* RecastedMaxBECount =
getTruncateOrZeroExtend(CastedMaxBECount, MaxBECount->getType());
if (MaxBECount == RecastedMaxBECount) {
const Type *WideTy =
IntegerType::get(getTypeSizeInBits(Start->getType()) * 2);
// Check whether Start+Step*MaxBECount has no signed overflow.
- SCEVHandle SMul =
+ const SCEV* SMul =
getMulExpr(CastedMaxBECount,
getTruncateOrSignExtend(Step, Start->getType()));
- SCEVHandle Add = getAddExpr(Start, SMul);
- SCEVHandle OperandExtendedAdd =
+ const SCEV* Add = getAddExpr(Start, SMul);
+ const SCEV* OperandExtendedAdd =
getAddExpr(getSignExtendExpr(Start, WideTy),
getMulExpr(getZeroExtendExpr(CastedMaxBECount, WideTy),
getSignExtendExpr(Step, WideTy)));
@@ -951,15 +896,15 @@ SCEVHandle ScalarEvolution::getSignExtendExpr(const SCEVHandle &Op,
}
}
- SCEVSignExtendExpr *&Result = (*SCEVSignExtends)[std::make_pair(Op, Ty)];
- if (Result == 0) Result = new SCEVSignExtendExpr(Op, Ty, this);
+ SCEVSignExtendExpr *&Result = SCEVSignExtends[std::make_pair(Op, Ty)];
+ if (Result == 0) Result = new SCEVSignExtendExpr(Op, Ty);
return Result;
}
/// getAnyExtendExpr - Return a SCEV for the given operand extended with
/// unspecified bits out to the given type.
///
-SCEVHandle ScalarEvolution::getAnyExtendExpr(const SCEVHandle &Op,
+const SCEV* ScalarEvolution::getAnyExtendExpr(const SCEV* Op,
const Type *Ty) {
assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
"This is not an extending conversion!");
@@ -974,19 +919,19 @@ SCEVHandle ScalarEvolution::getAnyExtendExpr(const SCEVHandle &Op,
// Peel off a truncate cast.
if (const SCEVTruncateExpr *T = dyn_cast<SCEVTruncateExpr>(Op)) {
- SCEVHandle NewOp = T->getOperand();
+ const SCEV* NewOp = T->getOperand();
if (getTypeSizeInBits(NewOp->getType()) < getTypeSizeInBits(Ty))
return getAnyExtendExpr(NewOp, Ty);
return getTruncateOrNoop(NewOp, Ty);
}
// Next try a zext cast. If the cast is folded, use it.
- SCEVHandle ZExt = getZeroExtendExpr(Op, Ty);
+ const SCEV* ZExt = getZeroExtendExpr(Op, Ty);
if (!isa<SCEVZeroExtendExpr>(ZExt))
return ZExt;
// Next try a sext cast. If the cast is folded, use it.
- SCEVHandle SExt = getSignExtendExpr(Op, Ty);
+ const SCEV* SExt = getSignExtendExpr(Op, Ty);
if (!isa<SCEVSignExtendExpr>(SExt))
return SExt;
@@ -1024,10 +969,10 @@ SCEVHandle ScalarEvolution::getAnyExtendExpr(const SCEVHandle &Op,
/// is also used as a check to avoid infinite recursion.
///
static bool
-CollectAddOperandsWithScales(DenseMap<SCEVHandle, APInt> &M,
- SmallVector<SCEVHandle, 8> &NewOps,
+CollectAddOperandsWithScales(DenseMap<const SCEV*, APInt> &M,
+ SmallVector<const SCEV*, 8> &NewOps,
APInt &AccumulatedConstant,
- const SmallVectorImpl<SCEVHandle> &Ops,
+ const SmallVectorImpl<const SCEV*> &Ops,
const APInt &Scale,
ScalarEvolution &SE) {
bool Interesting = false;
@@ -1048,9 +993,9 @@ CollectAddOperandsWithScales(DenseMap<SCEVHandle, APInt> &M,
} else {
// A multiplication of a constant with some other value. Update
// the map.
- SmallVector<SCEVHandle, 4> MulOps(Mul->op_begin()+1, Mul->op_end());
- SCEVHandle Key = SE.getMulExpr(MulOps);
- std::pair<DenseMap<SCEVHandle, APInt>::iterator, bool> Pair =
+ SmallVector<const SCEV*, 4> MulOps(Mul->op_begin()+1, Mul->op_end());
+ const SCEV* Key = SE.getMulExpr(MulOps);
+ std::pair<DenseMap<const SCEV*, APInt>::iterator, bool> Pair =
M.insert(std::make_pair(Key, APInt()));
if (Pair.second) {
Pair.first->second = NewScale;
@@ -1069,7 +1014,7 @@ CollectAddOperandsWithScales(DenseMap<SCEVHandle, APInt> &M,
AccumulatedConstant += Scale * C->getValue()->getValue();
} else {
// An ordinary operand. Update the map.
- std::pair<DenseMap<SCEVHandle, APInt>::iterator, bool> Pair =
+ std::pair<DenseMap<const SCEV*, APInt>::iterator, bool> Pair =
M.insert(std::make_pair(Ops[i], APInt()));
if (Pair.second) {
Pair.first->second = Scale;
@@ -1096,7 +1041,7 @@ namespace {
/// getAddExpr - Get a canonical add expression, or something simpler if
/// possible.
-SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
+const SCEV* ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV*> &Ops) {
assert(!Ops.empty() && "Cannot get empty add!");
if (Ops.size() == 1) return Ops[0];
#ifndef NDEBUG
@@ -1140,8 +1085,8 @@ SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
if (Ops[i] == Ops[i+1]) { // X + Y + Y --> X + Y*2
// Found a match, merge the two values into a multiply, and add any
// remaining values to the result.
- SCEVHandle Two = getIntegerSCEV(2, Ty);
- SCEVHandle Mul = getMulExpr(Ops[i], Two);
+ const SCEV* Two = getIntegerSCEV(2, Ty);
+ const SCEV* Mul = getMulExpr(Ops[i], Two);
if (Ops.size() == 2)
return Mul;
Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
@@ -1157,7 +1102,7 @@ SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
const SCEVTruncateExpr *Trunc = cast<SCEVTruncateExpr>(Ops[Idx]);
const Type *DstType = Trunc->getType();
const Type *SrcType = Trunc->getOperand()->getType();
- SmallVector<SCEVHandle, 8> LargeOps;
+ SmallVector<const SCEV*, 8> LargeOps;
bool Ok = true;
// Check all the operands to see if they can be represented in the
// source type of the truncate.
@@ -1173,7 +1118,7 @@ SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
// is much more likely to be foldable here.
LargeOps.push_back(getSignExtendExpr(C, SrcType));
} else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(Ops[i])) {
- SmallVector<SCEVHandle, 8> LargeMulOps;
+ SmallVector<const SCEV*, 8> LargeMulOps;
for (unsigned j = 0, f = M->getNumOperands(); j != f && Ok; ++j) {
if (const SCEVTruncateExpr *T =
dyn_cast<SCEVTruncateExpr>(M->getOperand(j))) {
@@ -1201,7 +1146,7 @@ SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
}
if (Ok) {
// Evaluate the expression in the larger type.
- SCEVHandle Fold = getAddExpr(LargeOps);
+ const SCEV* Fold = getAddExpr(LargeOps);
// If it folds to something simple, use it. Otherwise, don't.
if (isa<SCEVConstant>(Fold) || isa<SCEVUnknown>(Fold))
return getTruncateExpr(Fold, DstType);
@@ -1238,23 +1183,23 @@ SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
// operands multiplied by constant values.
if (Idx < Ops.size() && isa<SCEVMulExpr>(Ops[Idx])) {
uint64_t BitWidth = getTypeSizeInBits(Ty);
- DenseMap<SCEVHandle, APInt> M;
- SmallVector<SCEVHandle, 8> NewOps;
+ DenseMap<const SCEV*, APInt> M;
+ SmallVector<const SCEV*, 8> NewOps;
APInt AccumulatedConstant(BitWidth, 0);
if (CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant,
Ops, APInt(BitWidth, 1), *this)) {
      // Some interesting folding opportunity is present, so it's worthwhile to
// re-generate the operands list. Group the operands by constant scale,
// to avoid multiplying by the same constant scale multiple times.
- std::map<APInt, SmallVector<SCEVHandle, 4>, APIntCompare> MulOpLists;
- for (SmallVector<SCEVHandle, 8>::iterator I = NewOps.begin(),
+ std::map<APInt, SmallVector<const SCEV*, 4>, APIntCompare> MulOpLists;
+ for (SmallVector<const SCEV*, 8>::iterator I = NewOps.begin(),
E = NewOps.end(); I != E; ++I)
MulOpLists[M.find(*I)->second].push_back(*I);
// Re-generate the operands list.
Ops.clear();
if (AccumulatedConstant != 0)
Ops.push_back(getConstant(AccumulatedConstant));
- for (std::map<APInt, SmallVector<SCEVHandle, 4>, APIntCompare>::iterator I =
+ for (std::map<APInt, SmallVector<const SCEV*, 4>, APIntCompare>::iterator I =
MulOpLists.begin(), E = MulOpLists.end(); I != E; ++I)
if (I->first != 0)
Ops.push_back(getMulExpr(getConstant(I->first), getAddExpr(I->second)));
@@ -1276,17 +1221,17 @@ SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
for (unsigned AddOp = 0, e = Ops.size(); AddOp != e; ++AddOp)
if (MulOpSCEV == Ops[AddOp] && !isa<SCEVConstant>(Ops[AddOp])) {
// Fold W + X + (X * Y * Z) --> W + (X * ((Y*Z)+1))
- SCEVHandle InnerMul = Mul->getOperand(MulOp == 0);
+ const SCEV* InnerMul = Mul->getOperand(MulOp == 0);
if (Mul->getNumOperands() != 2) {
// If the multiply has more than two operands, we must get the
// Y*Z term.
- SmallVector<SCEVHandle, 4> MulOps(Mul->op_begin(), Mul->op_end());
+ SmallVector<const SCEV*, 4> MulOps(Mul->op_begin(), Mul->op_end());
MulOps.erase(MulOps.begin()+MulOp);
InnerMul = getMulExpr(MulOps);
}
- SCEVHandle One = getIntegerSCEV(1, Ty);
- SCEVHandle AddOne = getAddExpr(InnerMul, One);
- SCEVHandle OuterMul = getMulExpr(AddOne, Ops[AddOp]);
+ const SCEV* One = getIntegerSCEV(1, Ty);
+ const SCEV* AddOne = getAddExpr(InnerMul, One);
+ const SCEV* OuterMul = getMulExpr(AddOne, Ops[AddOp]);
if (Ops.size() == 2) return OuterMul;
if (AddOp < Idx) {
Ops.erase(Ops.begin()+AddOp);
@@ -1310,21 +1255,21 @@ SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
OMulOp != e; ++OMulOp)
if (OtherMul->getOperand(OMulOp) == MulOpSCEV) {
// Fold X + (A*B*C) + (A*D*E) --> X + (A*(B*C+D*E))
- SCEVHandle InnerMul1 = Mul->getOperand(MulOp == 0);
+ const SCEV* InnerMul1 = Mul->getOperand(MulOp == 0);
if (Mul->getNumOperands() != 2) {
- SmallVector<SCEVHandle, 4> MulOps(Mul->op_begin(), Mul->op_end());
+ SmallVector<const SCEV*, 4> MulOps(Mul->op_begin(), Mul->op_end());
MulOps.erase(MulOps.begin()+MulOp);
InnerMul1 = getMulExpr(MulOps);
}
- SCEVHandle InnerMul2 = OtherMul->getOperand(OMulOp == 0);
+ const SCEV* InnerMul2 = OtherMul->getOperand(OMulOp == 0);
if (OtherMul->getNumOperands() != 2) {
- SmallVector<SCEVHandle, 4> MulOps(OtherMul->op_begin(),
+ SmallVector<const SCEV*, 4> MulOps(OtherMul->op_begin(),
OtherMul->op_end());
MulOps.erase(MulOps.begin()+OMulOp);
InnerMul2 = getMulExpr(MulOps);
}
- SCEVHandle InnerMulSum = getAddExpr(InnerMul1,InnerMul2);
- SCEVHandle OuterMul = getMulExpr(MulOpSCEV, InnerMulSum);
+ const SCEV* InnerMulSum = getAddExpr(InnerMul1,InnerMul2);
+ const SCEV* OuterMul = getMulExpr(MulOpSCEV, InnerMulSum);
if (Ops.size() == 2) return OuterMul;
Ops.erase(Ops.begin()+Idx);
Ops.erase(Ops.begin()+OtherMulIdx-1);
@@ -1345,7 +1290,7 @@ SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
for (; Idx < Ops.size() && isa<SCEVAddRecExpr>(Ops[Idx]); ++Idx) {
// Scan all of the other operands to this add and add them to the vector if
// they are loop invariant w.r.t. the recurrence.
- SmallVector<SCEVHandle, 8> LIOps;
+ SmallVector<const SCEV*, 8> LIOps;
const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]);
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
if (Ops[i]->isLoopInvariant(AddRec->getLoop())) {
@@ -1359,11 +1304,11 @@ SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
// NLI + LI + {Start,+,Step} --> NLI + {LI+Start,+,Step}
LIOps.push_back(AddRec->getStart());
- SmallVector<SCEVHandle, 4> AddRecOps(AddRec->op_begin(),
+ SmallVector<const SCEV*, 4> AddRecOps(AddRec->op_begin(),
AddRec->op_end());
AddRecOps[0] = getAddExpr(LIOps);
- SCEVHandle NewRec = getAddRecExpr(AddRecOps, AddRec->getLoop());
+ const SCEV* NewRec = getAddRecExpr(AddRecOps, AddRec->getLoop());
// If all of the other operands were loop invariant, we are done.
if (Ops.size() == 1) return NewRec;
@@ -1385,7 +1330,7 @@ SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
const SCEVAddRecExpr *OtherAddRec = cast<SCEVAddRecExpr>(Ops[OtherIdx]);
if (AddRec->getLoop() == OtherAddRec->getLoop()) {
// Other + {A,+,B} + {C,+,D} --> Other + {A+C,+,B+D}
- SmallVector<SCEVHandle, 4> NewOps(AddRec->op_begin(), AddRec->op_end());
+ SmallVector<const SCEV*, 4> NewOps(AddRec->op_begin(), AddRec->op_end());
for (unsigned i = 0, e = OtherAddRec->getNumOperands(); i != e; ++i) {
if (i >= NewOps.size()) {
NewOps.insert(NewOps.end(), OtherAddRec->op_begin()+i,
@@ -1394,7 +1339,7 @@ SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
}
NewOps[i] = getAddExpr(NewOps[i], OtherAddRec->getOperand(i));
}
- SCEVHandle NewAddRec = getAddRecExpr(NewOps, AddRec->getLoop());
+ const SCEV* NewAddRec = getAddRecExpr(NewOps, AddRec->getLoop());
if (Ops.size() == 2) return NewAddRec;
@@ -1412,16 +1357,16 @@ SCEVHandle ScalarEvolution::getAddExpr(SmallVectorImpl<SCEVHandle> &Ops) {
// Okay, it looks like we really DO need an add expr. Check to see if we
// already have one, otherwise create a new one.
std::vector<const SCEV*> SCEVOps(Ops.begin(), Ops.end());
- SCEVCommutativeExpr *&Result = (*SCEVCommExprs)[std::make_pair(scAddExpr,
+ SCEVCommutativeExpr *&Result = SCEVCommExprs[std::make_pair(scAddExpr,
SCEVOps)];
- if (Result == 0) Result = new SCEVAddExpr(Ops, this);
+ if (Result == 0) Result = new SCEVAddExpr(Ops);
return Result;
}
/// getMulExpr - Get a canonical multiply expression, or something simpler if
/// possible.
-SCEVHandle ScalarEvolution::getMulExpr(SmallVectorImpl<SCEVHandle> &Ops) {
+const SCEV* ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV*> &Ops) {
assert(!Ops.empty() && "Cannot get empty mul!");
#ifndef NDEBUG
for (unsigned i = 1, e = Ops.size(); i != e; ++i)
@@ -1502,7 +1447,7 @@ SCEVHandle ScalarEvolution::getMulExpr(SmallVectorImpl<SCEVHandle> &Ops) {
for (; Idx < Ops.size() && isa<SCEVAddRecExpr>(Ops[Idx]); ++Idx) {
// Scan all of the other operands to this mul and add them to the vector if
// they are loop invariant w.r.t. the recurrence.
- SmallVector<SCEVHandle, 8> LIOps;
+ SmallVector<const SCEV*, 8> LIOps;
const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]);
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
if (Ops[i]->isLoopInvariant(AddRec->getLoop())) {
@@ -1514,7 +1459,7 @@ SCEVHandle ScalarEvolution::getMulExpr(SmallVectorImpl<SCEVHandle> &Ops) {
// If we found some loop invariants, fold them into the recurrence.
if (!LIOps.empty()) {
// NLI * LI * {Start,+,Step} --> NLI * {LI*Start,+,LI*Step}
- SmallVector<SCEVHandle, 4> NewOps;
+ SmallVector<const SCEV*, 4> NewOps;
NewOps.reserve(AddRec->getNumOperands());
if (LIOps.size() == 1) {
const SCEV *Scale = LIOps[0];
@@ -1522,13 +1467,13 @@ SCEVHandle ScalarEvolution::getMulExpr(SmallVectorImpl<SCEVHandle> &Ops) {
NewOps.push_back(getMulExpr(Scale, AddRec->getOperand(i)));
} else {
for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) {
- SmallVector<SCEVHandle, 4> MulOps(LIOps.begin(), LIOps.end());
+ SmallVector<const SCEV*, 4> MulOps(LIOps.begin(), LIOps.end());
MulOps.push_back(AddRec->getOperand(i));
NewOps.push_back(getMulExpr(MulOps));
}
}
- SCEVHandle NewRec = getAddRecExpr(NewOps, AddRec->getLoop());
+ const SCEV* NewRec = getAddRecExpr(NewOps, AddRec->getLoop());
// If all of the other operands were loop invariant, we are done.
if (Ops.size() == 1) return NewRec;
@@ -1552,14 +1497,14 @@ SCEVHandle ScalarEvolution::getMulExpr(SmallVectorImpl<SCEVHandle> &Ops) {
if (AddRec->getLoop() == OtherAddRec->getLoop()) {
// F * G --> {A,+,B} * {C,+,D} --> {A*C,+,F*D + G*B + B*D}
const SCEVAddRecExpr *F = AddRec, *G = OtherAddRec;
- SCEVHandle NewStart = getMulExpr(F->getStart(),
+ const SCEV* NewStart = getMulExpr(F->getStart(),
G->getStart());
- SCEVHandle B = F->getStepRecurrence(*this);
- SCEVHandle D = G->getStepRecurrence(*this);
- SCEVHandle NewStep = getAddExpr(getMulExpr(F, D),
+ const SCEV* B = F->getStepRecurrence(*this);
+ const SCEV* D = G->getStepRecurrence(*this);
+ const SCEV* NewStep = getAddExpr(getMulExpr(F, D),
getMulExpr(G, B),
getMulExpr(B, D));
- SCEVHandle NewAddRec = getAddRecExpr(NewStart, NewStep,
+ const SCEV* NewAddRec = getAddRecExpr(NewStart, NewStep,
F->getLoop());
if (Ops.size() == 2) return NewAddRec;
@@ -1577,17 +1522,17 @@ SCEVHandle ScalarEvolution::getMulExpr(SmallVectorImpl<SCEVHandle> &Ops) {
  // Okay, it looks like we really DO need a mul expr.  Check to see if we
// already have one, otherwise create a new one.
std::vector<const SCEV*> SCEVOps(Ops.begin(), Ops.end());
- SCEVCommutativeExpr *&Result = (*SCEVCommExprs)[std::make_pair(scMulExpr,
+ SCEVCommutativeExpr *&Result = SCEVCommExprs[std::make_pair(scMulExpr,
SCEVOps)];
if (Result == 0)
- Result = new SCEVMulExpr(Ops, this);
+ Result = new SCEVMulExpr(Ops);
return Result;
}
/// getUDivExpr - Get a canonical unsigned division expression, or something
/// simpler if possible.
-SCEVHandle ScalarEvolution::getUDivExpr(const SCEVHandle &LHS,
- const SCEVHandle &RHS) {
+const SCEV* ScalarEvolution::getUDivExpr(const SCEV* LHS,
+ const SCEV* RHS) {
assert(getEffectiveSCEVType(LHS->getType()) ==
getEffectiveSCEVType(RHS->getType()) &&
"SCEVUDivExpr operand types don't match!");
@@ -1620,24 +1565,24 @@ SCEVHandle ScalarEvolution::getUDivExpr(const SCEVHandle &LHS,
getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
getZeroExtendExpr(Step, ExtTy),
AR->getLoop())) {
- SmallVector<SCEVHandle, 4> Operands;
+ SmallVector<const SCEV*, 4> Operands;
for (unsigned i = 0, e = AR->getNumOperands(); i != e; ++i)
Operands.push_back(getUDivExpr(AR->getOperand(i), RHS));
return getAddRecExpr(Operands, AR->getLoop());
}
// (A*B)/C --> A*(B/C) if safe and B/C can be folded.
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(LHS)) {
- SmallVector<SCEVHandle, 4> Operands;
+ SmallVector<const SCEV*, 4> Operands;
for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i)
Operands.push_back(getZeroExtendExpr(M->getOperand(i), ExtTy));
if (getZeroExtendExpr(M, ExtTy) == getMulExpr(Operands))
// Find an operand that's safely divisible.
for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) {
- SCEVHandle Op = M->getOperand(i);
- SCEVHandle Div = getUDivExpr(Op, RHSC);
+ const SCEV* Op = M->getOperand(i);
+ const SCEV* Div = getUDivExpr(Op, RHSC);
if (!isa<SCEVUDivExpr>(Div) && getMulExpr(Div, RHSC) == Op) {
- const SmallVectorImpl<SCEVHandle> &MOperands = M->getOperands();
- Operands = SmallVector<SCEVHandle, 4>(MOperands.begin(),
+ const SmallVectorImpl<const SCEV*> &MOperands = M->getOperands();
+ Operands = SmallVector<const SCEV*, 4>(MOperands.begin(),
MOperands.end());
Operands[i] = Div;
return getMulExpr(Operands);
@@ -1646,13 +1591,13 @@ SCEVHandle ScalarEvolution::getUDivExpr(const SCEVHandle &LHS,
}
// (A+B)/C --> (A/C + B/C) if safe and A/C and B/C can be folded.
if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(LHS)) {
- SmallVector<SCEVHandle, 4> Operands;
+ SmallVector<const SCEV*, 4> Operands;
for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i)
Operands.push_back(getZeroExtendExpr(A->getOperand(i), ExtTy));
if (getZeroExtendExpr(A, ExtTy) == getAddExpr(Operands)) {
Operands.clear();
for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i) {
- SCEVHandle Op = getUDivExpr(A->getOperand(i), RHS);
+ const SCEV* Op = getUDivExpr(A->getOperand(i), RHS);
if (isa<SCEVUDivExpr>(Op) || getMulExpr(Op, RHS) != A->getOperand(i))
break;
Operands.push_back(Op);
@@ -1670,17 +1615,17 @@ SCEVHandle ScalarEvolution::getUDivExpr(const SCEVHandle &LHS,
}
}
- SCEVUDivExpr *&Result = (*SCEVUDivs)[std::make_pair(LHS, RHS)];
- if (Result == 0) Result = new SCEVUDivExpr(LHS, RHS, this);
+ SCEVUDivExpr *&Result = SCEVUDivs[std::make_pair(LHS, RHS)];
+ if (Result == 0) Result = new SCEVUDivExpr(LHS, RHS);
return Result;
}
/// getAddRecExpr - Get an add recurrence expression for the specified loop.
/// Simplify the expression as much as possible.
-SCEVHandle ScalarEvolution::getAddRecExpr(const SCEVHandle &Start,
- const SCEVHandle &Step, const Loop *L) {
- SmallVector<SCEVHandle, 4> Operands;
+const SCEV* ScalarEvolution::getAddRecExpr(const SCEV* Start,
+ const SCEV* Step, const Loop *L) {
+ SmallVector<const SCEV*, 4> Operands;
Operands.push_back(Start);
if (const SCEVAddRecExpr *StepChrec = dyn_cast<SCEVAddRecExpr>(Step))
if (StepChrec->getLoop() == L) {
@@ -1695,7 +1640,7 @@ SCEVHandle ScalarEvolution::getAddRecExpr(const SCEVHandle &Start,
/// getAddRecExpr - Get an add recurrence expression for the specified loop.
/// Simplify the expression as much as possible.
-SCEVHandle ScalarEvolution::getAddRecExpr(SmallVectorImpl<SCEVHandle> &Operands,
+const SCEV* ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV*> &Operands,
const Loop *L) {
if (Operands.size() == 1) return Operands[0];
#ifndef NDEBUG
@@ -1714,9 +1659,8 @@ SCEVHandle ScalarEvolution::getAddRecExpr(SmallVectorImpl<SCEVHandle> &Operands,
if (const SCEVAddRecExpr *NestedAR = dyn_cast<SCEVAddRecExpr>(Operands[0])) {
const Loop* NestedLoop = NestedAR->getLoop();
if (L->getLoopDepth() < NestedLoop->getLoopDepth()) {
- SmallVector<SCEVHandle, 4> NestedOperands(NestedAR->op_begin(),
+ SmallVector<const SCEV*, 4> NestedOperands(NestedAR->op_begin(),
NestedAR->op_end());
- SCEVHandle NestedARHandle(NestedAR);
Operands[0] = NestedAR->getStart();
NestedOperands[0] = getAddRecExpr(Operands, L);
return getAddRecExpr(NestedOperands, NestedLoop);
@@ -1724,21 +1668,21 @@ SCEVHandle ScalarEvolution::getAddRecExpr(SmallVectorImpl<SCEVHandle> &Operands,
}
std::vector<const SCEV*> SCEVOps(Operands.begin(), Operands.end());
- SCEVAddRecExpr *&Result = (*SCEVAddRecExprs)[std::make_pair(L, SCEVOps)];
- if (Result == 0) Result = new SCEVAddRecExpr(Operands, L, this);
+ SCEVAddRecExpr *&Result = SCEVAddRecExprs[std::make_pair(L, SCEVOps)];
+ if (Result == 0) Result = new SCEVAddRecExpr(Operands, L);
return Result;
}
-SCEVHandle ScalarEvolution::getSMaxExpr(const SCEVHandle &LHS,
- const SCEVHandle &RHS) {
- SmallVector<SCEVHandle, 2> Ops;
+const SCEV* ScalarEvolution::getSMaxExpr(const SCEV* LHS,
+ const SCEV* RHS) {
+ SmallVector<const SCEV*, 2> Ops;
Ops.push_back(LHS);
Ops.push_back(RHS);
return getSMaxExpr(Ops);
}
-SCEVHandle
-ScalarEvolution::getSMaxExpr(SmallVectorImpl<SCEVHandle> &Ops) {
+const SCEV*
+ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV*> &Ops) {
assert(!Ops.empty() && "Cannot get empty smax!");
if (Ops.size() == 1) return Ops[0];
#ifndef NDEBUG
@@ -1810,22 +1754,22 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<SCEVHandle> &Ops) {
// Okay, it looks like we really DO need an smax expr. Check to see if we
// already have one, otherwise create a new one.
std::vector<const SCEV*> SCEVOps(Ops.begin(), Ops.end());
- SCEVCommutativeExpr *&Result = (*SCEVCommExprs)[std::make_pair(scSMaxExpr,
+ SCEVCommutativeExpr *&Result = SCEVCommExprs[std::make_pair(scSMaxExpr,
SCEVOps)];
- if (Result == 0) Result = new SCEVSMaxExpr(Ops, this);
+ if (Result == 0) Result = new SCEVSMaxExpr(Ops);
return Result;
}
-SCEVHandle ScalarEvolution::getUMaxExpr(const SCEVHandle &LHS,
- const SCEVHandle &RHS) {
- SmallVector<SCEVHandle, 2> Ops;
+const SCEV* ScalarEvolution::getUMaxExpr(const SCEV* LHS,
+ const SCEV* RHS) {
+ SmallVector<const SCEV*, 2> Ops;
Ops.push_back(LHS);
Ops.push_back(RHS);
return getUMaxExpr(Ops);
}
-SCEVHandle
-ScalarEvolution::getUMaxExpr(SmallVectorImpl<SCEVHandle> &Ops) {
+const SCEV*
+ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV*> &Ops) {
assert(!Ops.empty() && "Cannot get empty umax!");
if (Ops.size() == 1) return Ops[0];
#ifndef NDEBUG
@@ -1897,31 +1841,31 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<SCEVHandle> &Ops) {
// Okay, it looks like we really DO need a umax expr. Check to see if we
// already have one, otherwise create a new one.
std::vector<const SCEV*> SCEVOps(Ops.begin(), Ops.end());
- SCEVCommutativeExpr *&Result = (*SCEVCommExprs)[std::make_pair(scUMaxExpr,
+ SCEVCommutativeExpr *&Result = SCEVCommExprs[std::make_pair(scUMaxExpr,
SCEVOps)];
- if (Result == 0) Result = new SCEVUMaxExpr(Ops, this);
+ if (Result == 0) Result = new SCEVUMaxExpr(Ops);
return Result;
}
-SCEVHandle ScalarEvolution::getSMinExpr(const SCEVHandle &LHS,
- const SCEVHandle &RHS) {
+const SCEV* ScalarEvolution::getSMinExpr(const SCEV* LHS,
+ const SCEV* RHS) {
// ~smax(~x, ~y) == smin(x, y).
return getNotSCEV(getSMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
}
-SCEVHandle ScalarEvolution::getUMinExpr(const SCEVHandle &LHS,
- const SCEVHandle &RHS) {
+const SCEV* ScalarEvolution::getUMinExpr(const SCEV* LHS,
+ const SCEV* RHS) {
// ~umax(~x, ~y) == umin(x, y)
return getNotSCEV(getUMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
}
-SCEVHandle ScalarEvolution::getUnknown(Value *V) {
+const SCEV* ScalarEvolution::getUnknown(Value *V) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
return getConstant(CI);
if (isa<ConstantPointerNull>(V))
return getIntegerSCEV(0, V->getType());
- SCEVUnknown *&Result = (*SCEVUnknowns)[V];
- if (Result == 0) Result = new SCEVUnknown(V, this);
+ SCEVUnknown *&Result = SCEVUnknowns[V];
+ if (Result == 0) Result = new SCEVUnknown(V);
return Result;
}
@@ -1975,7 +1919,7 @@ const Type *ScalarEvolution::getEffectiveSCEVType(const Type *Ty) const {
return TD->getIntPtrType();
}
-SCEVHandle ScalarEvolution::getCouldNotCompute() {
+const SCEV* ScalarEvolution::getCouldNotCompute() {
return CouldNotCompute;
}
@@ -1987,19 +1931,19 @@ bool ScalarEvolution::hasSCEV(Value *V) const {
/// getSCEV - Return an existing SCEV if it exists, otherwise analyze the
/// expression and create a new one.
-SCEVHandle ScalarEvolution::getSCEV(Value *V) {
+const SCEV* ScalarEvolution::getSCEV(Value *V) {
assert(isSCEVable(V->getType()) && "Value is not SCEVable!");
- std::map<SCEVCallbackVH, SCEVHandle>::iterator I = Scalars.find(V);
+ std::map<SCEVCallbackVH, const SCEV*>::iterator I = Scalars.find(V);
if (I != Scalars.end()) return I->second;
- SCEVHandle S = createSCEV(V);
+ const SCEV* S = createSCEV(V);
Scalars.insert(std::make_pair(SCEVCallbackVH(V, this), S));
return S;
}
/// getIntegerSCEV - Given an integer or FP type, create a constant for the
/// specified signed integer value and return a SCEV for the constant.
-SCEVHandle ScalarEvolution::getIntegerSCEV(int Val, const Type *Ty) {
+const SCEV* ScalarEvolution::getIntegerSCEV(int Val, const Type *Ty) {
Ty = getEffectiveSCEVType(Ty);
Constant *C;
if (Val == 0)
@@ -2014,7 +1958,7 @@ SCEVHandle ScalarEvolution::getIntegerSCEV(int Val, const Type *Ty) {
/// getNegativeSCEV - Return a SCEV corresponding to -V = -1*V
///
-SCEVHandle ScalarEvolution::getNegativeSCEV(const SCEVHandle &V) {
+const SCEV* ScalarEvolution::getNegativeSCEV(const SCEV* V) {
if (const SCEVConstant *VC = dyn_cast<SCEVConstant>(V))
return getUnknown(ConstantExpr::getNeg(VC->getValue()));
@@ -2024,20 +1968,20 @@ SCEVHandle ScalarEvolution::getNegativeSCEV(const SCEVHandle &V) {
}
/// getNotSCEV - Return a SCEV corresponding to ~V = -1-V
-SCEVHandle ScalarEvolution::getNotSCEV(const SCEVHandle &V) {
+const SCEV* ScalarEvolution::getNotSCEV(const SCEV* V) {
if (const SCEVConstant *VC = dyn_cast<SCEVConstant>(V))
return getUnknown(ConstantExpr::getNot(VC->getValue()));
const Type *Ty = V->getType();
Ty = getEffectiveSCEVType(Ty);
- SCEVHandle AllOnes = getConstant(ConstantInt::getAllOnesValue(Ty));
+ const SCEV* AllOnes = getConstant(ConstantInt::getAllOnesValue(Ty));
return getMinusSCEV(AllOnes, V);
}
/// getMinusSCEV - Return a SCEV corresponding to LHS - RHS.
///
-SCEVHandle ScalarEvolution::getMinusSCEV(const SCEVHandle &LHS,
- const SCEVHandle &RHS) {
+const SCEV* ScalarEvolution::getMinusSCEV(const SCEV* LHS,
+ const SCEV* RHS) {
// X - Y --> X + -Y
return getAddExpr(LHS, getNegativeSCEV(RHS));
}
@@ -2045,8 +1989,8 @@ SCEVHandle ScalarEvolution::getMinusSCEV(const SCEVHandle &LHS,
/// getTruncateOrZeroExtend - Return a SCEV corresponding to a conversion of the
/// input value to the specified type. If the type must be extended, it is zero
/// extended.
-SCEVHandle
-ScalarEvolution::getTruncateOrZeroExtend(const SCEVHandle &V,
+const SCEV*
+ScalarEvolution::getTruncateOrZeroExtend(const SCEV* V,
const Type *Ty) {
const Type *SrcTy = V->getType();
assert((SrcTy->isInteger() || (TD && isa<PointerType>(SrcTy))) &&
@@ -2062,8 +2006,8 @@ ScalarEvolution::getTruncateOrZeroExtend(const SCEVHandle &V,
/// getTruncateOrSignExtend - Return a SCEV corresponding to a conversion of the
/// input value to the specified type. If the type must be extended, it is sign
/// extended.
-SCEVHandle
-ScalarEvolution::getTruncateOrSignExtend(const SCEVHandle &V,
+const SCEV*
+ScalarEvolution::getTruncateOrSignExtend(const SCEV* V,
const Type *Ty) {
const Type *SrcTy = V->getType();
assert((SrcTy->isInteger() || (TD && isa<PointerType>(SrcTy))) &&
@@ -2079,8 +2023,8 @@ ScalarEvolution::getTruncateOrSignExtend(const SCEVHandle &V,
/// getNoopOrZeroExtend - Return a SCEV corresponding to a conversion of the
/// input value to the specified type. If the type must be extended, it is zero
/// extended. The conversion must not be narrowing.
-SCEVHandle
-ScalarEvolution::getNoopOrZeroExtend(const SCEVHandle &V, const Type *Ty) {
+const SCEV*
+ScalarEvolution::getNoopOrZeroExtend(const SCEV* V, const Type *Ty) {
const Type *SrcTy = V->getType();
assert((SrcTy->isInteger() || (TD && isa<PointerType>(SrcTy))) &&
(Ty->isInteger() || (TD && isa<PointerType>(Ty))) &&
@@ -2095,8 +2039,8 @@ ScalarEvolution::getNoopOrZeroExtend(const SCEVHandle &V, const Type *Ty) {
/// getNoopOrSignExtend - Return a SCEV corresponding to a conversion of the
/// input value to the specified type. If the type must be extended, it is sign
/// extended. The conversion must not be narrowing.
-SCEVHandle
-ScalarEvolution::getNoopOrSignExtend(const SCEVHandle &V, const Type *Ty) {
+const SCEV*
+ScalarEvolution::getNoopOrSignExtend(const SCEV* V, const Type *Ty) {
const Type *SrcTy = V->getType();
assert((SrcTy->isInteger() || (TD && isa<PointerType>(SrcTy))) &&
(Ty->isInteger() || (TD && isa<PointerType>(Ty))) &&
@@ -2112,8 +2056,8 @@ ScalarEvolution::getNoopOrSignExtend(const SCEVHandle &V, const Type *Ty) {
/// the input value to the specified type. If the type must be extended,
/// it is extended with unspecified bits. The conversion must not be
/// narrowing.
-SCEVHandle
-ScalarEvolution::getNoopOrAnyExtend(const SCEVHandle &V, const Type *Ty) {
+const SCEV*
+ScalarEvolution::getNoopOrAnyExtend(const SCEV* V, const Type *Ty) {
const Type *SrcTy = V->getType();
assert((SrcTy->isInteger() || (TD && isa<PointerType>(SrcTy))) &&
(Ty->isInteger() || (TD && isa<PointerType>(Ty))) &&
@@ -2127,8 +2071,8 @@ ScalarEvolution::getNoopOrAnyExtend(const SCEVHandle &V, const Type *Ty) {
/// getTruncateOrNoop - Return a SCEV corresponding to a conversion of the
/// input value to the specified type. The conversion must not be widening.
-SCEVHandle
-ScalarEvolution::getTruncateOrNoop(const SCEVHandle &V, const Type *Ty) {
+const SCEV*
+ScalarEvolution::getTruncateOrNoop(const SCEV* V, const Type *Ty) {
const Type *SrcTy = V->getType();
assert((SrcTy->isInteger() || (TD && isa<PointerType>(SrcTy))) &&
(Ty->isInteger() || (TD && isa<PointerType>(Ty))) &&
@@ -2143,10 +2087,10 @@ ScalarEvolution::getTruncateOrNoop(const SCEVHandle &V, const Type *Ty) {
/// getUMaxFromMismatchedTypes - Promote the operands to the wider of
/// the types using zero-extension, and then perform a umax operation
/// with them.
-SCEVHandle ScalarEvolution::getUMaxFromMismatchedTypes(const SCEVHandle &LHS,
- const SCEVHandle &RHS) {
- SCEVHandle PromotedLHS = LHS;
- SCEVHandle PromotedRHS = RHS;
+const SCEV* ScalarEvolution::getUMaxFromMismatchedTypes(const SCEV* LHS,
+ const SCEV* RHS) {
+ const SCEV* PromotedLHS = LHS;
+ const SCEV* PromotedRHS = RHS;
if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(RHS->getType()))
PromotedRHS = getZeroExtendExpr(RHS, LHS->getType());
@@ -2156,17 +2100,33 @@ SCEVHandle ScalarEvolution::getUMaxFromMismatchedTypes(const SCEVHandle &LHS,
return getUMaxExpr(PromotedLHS, PromotedRHS);
}
+/// getUMinFromMismatchedTypes - Promote the operands to the wider of
+/// the types using zero-extension, and then perform a umin operation
+/// with them.
+const SCEV* ScalarEvolution::getUMinFromMismatchedTypes(const SCEV* LHS,
+ const SCEV* RHS) {
+ const SCEV* PromotedLHS = LHS;
+ const SCEV* PromotedRHS = RHS;
+
+ if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(RHS->getType()))
+ PromotedRHS = getZeroExtendExpr(RHS, LHS->getType());
+ else
+ PromotedLHS = getNoopOrZeroExtend(LHS, RHS->getType());
+
+ return getUMinExpr(PromotedLHS, PromotedRHS);
+}
+
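The new helper mirrors getUMaxFromMismatchedTypes: the narrower operand is zero-extended to the wider type before the unsigned minimum is formed. A minimal standalone sketch of that promotion rule, using fixed-width integers in place of SCEV expressions (the helper name below is made up for illustration):

#include <stdint.h>
#include <algorithm>

// Zero-extend the narrower unsigned value to the wider width, then take the
// unsigned minimum, as getUMinFromMismatchedTypes does for SCEV operands.
static uint64_t uminMismatched(uint32_t Narrow, uint64_t Wide) {
  uint64_t Promoted = (uint64_t)Narrow; // zero-extension: i32 -> i64
  return std::min(Promoted, Wide);
}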
/// ReplaceSymbolicValueWithConcrete - This looks up the computed SCEV value for
/// the specified instruction and replaces any references to the symbolic value
/// SymName with the specified value. This is used during PHI resolution.
void ScalarEvolution::
-ReplaceSymbolicValueWithConcrete(Instruction *I, const SCEVHandle &SymName,
- const SCEVHandle &NewVal) {
- std::map<SCEVCallbackVH, SCEVHandle>::iterator SI =
+ReplaceSymbolicValueWithConcrete(Instruction *I, const SCEV* SymName,
+ const SCEV* NewVal) {
+ std::map<SCEVCallbackVH, const SCEV*>::iterator SI =
Scalars.find(SCEVCallbackVH(I, this));
if (SI == Scalars.end()) return;
- SCEVHandle NV =
+ const SCEV* NV =
SI->second->replaceSymbolicValuesWithConcrete(SymName, NewVal, *this);
if (NV == SI->second) return; // No change.
@@ -2182,7 +2142,7 @@ ReplaceSymbolicValueWithConcrete(Instruction *I, const SCEVHandle &SymName,
/// createNodeForPHI - PHI nodes have two cases. Either the PHI node exists in
/// a loop header, making it a potential recurrence, or it doesn't.
///
-SCEVHandle ScalarEvolution::createNodeForPHI(PHINode *PN) {
+const SCEV* ScalarEvolution::createNodeForPHI(PHINode *PN) {
if (PN->getNumIncomingValues() == 2) // The loops have been canonicalized.
if (const Loop *L = LI->getLoopFor(PN->getParent()))
if (L->getHeader() == PN->getParent()) {
@@ -2192,14 +2152,14 @@ SCEVHandle ScalarEvolution::createNodeForPHI(PHINode *PN) {
unsigned BackEdge = IncomingEdge^1;
// While we are analyzing this PHI node, handle its value symbolically.
- SCEVHandle SymbolicName = getUnknown(PN);
+ const SCEV* SymbolicName = getUnknown(PN);
assert(Scalars.find(PN) == Scalars.end() &&
"PHI node already processed?");
Scalars.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName));
// Using this symbolic name for the PHI, analyze the value coming around
// the back-edge.
- SCEVHandle BEValue = getSCEV(PN->getIncomingValue(BackEdge));
+ const SCEV* BEValue = getSCEV(PN->getIncomingValue(BackEdge));
// NOTE: If BEValue is loop invariant, we know that the PHI node just
// has a special value for the first iteration of the loop.
@@ -2219,19 +2179,19 @@ SCEVHandle ScalarEvolution::createNodeForPHI(PHINode *PN) {
if (FoundIndex != Add->getNumOperands()) {
// Create an add with everything but the specified operand.
- SmallVector<SCEVHandle, 8> Ops;
+ SmallVector<const SCEV*, 8> Ops;
for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
if (i != FoundIndex)
Ops.push_back(Add->getOperand(i));
- SCEVHandle Accum = getAddExpr(Ops);
+ const SCEV* Accum = getAddExpr(Ops);
// This is not a valid addrec if the step amount is varying each
// loop iteration, but is not itself an addrec in this loop.
if (Accum->isLoopInvariant(L) ||
(isa<SCEVAddRecExpr>(Accum) &&
cast<SCEVAddRecExpr>(Accum)->getLoop() == L)) {
- SCEVHandle StartVal = getSCEV(PN->getIncomingValue(IncomingEdge));
- SCEVHandle PHISCEV = getAddRecExpr(StartVal, Accum, L);
+ const SCEV* StartVal = getSCEV(PN->getIncomingValue(IncomingEdge));
+ const SCEV* PHISCEV = getAddRecExpr(StartVal, Accum, L);
// Okay, for the entire analysis of this edge we assumed the PHI
// to be symbolic. We now need to go back and update all of the
@@ -2250,13 +2210,13 @@ SCEVHandle ScalarEvolution::createNodeForPHI(PHINode *PN) {
// Because the other in-value of i (0) fits the evolution of BEValue
// i really is an addrec evolution.
if (AddRec->getLoop() == L && AddRec->isAffine()) {
- SCEVHandle StartVal = getSCEV(PN->getIncomingValue(IncomingEdge));
+ const SCEV* StartVal = getSCEV(PN->getIncomingValue(IncomingEdge));
// If StartVal = j.start - j.stride, we can use StartVal as the
// initial step of the addrec evolution.
if (StartVal == getMinusSCEV(AddRec->getOperand(0),
AddRec->getOperand(1))) {
- SCEVHandle PHISCEV =
+ const SCEV* PHISCEV =
getAddRecExpr(StartVal, AddRec->getOperand(1), L);
// Okay, for the entire analysis of this edge we assumed the PHI
@@ -2280,14 +2240,14 @@ SCEVHandle ScalarEvolution::createNodeForPHI(PHINode *PN) {
/// createNodeForGEP - Expand GEP instructions into add and multiply
/// operations. This allows them to be analyzed by regular SCEV code.
///
-SCEVHandle ScalarEvolution::createNodeForGEP(User *GEP) {
+const SCEV* ScalarEvolution::createNodeForGEP(User *GEP) {
const Type *IntPtrTy = TD->getIntPtrType();
Value *Base = GEP->getOperand(0);
// Don't attempt to analyze GEPs over unsized objects.
if (!cast<PointerType>(Base->getType())->getElementType()->isSized())
return getUnknown(GEP);
- SCEVHandle TotalOffset = getIntegerSCEV(0, IntPtrTy);
+ const SCEV* TotalOffset = getIntegerSCEV(0, IntPtrTy);
gep_type_iterator GTI = gep_type_begin(GEP);
for (GetElementPtrInst::op_iterator I = next(GEP->op_begin()),
E = GEP->op_end();
@@ -2303,7 +2263,7 @@ SCEVHandle ScalarEvolution::createNodeForGEP(User *GEP) {
getIntegerSCEV(Offset, IntPtrTy));
} else {
// For an array, add the element offset, explicitly scaled.
- SCEVHandle LocalOffset = getSCEV(Index);
+ const SCEV* LocalOffset = getSCEV(Index);
if (!isa<PointerType>(LocalOffset->getType()))
// Getelementptr indices are signed.
LocalOffset = getTruncateOrSignExtend(LocalOffset,
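As a plain illustration of the add-and-multiply decomposition this hunk performs (the struct and variable names below are hypothetical, not from the patch): an element address is just the base plus a scaled index plus a constant field offset, which is what the generated add/mul SCEVs express.

#include <stddef.h>
#include <assert.h>

struct Pair { long A, B; };

int main() {
  struct Pair Arr[8];
  char *Base = (char *)Arr;
  size_t i = 3;
  char *ViaGEP  = (char *)&Arr[i].B;   // what the GEP computes
  char *ViaMath = Base + i * sizeof(struct Pair) + offsetof(struct Pair, B);
  assert(ViaGEP == ViaMath);           // same address via add and multiply
  return 0;
}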
@@ -2323,7 +2283,7 @@ SCEVHandle ScalarEvolution::createNodeForGEP(User *GEP) {
/// the minimum number of times S is divisible by 2. For example, given {4,+,8}
/// it returns 2. If S is guaranteed to be 0, it returns the bitwidth of S.
uint32_t
-ScalarEvolution::GetMinTrailingZeros(const SCEVHandle &S) {
+ScalarEvolution::GetMinTrailingZeros(const SCEV* S) {
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
return C->getValue()->getValue().countTrailingZeros();
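A quick arithmetic check of the {4,+,8} example mentioned in the comment above, written as an assumed standalone program: every value of the recurrence is divisible by 4 but not always by 8, so the minimum trailing-zero count is 2.

#include <assert.h>

int main() {
  for (unsigned k = 0; k != 16; ++k) {
    unsigned V = 4 + 8 * k;      // values of the chrec {4,+,8}
    assert(V % 4 == 0);          // at least two trailing zero bits everywhere
  }
  assert((4 + 8 * 1) % 8 != 0);  // 12 is not divisible by 8, so exactly two
  return 0;
}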
@@ -2400,7 +2360,7 @@ ScalarEvolution::GetMinTrailingZeros(const SCEVHandle &S) {
}
uint32_t
-ScalarEvolution::GetMinLeadingZeros(const SCEVHandle &S) {
+ScalarEvolution::GetMinLeadingZeros(const SCEV* S) {
// TODO: Handle other SCEV expression types here.
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
@@ -2426,7 +2386,7 @@ ScalarEvolution::GetMinLeadingZeros(const SCEVHandle &S) {
}
uint32_t
-ScalarEvolution::GetMinSignBits(const SCEVHandle &S) {
+ScalarEvolution::GetMinSignBits(const SCEV* S) {
// TODO: Handle other SCEV expression types here.
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
@@ -2453,7 +2413,7 @@ ScalarEvolution::GetMinSignBits(const SCEVHandle &S) {
/// createSCEV - We know that there is no SCEV for the specified value.
/// Analyze the expression.
///
-SCEVHandle ScalarEvolution::createSCEV(Value *V) {
+const SCEV* ScalarEvolution::createSCEV(Value *V) {
if (!isSCEVable(V->getType()))
return getUnknown(V);
@@ -2517,7 +2477,7 @@ SCEVHandle ScalarEvolution::createSCEV(Value *V) {
// In order for this transformation to be safe, the LHS must be of the
// form X*(2^n) and the Or constant must be less than 2^n.
if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
- SCEVHandle LHS = getSCEV(U->getOperand(0));
+ const SCEV* LHS = getSCEV(U->getOperand(0));
const APInt &CIVal = CI->getValue();
if (GetMinTrailingZeros(LHS) >=
(CIVal.getBitWidth() - CIVal.countLeadingZeros()))
@@ -2547,7 +2507,7 @@ SCEVHandle ScalarEvolution::createSCEV(Value *V) {
if (const SCEVZeroExtendExpr *Z =
dyn_cast<SCEVZeroExtendExpr>(getSCEV(U->getOperand(0)))) {
const Type *UTy = U->getType();
- SCEVHandle Z0 = Z->getOperand();
+ const SCEV* Z0 = Z->getOperand();
const Type *Z0Ty = Z0->getType();
unsigned Z0TySize = getTypeSizeInBits(Z0Ty);
@@ -2716,14 +2676,14 @@ SCEVHandle ScalarEvolution::createSCEV(Value *V) {
/// loop-invariant backedge-taken count (see
/// hasLoopInvariantBackedgeTakenCount).
///
-SCEVHandle ScalarEvolution::getBackedgeTakenCount(const Loop *L) {
+const SCEV* ScalarEvolution::getBackedgeTakenCount(const Loop *L) {
return getBackedgeTakenInfo(L).Exact;
}
/// getMaxBackedgeTakenCount - Similar to getBackedgeTakenCount, except
/// return the least SCEV value that is known never to be less than the
/// actual backedge taken count.
-SCEVHandle ScalarEvolution::getMaxBackedgeTakenCount(const Loop *L) {
+const SCEV* ScalarEvolution::getMaxBackedgeTakenCount(const Loop *L) {
return getBackedgeTakenInfo(L).Max;
}
@@ -2790,7 +2750,7 @@ void ScalarEvolution::forgetLoopPHIs(const Loop *L) {
SmallVector<Instruction *, 16> Worklist;
for (BasicBlock::iterator I = Header->begin();
PHINode *PN = dyn_cast<PHINode>(I); ++I) {
- std::map<SCEVCallbackVH, SCEVHandle>::iterator It = Scalars.find((Value*)I);
+ std::map<SCEVCallbackVH, const SCEV*>::iterator It = Scalars.find((Value*)I);
if (It != Scalars.end() && !isa<SCEVUnknown>(It->second))
Worklist.push_back(PN);
}
@@ -2812,8 +2772,8 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) {
L->getExitingBlocks(ExitingBlocks);
// Examine all exits and pick the most conservative values.
- SCEVHandle BECount = CouldNotCompute;
- SCEVHandle MaxBECount = CouldNotCompute;
+ const SCEV* BECount = CouldNotCompute;
+ const SCEV* MaxBECount = CouldNotCompute;
bool CouldNotComputeBECount = false;
bool CouldNotComputeMaxBECount = false;
for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
@@ -2822,7 +2782,7 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) {
if (NewBTI.Exact == CouldNotCompute) {
// We couldn't compute an exact value for this exit, so
- // we don't be able to compute an exact value for the loop.
+ // we won't be able to compute an exact value for the loop.
CouldNotComputeBECount = true;
BECount = CouldNotCompute;
} else if (!CouldNotComputeBECount) {
@@ -2838,7 +2798,7 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) {
}
if (NewBTI.Max == CouldNotCompute) {
// We couldn't compute a maximum value for this exit, so
- // we don't be able to compute an maximum value for the loop.
+ // we won't be able to compute a maximum value for the loop.
CouldNotComputeMaxBECount = true;
MaxBECount = CouldNotCompute;
} else if (!CouldNotComputeMaxBECount) {
@@ -2937,23 +2897,21 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExitCond(const Loop *L,
ComputeBackedgeTakenCountFromExitCond(L, BO->getOperand(0), TBB, FBB);
BackedgeTakenInfo BTI1 =
ComputeBackedgeTakenCountFromExitCond(L, BO->getOperand(1), TBB, FBB);
- SCEVHandle BECount = CouldNotCompute;
- SCEVHandle MaxBECount = CouldNotCompute;
+ const SCEV* BECount = CouldNotCompute;
+ const SCEV* MaxBECount = CouldNotCompute;
if (L->contains(TBB)) {
// Both conditions must be true for the loop to continue executing.
// Choose the less conservative count.
- // TODO: Take the minimum of the exact counts.
- if (BTI0.Exact == BTI1.Exact)
- BECount = BTI0.Exact;
- // TODO: Take the minimum of the maximum counts.
+ if (BTI0.Exact == CouldNotCompute || BTI1.Exact == CouldNotCompute)
+ BECount = CouldNotCompute;
+ else
+ BECount = getUMinFromMismatchedTypes(BTI0.Exact, BTI1.Exact);
if (BTI0.Max == CouldNotCompute)
MaxBECount = BTI1.Max;
else if (BTI1.Max == CouldNotCompute)
MaxBECount = BTI0.Max;
- else if (const SCEVConstant *C0 = dyn_cast<SCEVConstant>(BTI0.Max))
- if (const SCEVConstant *C1 = dyn_cast<SCEVConstant>(BTI1.Max))
- MaxBECount = getConstant(APIntOps::umin(C0->getValue()->getValue(),
- C1->getValue()->getValue()));
+ else
+ MaxBECount = getUMinFromMismatchedTypes(BTI0.Max, BTI1.Max);
} else {
// Both conditions must be true for the loop to exit.
assert(L->contains(FBB) && "Loop block has no successor in loop!");
@@ -2971,23 +2929,21 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExitCond(const Loop *L,
ComputeBackedgeTakenCountFromExitCond(L, BO->getOperand(0), TBB, FBB);
BackedgeTakenInfo BTI1 =
ComputeBackedgeTakenCountFromExitCond(L, BO->getOperand(1), TBB, FBB);
- SCEVHandle BECount = CouldNotCompute;
- SCEVHandle MaxBECount = CouldNotCompute;
+ const SCEV* BECount = CouldNotCompute;
+ const SCEV* MaxBECount = CouldNotCompute;
if (L->contains(FBB)) {
// Both conditions must be false for the loop to continue executing.
// Choose the less conservative count.
- // TODO: Take the minimum of the exact counts.
- if (BTI0.Exact == BTI1.Exact)
- BECount = BTI0.Exact;
- // TODO: Take the minimum of the maximum counts.
+ if (BTI0.Exact == CouldNotCompute || BTI1.Exact == CouldNotCompute)
+ BECount = CouldNotCompute;
+ else
+ BECount = getUMinFromMismatchedTypes(BTI0.Exact, BTI1.Exact);
if (BTI0.Max == CouldNotCompute)
MaxBECount = BTI1.Max;
else if (BTI1.Max == CouldNotCompute)
MaxBECount = BTI0.Max;
- else if (const SCEVConstant *C0 = dyn_cast<SCEVConstant>(BTI0.Max))
- if (const SCEVConstant *C1 = dyn_cast<SCEVConstant>(BTI1.Max))
- MaxBECount = getConstant(APIntOps::umin(C0->getValue()->getValue(),
- C1->getValue()->getValue()));
+ else
+ MaxBECount = getUMinFromMismatchedTypes(BTI0.Max, BTI1.Max);
} else {
// Both conditions must be false for the loop to exit.
assert(L->contains(TBB) && "Loop block has no successor in loop!");
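The rationale for replacing the old equal-counts special case with getUMinFromMismatchedTypes, shown with an assumed source-level example rather than IR: when two exit tests must both hold for the loop to keep running, the earlier bound wins, so the iteration count is the unsigned minimum of the two individual counts.

// Hypothetical illustration, not part of the patch.
unsigned iterationsWithTwoBounds(unsigned N, unsigned M) {
  unsigned Iters = 0;
  for (unsigned i = 0; i != N && i != M; ++i)
    ++Iters;                // body runs until the first bound is reached
  return Iters;             // == umin(N, M)
}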
@@ -3029,7 +2985,7 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExitCondICmp(const Loop *L,
// Handle common loops like: for (X = "string"; *X; ++X)
if (LoadInst *LI = dyn_cast<LoadInst>(ExitCond->getOperand(0)))
if (Constant *RHS = dyn_cast<Constant>(ExitCond->getOperand(1))) {
- SCEVHandle ItCnt =
+ const SCEV* ItCnt =
ComputeLoadConstantCompareBackedgeTakenCount(LI, RHS, L, Cond);
if (!isa<SCEVCouldNotCompute>(ItCnt)) {
unsigned BitWidth = getTypeSizeInBits(ItCnt->getType());
@@ -3039,8 +2995,8 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExitCondICmp(const Loop *L,
}
}
- SCEVHandle LHS = getSCEV(ExitCond->getOperand(0));
- SCEVHandle RHS = getSCEV(ExitCond->getOperand(1));
+ const SCEV* LHS = getSCEV(ExitCond->getOperand(0));
+ const SCEV* RHS = getSCEV(ExitCond->getOperand(1));
// Try to evaluate any dependencies out of the loop.
LHS = getSCEVAtScope(LHS, L);
@@ -3063,20 +3019,20 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExitCondICmp(const Loop *L,
ConstantRange CompRange(
ICmpInst::makeConstantRange(Cond, RHSC->getValue()->getValue()));
- SCEVHandle Ret = AddRec->getNumIterationsInRange(CompRange, *this);
+ const SCEV* Ret = AddRec->getNumIterationsInRange(CompRange, *this);
if (!isa<SCEVCouldNotCompute>(Ret)) return Ret;
}
switch (Cond) {
case ICmpInst::ICMP_NE: { // while (X != Y)
// Convert to: while (X-Y != 0)
- SCEVHandle TC = HowFarToZero(getMinusSCEV(LHS, RHS), L);
+ const SCEV* TC = HowFarToZero(getMinusSCEV(LHS, RHS), L);
if (!isa<SCEVCouldNotCompute>(TC)) return TC;
break;
}
case ICmpInst::ICMP_EQ: {
// Convert to: while (X-Y == 0) // while (X == Y)
- SCEVHandle TC = HowFarToNonZero(getMinusSCEV(LHS, RHS), L);
+ const SCEV* TC = HowFarToNonZero(getMinusSCEV(LHS, RHS), L);
if (!isa<SCEVCouldNotCompute>(TC)) return TC;
break;
}
@@ -3120,8 +3076,8 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExitCondICmp(const Loop *L,
static ConstantInt *
EvaluateConstantChrecAtConstant(const SCEVAddRecExpr *AddRec, ConstantInt *C,
ScalarEvolution &SE) {
- SCEVHandle InVal = SE.getConstant(C);
- SCEVHandle Val = AddRec->evaluateAtIteration(InVal, SE);
+ const SCEV* InVal = SE.getConstant(C);
+ const SCEV* Val = AddRec->evaluateAtIteration(InVal, SE);
assert(isa<SCEVConstant>(Val) &&
"Evaluation of SCEV at constant didn't fold correctly?");
return cast<SCEVConstant>(Val)->getValue();
@@ -3164,7 +3120,7 @@ GetAddressedElementFromGlobal(GlobalVariable *GV,
/// ComputeLoadConstantCompareBackedgeTakenCount - Given an exit condition of
/// 'icmp op load X, cst', try to see if we can compute the backedge
/// execution count.
-SCEVHandle ScalarEvolution::
+const SCEV* ScalarEvolution::
ComputeLoadConstantCompareBackedgeTakenCount(LoadInst *LI, Constant *RHS,
const Loop *L,
ICmpInst::Predicate predicate) {
@@ -3198,7 +3154,7 @@ ComputeLoadConstantCompareBackedgeTakenCount(LoadInst *LI, Constant *RHS,
// Okay, we know we have a (load (gep GV, 0, X)) comparison with a constant.
// Check to see if X is a loop variant variable value now.
- SCEVHandle Idx = getSCEV(VarIdx);
+ const SCEV* Idx = getSCEV(VarIdx);
Idx = getSCEVAtScope(Idx, L);
// We can only recognize very limited forms of loop index expressions, in
@@ -3374,7 +3330,7 @@ getConstantEvolutionLoopExitValue(PHINode *PN, const APInt& BEs, const Loop *L){
/// try to evaluate a few iterations of the loop until the exit
/// condition gets a value of ExitWhen (true or false). If we cannot
/// evaluate the trip count of the loop, return CouldNotCompute.
-SCEVHandle ScalarEvolution::
+const SCEV* ScalarEvolution::
ComputeBackedgeTakenCountExhaustively(const Loop *L, Value *Cond, bool ExitWhen) {
PHINode *PN = getConstantEvolvingPHI(Cond, L);
if (PN == 0) return CouldNotCompute;
@@ -3431,7 +3387,7 @@ ComputeBackedgeTakenCountExhaustively(const Loop *L, Value *Cond, bool ExitWhen)
///
/// In the case that a relevant loop exit value cannot be computed, the
/// original value V is returned.
-SCEVHandle ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
+const SCEV* ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
// FIXME: this should be turned into a virtual method on SCEV!
if (isa<SCEVConstant>(V)) return V;
@@ -3448,7 +3404,7 @@ SCEVHandle ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
// to see if the loop that contains it has a known backedge-taken
// count. If so, we may be able to force computation of the exit
// value.
- SCEVHandle BackedgeTakenCount = getBackedgeTakenCount(LI);
+ const SCEV* BackedgeTakenCount = getBackedgeTakenCount(LI);
if (const SCEVConstant *BTCC =
dyn_cast<SCEVConstant>(BackedgeTakenCount)) {
// Okay, we know how many times the containing loop executes. If
@@ -3486,7 +3442,7 @@ SCEVHandle ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
if (!isSCEVable(Op->getType()))
return V;
- SCEVHandle OpV = getSCEVAtScope(getSCEV(Op), L);
+ const SCEV* OpV = getSCEVAtScope(getSCEV(Op), L);
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(OpV)) {
Constant *C = SC->getValue();
if (C->getType() != Op->getType())
@@ -3532,11 +3488,11 @@ SCEVHandle ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
// Avoid performing the look-up in the common case where the specified
// expression has no loop-variant portions.
for (unsigned i = 0, e = Comm->getNumOperands(); i != e; ++i) {
- SCEVHandle OpAtScope = getSCEVAtScope(Comm->getOperand(i), L);
+ const SCEV* OpAtScope = getSCEVAtScope(Comm->getOperand(i), L);
if (OpAtScope != Comm->getOperand(i)) {
// Okay, at least one of these operands is loop variant but might be
// foldable. Build a new instance of the folded commutative expression.
- SmallVector<SCEVHandle, 8> NewOps(Comm->op_begin(), Comm->op_begin()+i);
+ SmallVector<const SCEV*, 8> NewOps(Comm->op_begin(), Comm->op_begin()+i);
NewOps.push_back(OpAtScope);
for (++i; i != e; ++i) {
@@ -3559,8 +3515,8 @@ SCEVHandle ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
}
if (const SCEVUDivExpr *Div = dyn_cast<SCEVUDivExpr>(V)) {
- SCEVHandle LHS = getSCEVAtScope(Div->getLHS(), L);
- SCEVHandle RHS = getSCEVAtScope(Div->getRHS(), L);
+ const SCEV* LHS = getSCEVAtScope(Div->getLHS(), L);
+ const SCEV* RHS = getSCEVAtScope(Div->getRHS(), L);
if (LHS == Div->getLHS() && RHS == Div->getRHS())
return Div; // must be loop invariant
return getUDivExpr(LHS, RHS);
@@ -3572,7 +3528,7 @@ SCEVHandle ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
if (!L || !AddRec->getLoop()->contains(L->getHeader())) {
// To evaluate this recurrence, we need to know how many times the AddRec
// loop iterates. Compute this now.
- SCEVHandle BackedgeTakenCount = getBackedgeTakenCount(AddRec->getLoop());
+ const SCEV* BackedgeTakenCount = getBackedgeTakenCount(AddRec->getLoop());
if (BackedgeTakenCount == CouldNotCompute) return AddRec;
// Then, evaluate the AddRec.
@@ -3582,21 +3538,21 @@ SCEVHandle ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
}
if (const SCEVZeroExtendExpr *Cast = dyn_cast<SCEVZeroExtendExpr>(V)) {
- SCEVHandle Op = getSCEVAtScope(Cast->getOperand(), L);
+ const SCEV* Op = getSCEVAtScope(Cast->getOperand(), L);
if (Op == Cast->getOperand())
return Cast; // must be loop invariant
return getZeroExtendExpr(Op, Cast->getType());
}
if (const SCEVSignExtendExpr *Cast = dyn_cast<SCEVSignExtendExpr>(V)) {
- SCEVHandle Op = getSCEVAtScope(Cast->getOperand(), L);
+ const SCEV* Op = getSCEVAtScope(Cast->getOperand(), L);
if (Op == Cast->getOperand())
return Cast; // must be loop invariant
return getSignExtendExpr(Op, Cast->getType());
}
if (const SCEVTruncateExpr *Cast = dyn_cast<SCEVTruncateExpr>(V)) {
- SCEVHandle Op = getSCEVAtScope(Cast->getOperand(), L);
+ const SCEV* Op = getSCEVAtScope(Cast->getOperand(), L);
if (Op == Cast->getOperand())
return Cast; // must be loop invariant
return getTruncateExpr(Op, Cast->getType());
@@ -3608,7 +3564,7 @@ SCEVHandle ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
/// getSCEVAtScope - This is a convenience function which does
/// getSCEVAtScope(getSCEV(V), L).
-SCEVHandle ScalarEvolution::getSCEVAtScope(Value *V, const Loop *L) {
+const SCEV* ScalarEvolution::getSCEVAtScope(Value *V, const Loop *L) {
return getSCEVAtScope(getSCEV(V), L);
}
@@ -3621,7 +3577,7 @@ SCEVHandle ScalarEvolution::getSCEVAtScope(Value *V, const Loop *L) {
/// A and B isn't important.
///
/// If the equation does not have a solution, SCEVCouldNotCompute is returned.
-static SCEVHandle SolveLinEquationWithOverflow(const APInt &A, const APInt &B,
+static const SCEV* SolveLinEquationWithOverflow(const APInt &A, const APInt &B,
ScalarEvolution &SE) {
uint32_t BW = A.getBitWidth();
assert(BW == B.getBitWidth() && "Bit widths must be the same.");
@@ -3664,7 +3620,7 @@ static SCEVHandle SolveLinEquationWithOverflow(const APInt &A, const APInt &B,
/// given quadratic chrec {L,+,M,+,N}. This returns either the two roots (which
/// might be the same) or two SCEVCouldNotCompute objects.
///
-static std::pair<SCEVHandle,SCEVHandle>
+static std::pair<const SCEV*,const SCEV*>
SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
assert(AddRec->getNumOperands() == 3 && "This is not a quadratic chrec!");
const SCEVConstant *LC = dyn_cast<SCEVConstant>(AddRec->getOperand(0));
@@ -3723,7 +3679,7 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
/// HowFarToZero - Return the number of times a backedge comparing the specified
/// value to zero will execute. If not computable, return CouldNotCompute.
-SCEVHandle ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
+const SCEV* ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
// If the value is a constant
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) {
// If the value is already zero, the branch will execute zero times.
@@ -3748,8 +3704,8 @@ SCEVHandle ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
// where BW is the common bit width of Start and Step.
// Get the initial value for the loop.
- SCEVHandle Start = getSCEVAtScope(AddRec->getStart(), L->getParentLoop());
- SCEVHandle Step = getSCEVAtScope(AddRec->getOperand(1), L->getParentLoop());
+ const SCEV* Start = getSCEVAtScope(AddRec->getStart(), L->getParentLoop());
+ const SCEV* Step = getSCEVAtScope(AddRec->getOperand(1), L->getParentLoop());
if (const SCEVConstant *StepC = dyn_cast<SCEVConstant>(Step)) {
// For now we handle only constant steps.
@@ -3769,7 +3725,7 @@ SCEVHandle ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
} else if (AddRec->isQuadratic() && AddRec->getType()->isInteger()) {
// If this is a quadratic (3-term) AddRec {L,+,M,+,N}, find the roots of
// the quadratic equation to solve it.
- std::pair<SCEVHandle,SCEVHandle> Roots = SolveQuadraticEquation(AddRec,
+ std::pair<const SCEV*,const SCEV*> Roots = SolveQuadraticEquation(AddRec,
*this);
const SCEVConstant *R1 = dyn_cast<SCEVConstant>(Roots.first);
const SCEVConstant *R2 = dyn_cast<SCEVConstant>(Roots.second);
@@ -3788,7 +3744,7 @@ SCEVHandle ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
// We can only use this value if the chrec ends up with an exact zero
// value at this index. When solving for "X*X != 5", for example, we
// should not accept a root of 2.
- SCEVHandle Val = AddRec->evaluateAtIteration(R1, *this);
+ const SCEV* Val = AddRec->evaluateAtIteration(R1, *this);
if (Val->isZero())
return R1; // We found a quadratic root!
}
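For a concrete instance of the check above, consider the quadratic chrec {-3,+,-1,+,2}, whose value at iteration i is i*i - 2*i - 3; SolveQuadraticEquation yields the roots -1 and 3, and only i == 3 passes the evaluateAtIteration test. A small assumed self-check:

#include <assert.h>

// Closed form of the chrec {L,+,M,+,N}: L + M*i + N*i*(i-1)/2.
static int chrecAt(int L, int M, int N, int i) {
  return L + M * i + N * (i * (i - 1) / 2);
}

int main() {
  assert(chrecAt(-3, -1, 2, 3) == 0); // exact zero at i == 3: root accepted
  assert(chrecAt(-3, -1, 2, 2) != 0); // earlier iterations are nonzero
  return 0;
}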
@@ -3801,7 +3757,7 @@ SCEVHandle ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
/// HowFarToNonZero - Return the number of times a backedge checking the
/// specified value for nonzero will execute. If not computable, return
/// CouldNotCompute
-SCEVHandle ScalarEvolution::HowFarToNonZero(const SCEV *V, const Loop *L) {
+const SCEV* ScalarEvolution::HowFarToNonZero(const SCEV *V, const Loop *L) {
// Loops that look like: while (X == 0) are very strange indeed. We don't
// handle them yet except for the trivial case. This could be expanded in the
// future as needed.
@@ -3862,7 +3818,7 @@ ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(BasicBlock *BB) {
/// more general, since a front-end may have replicated the controlling
/// expression.
///
-static bool HasSameValue(const SCEVHandle &A, const SCEVHandle &B) {
+static bool HasSameValue(const SCEV* A, const SCEV* B) {
// Quick check to see if they are the same SCEV.
if (A == B) return true;
@@ -3977,8 +3933,8 @@ bool ScalarEvolution::isLoopGuardedByCond(const Loop *L,
if (!PreCondLHS->getType()->isInteger()) continue;
- SCEVHandle PreCondLHSSCEV = getSCEV(PreCondLHS);
- SCEVHandle PreCondRHSSCEV = getSCEV(PreCondRHS);
+ const SCEV* PreCondLHSSCEV = getSCEV(PreCondLHS);
+ const SCEV* PreCondRHSSCEV = getSCEV(PreCondRHS);
if ((HasSameValue(LHS, PreCondLHSSCEV) &&
HasSameValue(RHS, PreCondRHSSCEV)) ||
(HasSameValue(LHS, getNotSCEV(PreCondRHSSCEV)) &&
@@ -3992,22 +3948,22 @@ bool ScalarEvolution::isLoopGuardedByCond(const Loop *L,
/// getBECount - Subtract the end and start values and divide by the step,
/// rounding up, to get the number of times the backedge is executed. Return
/// CouldNotCompute if an intermediate computation overflows.
-SCEVHandle ScalarEvolution::getBECount(const SCEVHandle &Start,
- const SCEVHandle &End,
- const SCEVHandle &Step) {
+const SCEV* ScalarEvolution::getBECount(const SCEV* Start,
+ const SCEV* End,
+ const SCEV* Step) {
const Type *Ty = Start->getType();
- SCEVHandle NegOne = getIntegerSCEV(-1, Ty);
- SCEVHandle Diff = getMinusSCEV(End, Start);
- SCEVHandle RoundUp = getAddExpr(Step, NegOne);
+ const SCEV* NegOne = getIntegerSCEV(-1, Ty);
+ const SCEV* Diff = getMinusSCEV(End, Start);
+ const SCEV* RoundUp = getAddExpr(Step, NegOne);
// Add an adjustment to the difference between End and Start so that
// the division will effectively round up.
- SCEVHandle Add = getAddExpr(Diff, RoundUp);
+ const SCEV* Add = getAddExpr(Diff, RoundUp);
// Check Add for unsigned overflow.
// TODO: More sophisticated things could be done here.
const Type *WideTy = IntegerType::get(getTypeSizeInBits(Ty) + 1);
- SCEVHandle OperandExtendedAdd =
+ const SCEV* OperandExtendedAdd =
getAddExpr(getZeroExtendExpr(Diff, WideTy),
getZeroExtendExpr(RoundUp, WideTy));
if (getZeroExtendExpr(Add, WideTy) != OperandExtendedAdd)
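A self-contained sketch of the round-up division and the widened overflow check performed above; the fixed-width types and the helper name are assumptions for illustration, standing in for the SCEV expressions and WideTy.

#include <stdint.h>
#include <assert.h>

// Returns false where the real code would return CouldNotCompute.
static bool beCountSketch(uint32_t Start, uint32_t End, uint32_t Step,
                          uint32_t &Result) {
  assert(Step != 0 && "zero step");
  uint32_t Diff = End - Start;               // caller guarantees End >= Start
  uint32_t Add  = Diff + (Step - 1);         // adjust so the divide rounds up
  uint64_t WideAdd = (uint64_t)Diff + (uint64_t)(Step - 1);
  if ((uint64_t)Add != WideAdd)              // the narrow addition wrapped
    return false;
  Result = Add / Step;
  return true;
}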
@@ -4032,7 +3988,7 @@ HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
if (AddRec->isAffine()) {
// FORNOW: We only support unit strides.
unsigned BitWidth = getTypeSizeInBits(AddRec->getType());
- SCEVHandle Step = AddRec->getStepRecurrence(*this);
+ const SCEV* Step = AddRec->getStepRecurrence(*this);
// TODO: handle non-constant strides.
const SCEVConstant *CStep = dyn_cast<SCEVConstant>(Step);
@@ -4068,10 +4024,10 @@ HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
// treat m-n as signed nor unsigned due to overflow possibility.
// First, we get the value of the LHS in the first iteration: n
- SCEVHandle Start = AddRec->getOperand(0);
+ const SCEV* Start = AddRec->getOperand(0);
// Determine the minimum constant start value.
- SCEVHandle MinStart = isa<SCEVConstant>(Start) ? Start :
+ const SCEV* MinStart = isa<SCEVConstant>(Start) ? Start :
getConstant(isSigned ? APInt::getSignedMinValue(BitWidth) :
APInt::getMinValue(BitWidth));
@@ -4079,7 +4035,7 @@ HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
// then we know that it will run exactly (m-n)/s times. Otherwise, we
// only know that it will execute (max(m,n)-n)/s times. In both cases,
// the division must round up.
- SCEVHandle End = RHS;
+ const SCEV* End = RHS;
if (!isLoopGuardedByCond(L,
isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
getMinusSCEV(Start, Step), RHS))
@@ -4087,7 +4043,7 @@ HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
: getUMaxExpr(RHS, Start);
// Determine the maximum constant end value.
- SCEVHandle MaxEnd =
+ const SCEV* MaxEnd =
isa<SCEVConstant>(End) ? End :
getConstant(isSigned ? APInt::getSignedMaxValue(BitWidth)
.ashr(GetMinSignBits(End) - 1) :
@@ -4096,11 +4052,11 @@ HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
// Finally, we subtract these two values and divide, rounding up, to get
// the number of times the backedge is executed.
- SCEVHandle BECount = getBECount(Start, End, Step);
+ const SCEV* BECount = getBECount(Start, End, Step);
// The maximum backedge count is similar, except using the minimum start
// value and the maximum end value.
- SCEVHandle MaxBECount = getBECount(MinStart, MaxEnd, Step);;
+ const SCEV* MaxBECount = getBECount(MinStart, MaxEnd, Step);
return BackedgeTakenInfo(BECount, MaxBECount);
}
@@ -4113,7 +4069,7 @@ HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
/// this is that it returns the first iteration number where the value is not in
/// the condition, thus computing the exit count. If the iteration count can't
/// be computed, an instance of SCEVCouldNotCompute is returned.
-SCEVHandle SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
+const SCEV* SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
ScalarEvolution &SE) const {
if (Range.isFullSet()) // Infinite loop.
return SE.getCouldNotCompute();
@@ -4121,9 +4077,9 @@ SCEVHandle SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
// If the start is a non-zero constant, shift the range to simplify things.
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(getStart()))
if (!SC->getValue()->isZero()) {
- SmallVector<SCEVHandle, 4> Operands(op_begin(), op_end());
+ SmallVector<const SCEV*, 4> Operands(op_begin(), op_end());
Operands[0] = SE.getIntegerSCEV(0, SC->getType());
- SCEVHandle Shifted = SE.getAddRecExpr(Operands, getLoop());
+ const SCEV* Shifted = SE.getAddRecExpr(Operands, getLoop());
if (const SCEVAddRecExpr *ShiftedAddRec =
dyn_cast<SCEVAddRecExpr>(Shifted))
return ShiftedAddRec->getNumIterationsInRange(
@@ -4182,12 +4138,12 @@ SCEVHandle SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
// quadratic equation to solve it. To do this, we must frame our problem in
// terms of figuring out when zero is crossed, instead of when
// Range.getUpper() is crossed.
- SmallVector<SCEVHandle, 4> NewOps(op_begin(), op_end());
+ SmallVector<const SCEV*, 4> NewOps(op_begin(), op_end());
NewOps[0] = SE.getNegativeSCEV(SE.getConstant(Range.getUpper()));
- SCEVHandle NewAddRec = SE.getAddRecExpr(NewOps, getLoop());
+ const SCEV* NewAddRec = SE.getAddRecExpr(NewOps, getLoop());
// Next, solve the constructed addrec
- std::pair<SCEVHandle,SCEVHandle> Roots =
+ std::pair<const SCEV*,const SCEV*> Roots =
SolveQuadraticEquation(cast<SCEVAddRecExpr>(NewAddRec), SE);
const SCEVConstant *R1 = dyn_cast<SCEVConstant>(Roots.first);
const SCEVConstant *R2 = dyn_cast<SCEVConstant>(Roots.second);
@@ -4293,7 +4249,7 @@ ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se)
//===----------------------------------------------------------------------===//
ScalarEvolution::ScalarEvolution()
- : FunctionPass(&ID), CouldNotCompute(new SCEVCouldNotCompute(0)) {
+ : FunctionPass(&ID), CouldNotCompute(new SCEVCouldNotCompute()) {
}
bool ScalarEvolution::runOnFunction(Function &F) {
@@ -4308,6 +4264,45 @@ void ScalarEvolution::releaseMemory() {
BackedgeTakenCounts.clear();
ConstantEvolutionLoopExitValue.clear();
ValuesAtScopes.clear();
+
+ for (std::map<ConstantInt*, SCEVConstant*>::iterator
+ I = SCEVConstants.begin(), E = SCEVConstants.end(); I != E; ++I)
+ delete I->second;
+ for (std::map<std::pair<const SCEV*, const Type*>,
+ SCEVTruncateExpr*>::iterator I = SCEVTruncates.begin(),
+ E = SCEVTruncates.end(); I != E; ++I)
+ delete I->second;
+ for (std::map<std::pair<const SCEV*, const Type*>,
+ SCEVZeroExtendExpr*>::iterator I = SCEVZeroExtends.begin(),
+ E = SCEVZeroExtends.end(); I != E; ++I)
+ delete I->second;
+ for (std::map<std::pair<unsigned, std::vector<const SCEV*> >,
+ SCEVCommutativeExpr*>::iterator I = SCEVCommExprs.begin(),
+ E = SCEVCommExprs.end(); I != E; ++I)
+ delete I->second;
+ for (std::map<std::pair<const SCEV*, const SCEV*>, SCEVUDivExpr*>::iterator
+ I = SCEVUDivs.begin(), E = SCEVUDivs.end(); I != E; ++I)
+ delete I->second;
+ for (std::map<std::pair<const SCEV*, const Type*>,
+ SCEVSignExtendExpr*>::iterator I = SCEVSignExtends.begin(),
+ E = SCEVSignExtends.end(); I != E; ++I)
+ delete I->second;
+ for (std::map<std::pair<const Loop *, std::vector<const SCEV*> >,
+ SCEVAddRecExpr*>::iterator I = SCEVAddRecExprs.begin(),
+ E = SCEVAddRecExprs.end(); I != E; ++I)
+ delete I->second;
+ for (std::map<Value*, SCEVUnknown*>::iterator I = SCEVUnknowns.begin(),
+ E = SCEVUnknowns.end(); I != E; ++I)
+ delete I->second;
+
+ SCEVConstants.clear();
+ SCEVTruncates.clear();
+ SCEVZeroExtends.clear();
+ SCEVCommExprs.clear();
+ SCEVUDivs.clear();
+ SCEVSignExtends.clear();
+ SCEVAddRecExprs.clear();
+ SCEVUnknowns.clear();
}
void ScalarEvolution::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -4355,12 +4350,12 @@ void ScalarEvolution::print(raw_ostream &OS, const Module* ) const {
if (isSCEVable(I->getType())) {
OS << *I;
OS << " --> ";
- SCEVHandle SV = SE.getSCEV(&*I);
+ const SCEV* SV = SE.getSCEV(&*I);
SV->print(OS);
const Loop *L = LI->getLoopFor((*I).getParent());
- SCEVHandle AtUse = SE.getSCEVAtScope(SV, L);
+ const SCEV* AtUse = SE.getSCEVAtScope(SV, L);
if (AtUse != SV) {
OS << " --> ";
AtUse->print(OS);
@@ -4368,7 +4363,7 @@ void ScalarEvolution::print(raw_ostream &OS, const Module* ) const {
if (L) {
OS << "\t\t" "Exits: ";
- SCEVHandle ExitValue = SE.getSCEVAtScope(SV, L->getParentLoop());
+ const SCEV* ExitValue = SE.getSCEVAtScope(SV, L->getParentLoop());
if (!ExitValue->isLoopInvariant(L)) {
OS << "<<Unknown>>";
} else {
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index 2a73c27405a8..c5591d702730 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -152,8 +152,8 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, Value *LHS,
/// TODO: When ScalarEvolution gets a SCEVSDivExpr, this can be made
/// unnecessary; in its place, just signed-divide Ops[i] by the scale and
/// check to see if the divide was folded.
-static bool FactorOutConstant(SCEVHandle &S,
- SCEVHandle &Remainder,
+static bool FactorOutConstant(const SCEV* &S,
+ const SCEV* &Remainder,
const APInt &Factor,
ScalarEvolution &SE) {
// Everything is divisible by one.
@@ -168,7 +168,7 @@ static bool FactorOutConstant(SCEVHandle &S,
// the value at this scale. It will be considered for subsequent
// smaller scales.
if (C->isZero() || !CI->isZero()) {
- SCEVHandle Div = SE.getConstant(CI);
+ const SCEV* Div = SE.getConstant(CI);
S = Div;
Remainder =
SE.getAddExpr(Remainder,
@@ -182,8 +182,8 @@ static bool FactorOutConstant(SCEVHandle &S,
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
if (!C->getValue()->getValue().srem(Factor)) {
- const SmallVectorImpl<SCEVHandle> &MOperands = M->getOperands();
- SmallVector<SCEVHandle, 4> NewMulOps(MOperands.begin(), MOperands.end());
+ const SmallVectorImpl<const SCEV*> &MOperands = M->getOperands();
+ SmallVector<const SCEV*, 4> NewMulOps(MOperands.begin(), MOperands.end());
NewMulOps[0] =
SE.getConstant(C->getValue()->getValue().sdiv(Factor));
S = SE.getMulExpr(NewMulOps);
@@ -192,13 +192,13 @@ static bool FactorOutConstant(SCEVHandle &S,
// In an AddRec, check if both start and step are divisible.
if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(S)) {
- SCEVHandle Step = A->getStepRecurrence(SE);
- SCEVHandle StepRem = SE.getIntegerSCEV(0, Step->getType());
+ const SCEV* Step = A->getStepRecurrence(SE);
+ const SCEV* StepRem = SE.getIntegerSCEV(0, Step->getType());
if (!FactorOutConstant(Step, StepRem, Factor, SE))
return false;
if (!StepRem->isZero())
return false;
- SCEVHandle Start = A->getStart();
+ const SCEV* Start = A->getStart();
if (!FactorOutConstant(Start, Remainder, Factor, SE))
return false;
S = SE.getAddRecExpr(Start, Step, A->getLoop());
@@ -233,14 +233,14 @@ static bool FactorOutConstant(SCEVHandle &S,
/// loop-invariant portions of expressions, after considering what
/// can be folded using target addressing modes.
///
-Value *SCEVExpander::expandAddToGEP(const SCEVHandle *op_begin,
- const SCEVHandle *op_end,
+Value *SCEVExpander::expandAddToGEP(const SCEV* const *op_begin,
+ const SCEV* const *op_end,
const PointerType *PTy,
const Type *Ty,
Value *V) {
const Type *ElTy = PTy->getElementType();
SmallVector<Value *, 4> GepIndices;
- SmallVector<SCEVHandle, 8> Ops(op_begin, op_end);
+ SmallVector<const SCEV*, 8> Ops(op_begin, op_end);
bool AnyNonZeroIndices = false;
// Descend down the pointer's type and attempt to convert the other
@@ -251,14 +251,14 @@ Value *SCEVExpander::expandAddToGEP(const SCEVHandle *op_begin,
for (;;) {
APInt ElSize = APInt(SE.getTypeSizeInBits(Ty),
ElTy->isSized() ? SE.TD->getTypeAllocSize(ElTy) : 0);
- SmallVector<SCEVHandle, 8> NewOps;
- SmallVector<SCEVHandle, 8> ScaledOps;
+ SmallVector<const SCEV*, 8> NewOps;
+ SmallVector<const SCEV*, 8> ScaledOps;
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
// Split AddRecs up into parts as either of the parts may be usable
// without the other.
if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Ops[i]))
if (!A->getStart()->isZero()) {
- SCEVHandle Start = A->getStart();
+ const SCEV* Start = A->getStart();
Ops.push_back(SE.getAddRecExpr(SE.getIntegerSCEV(0, A->getType()),
A->getStepRecurrence(SE),
A->getLoop()));
@@ -267,8 +267,8 @@ Value *SCEVExpander::expandAddToGEP(const SCEVHandle *op_begin,
}
// If the scale size is not 0, attempt to factor out a scale.
if (ElSize != 0) {
- SCEVHandle Op = Ops[i];
- SCEVHandle Remainder = SE.getIntegerSCEV(0, Op->getType());
+ const SCEV* Op = Ops[i];
+ const SCEV* Remainder = SE.getIntegerSCEV(0, Op->getType());
if (FactorOutConstant(Op, Remainder, ElSize, SE)) {
ScaledOps.push_back(Op); // Op now has ElSize factored out.
NewOps.push_back(Remainder);
@@ -364,7 +364,7 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
// comments on expandAddToGEP for details.
if (SE.TD)
if (const PointerType *PTy = dyn_cast<PointerType>(V->getType())) {
- const SmallVectorImpl<SCEVHandle> &Ops = S->getOperands();
+ const SmallVectorImpl<const SCEV*> &Ops = S->getOperands();
return expandAddToGEP(&Ops[0], &Ops[Ops.size() - 1],
PTy, Ty, V);
}
@@ -420,7 +420,7 @@ Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) {
/// Move parts of Base into Rest to leave Base with the minimal
/// expression that provides a pointer operand suitable for a
/// GEP expansion.
-static void ExposePointerBase(SCEVHandle &Base, SCEVHandle &Rest,
+static void ExposePointerBase(const SCEV* &Base, const SCEV* &Rest,
ScalarEvolution &SE) {
while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Base)) {
Base = A->getStart();
@@ -431,7 +431,7 @@ static void ExposePointerBase(SCEVHandle &Base, SCEVHandle &Rest,
}
if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(Base)) {
Base = A->getOperand(A->getNumOperands()-1);
- SmallVector<SCEVHandle, 8> NewAddOps(A->op_begin(), A->op_end());
+ SmallVector<const SCEV*, 8> NewAddOps(A->op_begin(), A->op_end());
NewAddOps.back() = Rest;
Rest = SE.getAddExpr(NewAddOps);
ExposePointerBase(Base, Rest, SE);
@@ -455,9 +455,9 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
if (CanonicalIV &&
SE.getTypeSizeInBits(CanonicalIV->getType()) >
SE.getTypeSizeInBits(Ty)) {
- SCEVHandle Start = SE.getAnyExtendExpr(S->getStart(),
+ const SCEV* Start = SE.getAnyExtendExpr(S->getStart(),
CanonicalIV->getType());
- SCEVHandle Step = SE.getAnyExtendExpr(S->getStepRecurrence(SE),
+ const SCEV* Step = SE.getAnyExtendExpr(S->getStepRecurrence(SE),
CanonicalIV->getType());
Value *V = expand(SE.getAddRecExpr(Start, Step, S->getLoop()));
BasicBlock::iterator SaveInsertPt = getInsertionPoint();
@@ -472,16 +472,16 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
// {X,+,F} --> X + {0,+,F}
if (!S->getStart()->isZero()) {
- const SmallVectorImpl<SCEVHandle> &SOperands = S->getOperands();
- SmallVector<SCEVHandle, 4> NewOps(SOperands.begin(), SOperands.end());
+ const SmallVectorImpl<const SCEV*> &SOperands = S->getOperands();
+ SmallVector<const SCEV*, 4> NewOps(SOperands.begin(), SOperands.end());
NewOps[0] = SE.getIntegerSCEV(0, Ty);
- SCEVHandle Rest = SE.getAddRecExpr(NewOps, L);
+ const SCEV* Rest = SE.getAddRecExpr(NewOps, L);
// Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the
// comments on expandAddToGEP for details.
if (SE.TD) {
- SCEVHandle Base = S->getStart();
- SCEVHandle RestArray[1] = { Rest };
+ const SCEV* Base = S->getStart();
+ const SCEV* RestArray[1] = { Rest };
// Dig into the expression to find the pointer base for a GEP.
ExposePointerBase(Base, RestArray[0], SE);
// If we found a pointer, expand the AddRec with a GEP.
@@ -581,20 +581,20 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
// folders, then expandCodeFor the closed form. This allows the folders to
// simplify the expression without having to build a bunch of special code
// into this folder.
- SCEVHandle IH = SE.getUnknown(I); // Get I as a "symbolic" SCEV.
+ const SCEV* IH = SE.getUnknown(I); // Get I as a "symbolic" SCEV.
// Promote S up to the canonical IV type, if the cast is foldable.
- SCEVHandle NewS = S;
- SCEVHandle Ext = SE.getNoopOrAnyExtend(S, I->getType());
+ const SCEV* NewS = S;
+ const SCEV* Ext = SE.getNoopOrAnyExtend(S, I->getType());
if (isa<SCEVAddRecExpr>(Ext))
NewS = Ext;
- SCEVHandle V = cast<SCEVAddRecExpr>(NewS)->evaluateAtIteration(IH, SE);
+ const SCEV* V = cast<SCEVAddRecExpr>(NewS)->evaluateAtIteration(IH, SE);
//cerr << "Evaluated: " << *this << "\n to: " << *V << "\n";
// Truncate the result down to the original type, if needed.
- SCEVHandle T = SE.getTruncateOrNoop(V, Ty);
- return expand(V);
+ const SCEV* T = SE.getTruncateOrNoop(V, Ty);
+ return expand(T);
}
Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) {
@@ -654,7 +654,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
return LHS;
}
-Value *SCEVExpander::expandCodeFor(SCEVHandle SH, const Type *Ty) {
+Value *SCEVExpander::expandCodeFor(const SCEV* SH, const Type *Ty) {
// Expand the code for this SCEV.
Value *V = expand(SH);
if (Ty) {
@@ -667,7 +667,7 @@ Value *SCEVExpander::expandCodeFor(SCEVHandle SH, const Type *Ty) {
Value *SCEVExpander::expand(const SCEV *S) {
// Check to see if we already expanded this.
- std::map<SCEVHandle, AssertingVH<Value> >::iterator I =
+ std::map<const SCEV*, AssertingVH<Value> >::iterator I =
InsertedExpressions.find(S);
if (I != InsertedExpressions.end())
return I->second;
@@ -685,7 +685,7 @@ Value *
SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L,
const Type *Ty) {
assert(Ty->isInteger() && "Can only insert integer induction variables!");
- SCEVHandle H = SE.getAddRecExpr(SE.getIntegerSCEV(0, Ty),
+ const SCEV* H = SE.getAddRecExpr(SE.getIntegerSCEV(0, Ty),
SE.getIntegerSCEV(1, Ty), L);
return expand(H);
}
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 17ffa2d2de6b..7509e91bdc85 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -624,8 +624,12 @@ bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask,
/// 'Op' must have a scalar integer type.
///
unsigned llvm::ComputeNumSignBits(Value *V, TargetData *TD, unsigned Depth) {
+ assert((TD || V->getType()->isIntOrIntVector()) &&
+ "ComputeNumSignBits requires a TargetData object to operate "
+ "on non-integer values!");
const Type *Ty = V->getType();
- unsigned TyBits = Ty->getScalarSizeInBits();
+ unsigned TyBits = TD ? TD->getTypeSizeInBits(V->getType()->getScalarType()) :
+ Ty->getScalarSizeInBits();
unsigned Tmp, Tmp2;
unsigned FirstAnswer = 1;
diff --git a/lib/CodeGen/ELF.h b/lib/CodeGen/ELF.h
index 796bc2c3b2c0..28b6be8910e5 100644
--- a/lib/CodeGen/ELF.h
+++ b/lib/CodeGen/ELF.h
@@ -128,7 +128,13 @@ namespace llvm {
/// added to logical symbol table for the module. This is eventually
/// turned into a real symbol table in the file.
struct ELFSym {
- const GlobalValue *GV; // The global value this corresponds to.
+ // The global value this corresponds to. Global symbols can be one of
+ // three kinds: if the symbol has a zero initializer, it is either common
+ // or belongs in the .bss section; otherwise it is a constant.
+ const GlobalValue *GV;
+ bool IsCommon;
+ bool IsBss;
+ bool IsConstant;
// ELF specific fields
unsigned NameIdx; // Index in .strtab of name, once emitted.
@@ -159,8 +165,9 @@ namespace llvm {
STV_PROTECTED = 3 // Visible in other components but not preemptable
};
- ELFSym(const GlobalValue *gv) : GV(gv), NameIdx(0), Value(0),
- Size(0), Info(0), Other(0),
+ ELFSym(const GlobalValue *gv) : GV(gv), IsCommon(false), IsBss(false),
+ IsConstant(false), NameIdx(0), Value(0),
+ Size(0), Info(0), Other(STV_DEFAULT),
SectionIdx(ELFSection::SHN_UNDEF) {
if (!GV)
return;
@@ -180,16 +187,47 @@ namespace llvm {
}
}
- void SetBind(unsigned X) {
+ unsigned getBind() {
+ return (Info >> 4) & 0xf;
+ }
+
+ void setBind(unsigned X) {
assert(X == (X & 0xF) && "Bind value out of range!");
Info = (Info & 0x0F) | (X << 4);
}
- void SetType(unsigned X) {
+ void setType(unsigned X) {
assert(X == (X & 0xF) && "Type value out of range!");
Info = (Info & 0xF0) | X;
}
};
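The getBind/setBind/setType accessors above maintain the standard ELF st_info layout: binding in the high nibble, type in the low nibble. A small assumed self-check (STB_GLOBAL = 1 and STT_FUNC = 2 are the usual ELF values):

#include <assert.h>

static unsigned char makeInfo(unsigned Bind, unsigned Type) {
  return (unsigned char)(((Bind & 0xf) << 4) | (Type & 0xf));
}

int main() {
  unsigned char Info = makeInfo(1 /*STB_GLOBAL*/, 2 /*STT_FUNC*/);
  assert(((Info >> 4) & 0xf) == 1); // what getBind() reads back
  assert((Info & 0xf) == 2);        // the type field written by setType()
  return 0;
}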
+ /// ELFRelocation - This class contains all the information necessary to
+ /// generate any 32-bit or 64-bit ELF relocation entry.
+ class ELFRelocation {
+ uint64_t r_offset; // offset in the section of the object this applies to
+ uint32_t r_symidx; // symbol table index of the symbol to use
+ uint32_t r_type; // machine specific relocation type
+ int64_t r_add; // explicit relocation addend
+ bool r_rela; // if true then the addend is part of the entry
+ // otherwise the addend is at the location specified
+ // by r_offset
+ public:
+ uint64_t getInfo(bool is64Bit) const {
+ if (is64Bit)
+ return ((uint64_t)r_symidx << 32) + ((uint64_t)r_type & 0xFFFFFFFFL);
+ else
+ return (r_symidx << 8) + (r_type & 0xFFL);
+ }
+
+ uint64_t getOffset() const { return r_offset; }
+ int64_t getAddend() const { return r_add; }
+
+ ELFRelocation(uint64_t off, uint32_t sym, uint32_t type,
+ bool rela = true, int64_t addend = 0) :
+ r_offset(off), r_symidx(sym), r_type(type),
+ r_add(addend), r_rela(rela) {}
+ };
+
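getInfo() packs the symbol index and relocation type into the r_info word the way the ELF spec lays it out: upper 32 bits / lower 32 bits for ELF64, bits 31..8 / 7..0 for ELF32. A standalone sketch with assumed values:

#include <stdint.h>
#include <assert.h>

static uint64_t relocInfo(uint32_t SymIdx, uint32_t Type, bool Is64Bit) {
  if (Is64Bit)
    return ((uint64_t)SymIdx << 32) + ((uint64_t)Type & 0xFFFFFFFFULL);
  return (SymIdx << 8) + (Type & 0xFFU);
}

int main() {
  assert(relocInfo(5, 2, true)  == 0x0000000500000002ULL);
  assert(relocInfo(5, 2, false) == 0x502);
  return 0;
}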
} // end namespace llvm
#endif
diff --git a/lib/CodeGen/ELFCodeEmitter.cpp b/lib/CodeGen/ELFCodeEmitter.cpp
index ca683969e411..8cb7c94c5d0f 100644
--- a/lib/CodeGen/ELFCodeEmitter.cpp
+++ b/lib/CodeGen/ELFCodeEmitter.cpp
@@ -71,39 +71,38 @@ bool ELFCodeEmitter::finishFunction(MachineFunction &MF) {
// Update Section Size
ES->Size = CurBufferPtr - BufferBegin;
+ // Set the symbol type as a function
+ FnSym.setType(ELFSym::STT_FUNC);
+ FnSym.SectionIdx = ES->SectionIdx;
+ FnSym.Size = CurBufferPtr-FnStartPtr;
+
+ // Offset from start of Section
+ FnSym.Value = FnStartPtr-BufferBegin;
+
// Figure out the binding (linkage) of the symbol.
switch (MF.getFunction()->getLinkage()) {
default:
// appending linkage is illegal for functions.
assert(0 && "Unknown linkage type!");
case GlobalValue::ExternalLinkage:
- FnSym.SetBind(ELFSym::STB_GLOBAL);
+ FnSym.setBind(ELFSym::STB_GLOBAL);
+ EW.SymbolList.push_back(FnSym);
break;
case GlobalValue::LinkOnceAnyLinkage:
case GlobalValue::LinkOnceODRLinkage:
case GlobalValue::WeakAnyLinkage:
case GlobalValue::WeakODRLinkage:
- FnSym.SetBind(ELFSym::STB_WEAK);
+ FnSym.setBind(ELFSym::STB_WEAK);
+ EW.SymbolList.push_back(FnSym);
break;
case GlobalValue::PrivateLinkage:
assert (0 && "PrivateLinkage should not be in the symbol table.");
case GlobalValue::InternalLinkage:
- FnSym.SetBind(ELFSym::STB_LOCAL);
+ FnSym.setBind(ELFSym::STB_LOCAL);
+ EW.SymbolList.push_front(FnSym);
break;
}
- // Set the symbol type as a function
- FnSym.SetType(ELFSym::STT_FUNC);
-
- FnSym.SectionIdx = ES->SectionIdx;
- FnSym.Size = CurBufferPtr-FnStartPtr;
-
- // Offset from start of Section
- FnSym.Value = FnStartPtr-BufferBegin;
-
- // Finally, add it to the symtab.
- EW.SymbolList.push_back(FnSym);
-
// Relocations
// -----------
// If we have emitted any relocations to function-specific objects such as
@@ -113,7 +112,6 @@ bool ELFCodeEmitter::finishFunction(MachineFunction &MF) {
for (unsigned i = 0, e = Relocations.size(); i != e; ++i) {
MachineRelocation &MR = Relocations[i];
intptr_t Addr;
-
if (MR.isBasicBlock()) {
Addr = getMachineBasicBlockAddress(MR.getBasicBlock());
MR.setConstantVal(ES->SectionIdx);
diff --git a/lib/CodeGen/ELFWriter.cpp b/lib/CodeGen/ELFWriter.cpp
index aeccefbd3e80..03db65699a2a 100644
--- a/lib/CodeGen/ELFWriter.cpp
+++ b/lib/CodeGen/ELFWriter.cpp
@@ -136,104 +136,40 @@ bool ELFWriter::doInitialization(Module &M) {
ElfHdr.emitWord16(0); // Placeholder
// Add the null section, which is required to be first in the file.
- getSection("", ELFSection::SHT_NULL, 0);
-
- // Start up the symbol table. The first entry in the symtab is the null
- // entry.
- SymbolList.push_back(ELFSym(0));
+ getNullSection();
return false;
}
-void ELFWriter::EmitGlobal(GlobalVariable *GV) {
+unsigned ELFWriter::getGlobalELFLinkage(const GlobalVariable *GV) {
+ if (GV->hasInternalLinkage())
+ return ELFSym::STB_LOCAL;
+
+ if (GV->hasWeakLinkage())
+ return ELFSym::STB_WEAK;
- // XXX: put local symbols *before* global ones!
+ return ELFSym::STB_GLOBAL;
+}
+
+// For global symbols without a section, return the Null section as a
+// placeholder
+ELFSection &ELFWriter::getGlobalSymELFSection(const GlobalVariable *GV,
+ ELFSym &Sym) {
const Section *S = TAI->SectionForGlobal(GV);
+ unsigned Flags = S->getFlags();
+ unsigned SectionType = ELFSection::SHT_PROGBITS;
+ unsigned SHdrFlags = ELFSection::SHF_ALLOC;
DOUT << "Section " << S->getName() << " for global " << GV->getName() << "\n";
- // If this is an external global, emit it now. TODO: Note that it would be
- // better to ignore the symbol here and only add it to the symbol table if
- // referenced.
+ // If this is an external global, the symbol does not have a section.
if (!GV->hasInitializer()) {
- ELFSym ExternalSym(GV);
- ExternalSym.SetBind(ELFSym::STB_GLOBAL);
- ExternalSym.SetType(ELFSym::STT_NOTYPE);
- ExternalSym.SectionIdx = ELFSection::SHN_UNDEF;
- SymbolList.push_back(ExternalSym);
- return;
+ Sym.SectionIdx = ELFSection::SHN_UNDEF;
+ return getNullSection();
}
const TargetData *TD = TM.getTargetData();
unsigned Align = TD->getPreferredAlignment(GV);
Constant *CV = GV->getInitializer();
- unsigned Size = TD->getTypeAllocSize(CV->getType());
-
- // If this global has a zero initializer, go to .bss or common section.
- if (CV->isNullValue() || isa<UndefValue>(CV)) {
- // If this global is part of the common block, add it now. Variables are
- // part of the common block if they are zero initialized and allowed to be
- // merged with other symbols.
- if (GV->hasLinkOnceLinkage() || GV->hasWeakLinkage() ||
- GV->hasCommonLinkage()) {
- ELFSym CommonSym(GV);
- // Value for common symbols is the alignment required.
- CommonSym.Value = Align;
- CommonSym.Size = Size;
- CommonSym.SetBind(ELFSym::STB_GLOBAL);
- CommonSym.SetType(ELFSym::STT_OBJECT);
- CommonSym.SectionIdx = ELFSection::SHN_COMMON;
- SymbolList.push_back(CommonSym);
- getSection(S->getName(), ELFSection::SHT_NOBITS,
- ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC, 1);
- return;
- }
-
- // Otherwise, this symbol is part of the .bss section. Emit it now.
- // Handle alignment. Ensure section is aligned at least as much as required
- // by this symbol.
- ELFSection &BSSSection = getBSSSection();
- BSSSection.Align = std::max(BSSSection.Align, Align);
-
- // Within the section, emit enough virtual padding to get us to an alignment
- // boundary.
- if (Align)
- BSSSection.Size = (BSSSection.Size + Align - 1) & ~(Align-1);
-
- ELFSym BSSSym(GV);
- BSSSym.Value = BSSSection.Size;
- BSSSym.Size = Size;
- BSSSym.SetType(ELFSym::STT_OBJECT);
-
- switch (GV->getLinkage()) {
- default: // weak/linkonce/common handled above
- assert(0 && "Unexpected linkage type!");
- case GlobalValue::AppendingLinkage: // FIXME: This should be improved!
- case GlobalValue::ExternalLinkage:
- BSSSym.SetBind(ELFSym::STB_GLOBAL);
- break;
- case GlobalValue::InternalLinkage:
- BSSSym.SetBind(ELFSym::STB_LOCAL);
- break;
- }
-
- // Set the idx of the .bss section
- BSSSym.SectionIdx = BSSSection.SectionIdx;
- if (!GV->hasPrivateLinkage())
- SymbolList.push_back(BSSSym);
-
- // Reserve space in the .bss section for this symbol.
- BSSSection.Size += Size;
- return;
- }
-
- /// Emit the Global symbol to the right ELF section
- ELFSym GblSym(GV);
- GblSym.Size = Size;
- GblSym.SetType(ELFSym::STT_OBJECT);
- GblSym.SetBind(ELFSym::STB_GLOBAL);
- unsigned Flags = S->getFlags();
- unsigned SectType = ELFSection::SHT_PROGBITS;
- unsigned SHdrFlags = ELFSection::SHF_ALLOC;
if (Flags & SectionFlags::Code)
SHdrFlags |= ELFSection::SHF_EXECINSTR;
@@ -246,29 +182,81 @@ void ELFWriter::EmitGlobal(GlobalVariable *GV) {
if (Flags & SectionFlags::Strings)
SHdrFlags |= ELFSection::SHF_STRINGS;
- // Remove tab from section name prefix
- std::string SectionName(S->getName());
- size_t Pos = SectionName.find("\t");
- if (Pos != std::string::npos)
- SectionName.erase(Pos, 1);
-
- // The section alignment should be bound to the element with
- // the largest alignment
- ELFSection &ElfS = getSection(SectionName, SectType, SHdrFlags);
- GblSym.SectionIdx = ElfS.SectionIdx;
- if (Align > ElfS.Align)
- ElfS.Align = Align;
-
- // S.Value should contain the symbol index inside the section,
- // and all symbols should start on their required alignment boundary
- GblSym.Value = (ElfS.size() + (Align-1)) & (-Align);
- ElfS.emitAlignment(Align);
-
- // Emit the constant symbol to its section
- EmitGlobalConstant(CV, ElfS);
+ // If this global has a zero initializer, go to .bss or common section.
+ // Variables are part of the common block if they are zero initialized
+ // and allowed to be merged with other symbols.
+ if (CV->isNullValue() || isa<UndefValue>(CV)) {
+ SectionType = ELFSection::SHT_NOBITS;
+ ELFSection &ElfS = getSection(S->getName(), SectionType, SHdrFlags);
+ if (GV->hasLinkOnceLinkage() || GV->hasWeakLinkage() ||
+ GV->hasCommonLinkage()) {
+ Sym.SectionIdx = ELFSection::SHN_COMMON;
+ Sym.IsCommon = true;
+ ElfS.Align = 1;
+ return ElfS;
+ }
+ Sym.IsBss = true;
+ Sym.SectionIdx = ElfS.SectionIdx;
+ if (Align) ElfS.Size = (ElfS.Size + Align-1) & ~(Align-1);
+ ElfS.Align = std::max(ElfS.Align, Align);
+ return ElfS;
+ }
+
+ Sym.IsConstant = true;
+ ELFSection &ElfS = getSection(S->getName(), SectionType, SHdrFlags);
+ Sym.SectionIdx = ElfS.SectionIdx;
+ ElfS.Align = std::max(ElfS.Align, Align);
+ return ElfS;
+}
+
+void ELFWriter::EmitFunctionDeclaration(const Function *F) {
+ ELFSym GblSym(F);
+ GblSym.setBind(ELFSym::STB_GLOBAL);
+ GblSym.setType(ELFSym::STT_NOTYPE);
+ GblSym.SectionIdx = ELFSection::SHN_UNDEF;
SymbolList.push_back(GblSym);
}
+void ELFWriter::EmitGlobalVar(const GlobalVariable *GV) {
+ unsigned SymBind = getGlobalELFLinkage(GV);
+  unsigned Align = 0, Size = 0;
+ ELFSym GblSym(GV);
+ GblSym.setBind(SymBind);
+
+ if (GV->hasInitializer()) {
+ GblSym.setType(ELFSym::STT_OBJECT);
+ const TargetData *TD = TM.getTargetData();
+ Align = TD->getPreferredAlignment(GV);
+ Size = TD->getTypeAllocSize(GV->getInitializer()->getType());
+ GblSym.Size = Size;
+ } else {
+ GblSym.setType(ELFSym::STT_NOTYPE);
+ }
+
+ ELFSection &GblSection = getGlobalSymELFSection(GV, GblSym);
+
+ if (GblSym.IsCommon) {
+ GblSym.Value = Align;
+ } else if (GblSym.IsBss) {
+ GblSym.Value = GblSection.Size;
+ GblSection.Size += Size;
+  } else if (GblSym.IsConstant) {
+    // GblSym.Value should contain the symbol's offset inside the section,
+    // and all symbols should start on their required alignment boundary.
+ GblSym.Value = (GblSection.size() + (Align-1)) & (-Align);
+ GblSection.emitAlignment(Align);
+ EmitGlobalConstant(GV->getInitializer(), GblSection);
+ }
+
+  // Local symbols should come first in the symbol table.
+ if (!GV->hasPrivateLinkage()) {
+ if (SymBind == ELFSym::STB_LOCAL)
+ SymbolList.push_front(GblSym);
+ else
+ SymbolList.push_back(GblSym);
+ }
+}
+
void ELFWriter::EmitGlobalConstantStruct(const ConstantStruct *CVS,
ELFSection &GblS) {
@@ -306,6 +294,7 @@ void ELFWriter::EmitGlobalConstant(const Constant *CV, ELFSection &GblS) {
if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV)) {
if (CVA->isString()) {
std::string GblStr = CVA->getAsString();
+ GblStr.resize(GblStr.size()-1);
GblS.emitString(GblStr);
} else { // Not a string. Print the values in successive locations
for (unsigned i = 0, e = CVA->getNumOperands(); i != e; ++i)
@@ -370,13 +359,39 @@ bool ELFWriter::doFinalization(Module &M) {
// Build and emit data, bss and "common" sections.
for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
- EmitGlobal(I);
+ I != E; ++I) {
+ EmitGlobalVar(I);
+ GblSymLookup[I] = 0;
+ }
+
+ // Emit all pending globals
+ // TODO: this should be done only for referenced symbols
+ for (SetVector<GlobalValue*>::const_iterator I = PendingGlobals.begin(),
+ E = PendingGlobals.end(); I != E; ++I) {
+
+ // No need to emit the symbol again
+ if (GblSymLookup.find(*I) != GblSymLookup.end())
+ continue;
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(*I)) {
+ EmitGlobalVar(GV);
+ } else if (Function *F = dyn_cast<Function>(*I)) {
+      // If the function is not in GblSymLookup, it doesn't have a body,
+      // so emit the symbol as a function declaration (no associated section).
+ EmitFunctionDeclaration(F);
+ } else {
+ assert("unknown howto handle pending global");
+ }
+ GblSymLookup[*I] = 0;
+ }
// Emit non-executable stack note
if (TAI->getNonexecutableStackDirective())
getNonExecStackSection();
+ // Emit string table
+ EmitStringTable();
+
// Emit the symbol table now, if non-empty.
EmitSymbolTable();
@@ -400,6 +415,67 @@ bool ELFWriter::doFinalization(Module &M) {
/// EmitRelocations - Emit relocations
void ELFWriter::EmitRelocations() {
+
+ // Create Relocation sections for each section which needs it.
+ for (std::list<ELFSection>::iterator I = SectionList.begin(),
+ E = SectionList.end(); I != E; ++I) {
+
+ // This section does not have relocations
+ if (!I->hasRelocations()) continue;
+
+ // Get the relocation section for section 'I'
+ bool HasRelA = TEW->hasRelocationAddend();
+ ELFSection &RelSec = getRelocSection(I->getName(), HasRelA);
+
+ // 'Link' - Section hdr idx of the associated symbol table
+ // 'Info' - Section hdr idx of the section to which the relocation applies
+ ELFSection &SymTab = getSymbolTableSection();
+ RelSec.Link = SymTab.SectionIdx;
+ RelSec.Info = I->SectionIdx;
+ RelSec.EntSize = TEW->getRelocationEntrySize();
+
+ // Get the relocations from Section
+ std::vector<MachineRelocation> Relos = I->getRelocations();
+ for (std::vector<MachineRelocation>::iterator MRI = Relos.begin(),
+ MRE = Relos.end(); MRI != MRE; ++MRI) {
+ MachineRelocation &MR = *MRI;
+
+      // Offset from the start of the section where the relocation applies
+ unsigned Offset = MR.getMachineCodeOffset();
+
+ // Symbol index in the symbol table
+ unsigned SymIdx = 0;
+
+ // Target specific ELF relocation type
+ unsigned RelType = TEW->getRelocationType(MR.getRelocationType());
+
+ // Constant addend used to compute the value to be stored
+ // into the relocatable field
+ int64_t Addend = TEW->getAddendForRelTy(RelType);
+
+      // There are several machine relocation types, and each one of
+      // them needs a different approach to retrieve the symbol table index.
+ if (MR.isGlobalValue()) {
+ const GlobalValue *G = MR.getGlobalValue();
+ SymIdx = GblSymLookup[G];
+ } else {
+ assert(0 && "dunno how to handle other relocation types");
+ }
+
+ // Get the relocation entry and emit to the relocation section
+ ELFRelocation Rel(Offset, SymIdx, RelType, HasRelA, Addend);
+ EmitRelocation(RelSec, Rel, HasRelA);
+ }
+ }
+}
+
+/// EmitRelocation - Write relocation 'Rel' to the relocation section 'RelSec'.
+void ELFWriter::EmitRelocation(BinaryObject &RelSec, ELFRelocation &Rel,
+ bool HasRelA) {
+ RelSec.emitWord(Rel.getOffset());
+ RelSec.emitWord(Rel.getInfo(is64Bit));
+ if (HasRelA)
+ RelSec.emitWord(Rel.getAddend());
}
/// EmitSymbol - Write symbol 'Sym' to the symbol table 'SymbolTable'
@@ -448,28 +524,28 @@ void ELFWriter::EmitSectionHeader(BinaryObject &SHdrTab,
}
}
-/// EmitSymbolTable - If the current symbol table is non-empty, emit the string
-/// table for it and then the symbol table itself.
-void ELFWriter::EmitSymbolTable() {
- if (SymbolList.size() == 1) return; // Only the null entry.
-
- // FIXME: compact all local symbols to the start of the symtab.
- unsigned FirstNonLocalSymbol = 1;
-
+/// EmitStringTable - If the current symbol table is non-empty, emit the string
+/// table for it.
+void ELFWriter::EmitStringTable() {
+ if (!SymbolList.size()) return; // Empty symbol table.
ELFSection &StrTab = getStringTableSection();
// Set the zero'th symbol to a null byte, as required.
StrTab.emitByte(0);
+  // Walk the symbol list and write the symbol names into the
+  // string table.
unsigned Index = 1;
- for (unsigned i = 1, e = SymbolList.size(); i != e; ++i) {
+ for (std::list<ELFSym>::iterator I = SymbolList.begin(),
+ E = SymbolList.end(); I != E; ++I) {
+
// Use the name mangler to uniquify the LLVM symbol.
- std::string Name = Mang->getValueName(SymbolList[i].GV);
+ std::string Name = Mang->getValueName(I->GV);
if (Name.empty()) {
- SymbolList[i].NameIdx = 0;
+ I->NameIdx = 0;
} else {
- SymbolList[i].NameIdx = Index;
+ I->NameIdx = Index;
StrTab.emitString(Name);
// Keep track of the number of bytes emitted to this section.
@@ -478,20 +554,45 @@ void ELFWriter::EmitSymbolTable() {
}
assert(Index == StrTab.size());
StrTab.Size = Index;
+}
+
+/// EmitSymbolTable - Emit the symbol table itself.
+void ELFWriter::EmitSymbolTable() {
+ if (!SymbolList.size()) return; // Empty symbol table.
+ unsigned FirstNonLocalSymbol = 1;
// Now that we have emitted the string table and know the offset into the
// string table of each symbol, emit the symbol table itself.
ELFSection &SymTab = getSymbolTableSection();
- SymTab.Align = TEW->getSymTabAlignment();
- SymTab.Link = StrTab.SectionIdx; // Section Index of .strtab.
- SymTab.Info = FirstNonLocalSymbol; // First non-STB_LOCAL symbol.
+ SymTab.Align = TEW->getPrefELFAlignment();
+
+ // Section Index of .strtab.
+ SymTab.Link = getStringTableSection().SectionIdx;
// Size of each symtab entry.
SymTab.EntSize = TEW->getSymTabEntrySize();
- for (unsigned i = 0, e = SymbolList.size(); i != e; ++i)
- EmitSymbol(SymTab, SymbolList[i]);
+ // The first entry in the symtab is the null symbol
+ ELFSym NullSym = ELFSym(0);
+ EmitSymbol(SymTab, NullSym);
+
+  // Emit all the symbols to the symbol table. Skip the null
+  // symbol, because it was already emitted above.
+ unsigned Index = 1;
+ for (std::list<ELFSym>::iterator I = SymbolList.begin(),
+ E = SymbolList.end(); I != E; ++I, ++Index) {
+ // Keep track of the first non-local symbol
+ if (I->getBind() == ELFSym::STB_LOCAL)
+ FirstNonLocalSymbol++;
+
+ // Emit symbol to the symbol table
+ EmitSymbol(SymTab, *I);
+ // Record the symbol table index for each global value
+ GblSymLookup[I->GV] = Index;
+ }
+
+  SymTab.Info = FirstNonLocalSymbol; // Index of the first non-STB_LOCAL symbol.
SymTab.Size = SymTab.size();
}
@@ -500,7 +601,7 @@ void ELFWriter::EmitSymbolTable() {
/// section names.
void ELFWriter::EmitSectionTableStringTable() {
// First step: add the section for the string table to the list of sections:
- ELFSection &SHStrTab = getSection(".shstrtab", ELFSection::SHT_STRTAB, 0);
+ ELFSection &SHStrTab = getSectionHeaderStringTableSection();
// Now that we know which section number is the .shstrtab section, update the
// e_shstrndx entry in the ELF header.
@@ -559,7 +660,7 @@ void ELFWriter::OutputSectionsAndSectionTable() {
}
// Align Section Header.
- unsigned TableAlign = is64Bit ? 8 : 4;
+ unsigned TableAlign = TEW->getPrefELFAlignment();
FileOff = (FileOff+TableAlign-1) & ~(TableAlign-1);
// Now that we know where all of the sections will be emitted, set the e_shnum
@@ -586,13 +687,12 @@ void ELFWriter::OutputSectionsAndSectionTable() {
<< ", SectionData Size: " << S.size() << "\n";
// Align FileOff to whatever the alignment restrictions of the section are.
- if (S.Align) {
- for (size_t NewFileOff = (FileOff+S.Align-1) & ~(S.Align-1);
- FileOff != NewFileOff; ++FileOff)
- O << (char)0xAB;
- }
-
if (S.size()) {
+ if (S.Align) {
+ for (size_t NewFileOff = (FileOff+S.Align-1) & ~(S.Align-1);
+ FileOff != NewFileOff; ++FileOff)
+ O << (char)0xAB;
+ }
O.write((char *)&S.getData()[0], S.Size);
FileOff += S.Size;
}
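The new EmitGlobalVar and the section-output loop above both round offsets up with the usual power-of-two trick, (Off + Align - 1) & ~(Align - 1). A minimal standalone sketch of that idiom, with an illustrative helper name that is not part of this patch:
    #include <cassert>
    #include <cstdint>
    // Round Offset up to the next multiple of Align. Align must be a power
    // of two, which is what makes the ~(Align - 1) mask valid.
    static uint64_t alignTo(uint64_t Offset, uint64_t Align) {
      assert(Align && (Align & (Align - 1)) == 0 && "Align must be a power of 2");
      return (Offset + Align - 1) & ~(Align - 1);
    }
    // alignTo(13, 8) == 16; alignTo(16, 8) == 16.
The (GblSection.size() + (Align-1)) & (-Align) form in EmitGlobalVar computes the same mask, since -Align equals ~(Align - 1) for an unsigned power-of-two Align.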
diff --git a/lib/CodeGen/ELFWriter.h b/lib/CodeGen/ELFWriter.h
index 8a380f034010..39577d9a97dd 100644
--- a/lib/CodeGen/ELFWriter.h
+++ b/lib/CodeGen/ELFWriter.h
@@ -16,7 +16,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/Support/OutputBuffer.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetAsmInfo.h"
#include "llvm/Target/TargetELFWriterInfo.h"
#include "ELF.h"
@@ -89,7 +89,7 @@ namespace llvm {
bool doFinalization(Module &M);
private:
- // Blob containing the Elf header
+    /// Blob containing the ELF header
BinaryObject ElfHdr;
/// SectionList - This is the list of sections that we have emitted to the
@@ -102,14 +102,35 @@ namespace llvm {
/// the SectionList.
std::map<std::string, ELFSection*> SectionLookup;
+    /// GblSymLookup - This is a mapping from global value to a symbol index
+    /// in the symbol table. This is useful since relocation symbol references
+    /// must be quickly mapped to a symbol table index.
+ std::map<const GlobalValue*, uint32_t> GblSymLookup;
+
+    /// SymbolList - This is the list of symbols emitted to the symbol table.
+    /// Local symbols go to the front and globals to the back.
+ std::list<ELFSym> SymbolList;
+
+ /// PendingGlobals - List of externally defined symbols that we have been
+ /// asked to emit, but have not seen a reference to. When a reference
+ /// is seen, the symbol will move from this list to the SymbolList.
+ SetVector<GlobalValue*> PendingGlobals;
+
/// getSection - Return the section with the specified name, creating a new
/// section if one does not already exist.
- ELFSection &getSection(const std::string &Name, unsigned Type,
+ ELFSection &getSection(const std::string &Name, unsigned Type,
unsigned Flags = 0, unsigned Align = 0) {
ELFSection *&SN = SectionLookup[Name];
if (SN) return *SN;
- SectionList.push_back(ELFSection(Name, isLittleEndian, is64Bit));
+      // Remove the tab from the section name prefix. This is necessary because
+      // TAI sometimes returns a section name prefixed with a "\t" char.
+ std::string SectionName(Name);
+ size_t Pos = SectionName.find("\t");
+ if (Pos != std::string::npos)
+ SectionName.erase(Pos, 1);
+
+ SectionList.push_back(ELFSection(SectionName, isLittleEndian, is64Bit));
SN = &SectionList.back();
SN->SectionIdx = NumSections++;
SN->Type = Type;
@@ -119,11 +140,25 @@ namespace llvm {
return *SN;
}
+    /// TODO: support mangled names here to emit the right .text section
+    /// for C++ object files.
ELFSection &getTextSection() {
return getSection(".text", ELFSection::SHT_PROGBITS,
ELFSection::SHF_EXECINSTR | ELFSection::SHF_ALLOC);
}
+    /// Return the relocation section of section 'SName'. 'RelA' is true
+    /// if the relocation section contains entries with addends.
+ ELFSection &getRelocSection(std::string SName, bool RelA) {
+ std::string RelSName(".rel");
+ unsigned SHdrTy = RelA ? ELFSection::SHT_RELA : ELFSection::SHT_REL;
+
+ if (RelA) RelSName.append("a");
+ RelSName.append(SName);
+
+ return getSection(RelSName, SHdrTy, 0, TEW->getPrefELFAlignment());
+ }
+
ELFSection &getNonExecStackSection() {
return getSection(".note.GNU-stack", ELFSection::SHT_PROGBITS, 0, 1);
}
@@ -136,25 +171,23 @@ namespace llvm {
return getSection(".strtab", ELFSection::SHT_STRTAB, 0, 1);
}
+ ELFSection &getSectionHeaderStringTableSection() {
+ return getSection(".shstrtab", ELFSection::SHT_STRTAB, 0, 1);
+ }
+
ELFSection &getDataSection() {
return getSection(".data", ELFSection::SHT_PROGBITS,
- ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC);
+ ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC, 4);
}
ELFSection &getBSSSection() {
return getSection(".bss", ELFSection::SHT_NOBITS,
- ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC);
+ ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC, 4);
}
- /// SymbolList - This is the list of symbols we have emitted to the file.
- /// This actually gets rearranged before emission to the file (to put the
- /// local symbols first in the list).
- std::vector<ELFSym> SymbolList;
-
- /// PendingGlobals - List of externally defined symbols that we have been
- /// asked to emit, but have not seen a reference to. When a reference
- /// is seen, the symbol will move from this list to the SymbolList.
- SetVector<GlobalValue*> PendingGlobals;
+ ELFSection &getNullSection() {
+ return getSection("", ELFSection::SHT_NULL, 0);
+ }
// As we complete the ELF file, we need to update fields in the ELF header
// (e.g. the location of the section table). These members keep track of
@@ -165,15 +198,20 @@ namespace llvm {
unsigned ELFHdr_e_shnum_Offset; // e_shnum in ELF header.
private:
- void EmitGlobal(GlobalVariable *GV);
+ void EmitFunctionDeclaration(const Function *F);
+ void EmitGlobalVar(const GlobalVariable *GV);
void EmitGlobalConstant(const Constant *C, ELFSection &GblS);
void EmitGlobalConstantStruct(const ConstantStruct *CVS,
ELFSection &GblS);
+ unsigned getGlobalELFLinkage(const GlobalVariable *GV);
+ ELFSection &getGlobalSymELFSection(const GlobalVariable *GV, ELFSym &Sym);
void EmitRelocations();
+ void EmitRelocation(BinaryObject &RelSec, ELFRelocation &Rel, bool HasRelA);
void EmitSectionHeader(BinaryObject &SHdrTab, const ELFSection &SHdr);
void EmitSectionTableStringTable();
void EmitSymbol(BinaryObject &SymbolTable, ELFSym &Sym);
void EmitSymbolTable();
+ void EmitStringTable();
void OutputSectionsAndSectionTable();
};
}
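getRelocSection above derives a relocation section's name from the section it describes: ".rel" plus the section name for plain SHT_REL targets, or ".rela" when the target's relocation entries carry explicit addends. The naming rule on its own, as a small sketch with an illustrative function name:
    #include <string>
    // ".text" becomes ".rel.text" or ".rela.text", mirroring getRelocSection().
    static std::string relocSectionName(const std::string &SName, bool HasRelA) {
      std::string Name(HasRelA ? ".rela" : ".rel");
      Name += SName;
      return Name;
    }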
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index a9adce8fdc5d..ce01d53d60e7 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3121,6 +3121,8 @@ bool MeetsMaxMemopRequirement(std::vector<MVT> &MemOps,
VT = (MVT::SimpleValueType)(VT.getSimpleVT() - 1);
VTSize = VT.getSizeInBits() / 8;
} else {
+ // This can result in a type that is not legal on the target, e.g.
+ // 1 or 2 bytes on PPC.
VT = (MVT::SimpleValueType)(VT.getSimpleVT() - 1);
VTSize >>= 1;
}
@@ -3177,12 +3179,29 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
getMemBasePlusOffset(Dst, DstOff, DAG),
DstSV, DstSVOff + DstOff, false, DstAlign);
} else {
- Value = DAG.getLoad(VT, dl, Chain,
- getMemBasePlusOffset(Src, SrcOff, DAG),
- SrcSV, SrcSVOff + SrcOff, false, Align);
- Store = DAG.getStore(Chain, dl, Value,
- getMemBasePlusOffset(Dst, DstOff, DAG),
- DstSV, DstSVOff + DstOff, false, DstAlign);
+ // The type might not be legal for the target. This should only happen
+ // if the type is smaller than a legal type, as on PPC, so the right
+ // thing to do is generate a LoadExt/StoreTrunc pair.
+      // FIXME: does the case above also need this?
+ if (TLI.isTypeLegal(VT)) {
+ Value = DAG.getLoad(VT, dl, Chain,
+ getMemBasePlusOffset(Src, SrcOff, DAG),
+ SrcSV, SrcSVOff + SrcOff, false, Align);
+ Store = DAG.getStore(Chain, dl, Value,
+ getMemBasePlusOffset(Dst, DstOff, DAG),
+ DstSV, DstSVOff + DstOff, false, DstAlign);
+ } else {
+ MVT NVT = VT;
+ while (!TLI.isTypeLegal(NVT)) {
+ NVT = (MVT::SimpleValueType(NVT.getSimpleVT() + 1));
+ }
+ Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
+ getMemBasePlusOffset(Src, SrcOff, DAG),
+ SrcSV, SrcSVOff + SrcOff, VT, false, Align);
+ Store = DAG.getTruncStore(Chain, dl, Value,
+ getMemBasePlusOffset(Dst, DstOff, DAG),
+ DstSV, DstSVOff + DstOff, VT, false, DstAlign);
+ }
}
OutChains.push_back(Store);
SrcOff += VTSize;
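The memcpy lowering above copes with a chunk type the target cannot load or store directly by widening it to the next legal integer type and pairing an extending load with a truncating store. The width-selection step, reduced to plain C++ with an assumed legal set of {32, 64} bits standing in for TLI.isTypeLegal:
    // Smallest assumed-legal width (in bits) that can hold Bits; a stand-in
    // for the while (!TLI.isTypeLegal(NVT)) loop in getMemcpyLoadsAndStores.
    static unsigned nextLegalWidth(unsigned Bits) {
      const unsigned Legal[] = {32, 64};   // assumption; real targets vary
      for (unsigned i = 0; i != 2; ++i)
        if (Legal[i] >= Bits)
          return Legal[i];
      return 64;
    }
    // A 16-bit chunk is then copied with a 32-bit EXTLOAD and a truncating
    // store back down to 16 bits.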
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.cpp b/lib/CodeGen/SimpleRegisterCoalescing.cpp
index 20348055bd96..c2105e617be0 100644
--- a/lib/CodeGen/SimpleRegisterCoalescing.cpp
+++ b/lib/CodeGen/SimpleRegisterCoalescing.cpp
@@ -739,6 +739,9 @@ SimpleRegisterCoalescing::UpdateRegDefsUses(unsigned SrcReg, unsigned DstReg,
// After updating the operand, check if the machine instruction has
// become a copy. If so, update its val# information.
+ if (JoinedCopies.count(UseMI))
+ continue;
+
const TargetInstrDesc &TID = UseMI->getDesc();
unsigned CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx;
if (TID.getNumDefs() == 1 && TID.getNumOperands() > 2 &&
@@ -749,9 +752,10 @@ SimpleRegisterCoalescing::UpdateRegDefsUses(unsigned SrcReg, unsigned DstReg,
allocatableRegs_[CopyDstReg])) {
LiveInterval &LI = li_->getInterval(CopyDstReg);
unsigned DefIdx = li_->getDefIndex(li_->getInstructionIndex(UseMI));
- const LiveRange *DLR = LI.getLiveRangeContaining(DefIdx);
- if (DLR->valno->def == DefIdx)
- DLR->valno->copy = UseMI;
+ if (const LiveRange *DLR = LI.getLiveRangeContaining(DefIdx)) {
+ if (DLR->valno->def == DefIdx)
+ DLR->valno->copy = UseMI;
+ }
}
}
}
diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
index 160f1ba9f6c5..b8525a30ecad 100644
--- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
+++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
@@ -27,6 +27,7 @@
#include "llvm/System/DynamicLibrary.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/System/Mutex.h"
#include <csignal>
#include <cstdio>
#include <map>
@@ -45,6 +46,8 @@
using namespace llvm;
+static ManagedStatic<sys::Mutex> FunctionsLock;
+
typedef GenericValue (*ExFunc)(const FunctionType *,
const std::vector<GenericValue> &);
static ManagedStatic<std::map<const Function *, ExFunc> > ExportedFunctions;
@@ -94,6 +97,7 @@ static ExFunc lookupFunction(const Function *F) {
ExtName += getTypeID(FT->getContainedType(i));
ExtName += "_" + F->getName();
+ sys::ScopedLock Writer(&*FunctionsLock);
ExFunc FnPtr = FuncNames[ExtName];
if (FnPtr == 0)
FnPtr = FuncNames["lle_X_"+F->getName()];
@@ -246,12 +250,16 @@ GenericValue Interpreter::callExternalFunction(Function *F,
const std::vector<GenericValue> &ArgVals) {
TheInterpreter = this;
+ FunctionsLock->acquire();
+
// Do a lookup to see if the function is in our cache... this should just be a
// deferred annotation!
std::map<const Function *, ExFunc>::iterator FI = ExportedFunctions->find(F);
if (ExFunc Fn = (FI == ExportedFunctions->end()) ? lookupFunction(F)
- : FI->second)
+ : FI->second) {
+ FunctionsLock->release();
return Fn(F->getFunctionType(), ArgVals);
+ }
#ifdef USE_LIBFFI
std::map<const Function *, RawFunc>::iterator RF = RawFunctions->find(F);
@@ -264,6 +272,8 @@ GenericValue Interpreter::callExternalFunction(Function *F,
} else {
RawFn = RF->second;
}
+
+ FunctionsLock->release();
GenericValue Result;
if (RawFn != 0 && ffiInvoke(RawFn, F, ArgVals, getTargetData(), Result))
@@ -529,6 +539,7 @@ GenericValue lle_X_fprintf(const FunctionType *FT,
void Interpreter::initializeExternalFunctions() {
+ sys::ScopedLock Writer(&*FunctionsLock);
FuncNames["lle_X_atexit"] = lle_X_atexit;
FuncNames["lle_X_exit"] = lle_X_exit;
FuncNames["lle_X_abort"] = lle_X_abort;
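The interpreter changes above guard the shared FuncNames and ExportedFunctions maps with a mutex that is itself a ManagedStatic, so both the lock and the data are created lazily and destroyed by llvm_shutdown. The same shape in plain C++11, as a rough analogue rather than the LLVM API:
    #include <map>
    #include <mutex>
    #include <string>
    // Lazily constructed shared map plus the lock that guards every access.
    static std::map<std::string, void*> &getFuncMap() {
      static std::map<std::string, void*> Map;   // built on first use
      return Map;
    }
    static std::mutex &getFuncMapLock() {
      static std::mutex Lock;
      return Lock;
    }
    void registerFunc(const std::string &Name, void *Fn) {
      std::lock_guard<std::mutex> Guard(getFuncMapLock());  // held for the write
      getFuncMap()[Name] = Fn;
    }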
diff --git a/lib/Support/Annotation.cpp b/lib/Support/Annotation.cpp
index 9764b5e829dc..9c3efa37d867 100644
--- a/lib/Support/Annotation.cpp
+++ b/lib/Support/Annotation.cpp
@@ -13,6 +13,7 @@
#include "llvm/Support/Annotation.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/System/RWMutex.h"
#include <map>
#include <cstring>
using namespace llvm;
@@ -42,31 +43,33 @@ static unsigned IDCounter = 0; // Unique ID counter
// Static member to ensure initialization on demand.
static ManagedStatic<IDMapType> IDMap;
+static ManagedStatic<sys::SmartRWMutex<true> > AnnotationsLock;
// On demand annotation creation support...
typedef Annotation *(*AnnFactory)(AnnotationID, const Annotable *, void *);
typedef std::map<unsigned, std::pair<AnnFactory,void*> > FactMapType;
-static FactMapType *TheFactMap = 0;
+static ManagedStatic<FactMapType> TheFactMap;
static FactMapType &getFactMap() {
- if (TheFactMap == 0)
- TheFactMap = new FactMapType();
return *TheFactMap;
}
static void eraseFromFactMap(unsigned ID) {
- assert(TheFactMap && "No entries found!");
+ sys::SmartScopedWriter<true> Writer(&*AnnotationsLock);
TheFactMap->erase(ID);
- if (TheFactMap->empty()) { // Delete when empty
- delete TheFactMap;
- TheFactMap = 0;
- }
}
AnnotationID AnnotationManager::getID(const char *Name) { // Name -> ID
+ AnnotationsLock->reader_acquire();
IDMapType::iterator I = IDMap->find(Name);
- if (I == IDMap->end()) {
- (*IDMap)[Name] = IDCounter++; // Add a new element
+ IDMapType::iterator E = IDMap->end();
+ AnnotationsLock->reader_release();
+
+ if (I == E) {
+ sys::SmartScopedWriter<true> Writer(&*AnnotationsLock);
+ I = IDMap->find(Name);
+ if (I == IDMap->end())
+ (*IDMap)[Name] = IDCounter++; // Add a new element
return AnnotationID(IDCounter-1);
}
return AnnotationID(I->second);
@@ -85,6 +88,7 @@ AnnotationID AnnotationManager::getID(const char *Name, Factory Fact,
// only be used for debugging.
//
const char *AnnotationManager::getName(AnnotationID ID) { // ID -> Name
+ sys::SmartScopedReader<true> Reader(&*AnnotationsLock);
IDMapType &TheMap = *IDMap;
for (IDMapType::iterator I = TheMap.begin(); ; ++I) {
assert(I != TheMap.end() && "Annotation ID is unknown!");
@@ -98,10 +102,12 @@ const char *AnnotationManager::getName(AnnotationID ID) { // ID -> Name
//
void AnnotationManager::registerAnnotationFactory(AnnotationID ID, AnnFactory F,
void *ExtraData) {
- if (F)
+ if (F) {
+ sys::SmartScopedWriter<true> Writer(&*AnnotationsLock);
getFactMap()[ID.ID] = std::make_pair(F, ExtraData);
- else
+ } else {
eraseFromFactMap(ID.ID);
+ }
}
// createAnnotation - Create an annotation of the specified ID for the
@@ -109,7 +115,13 @@ void AnnotationManager::registerAnnotationFactory(AnnotationID ID, AnnFactory F,
//
Annotation *AnnotationManager::createAnnotation(AnnotationID ID,
const Annotable *Obj) {
+ AnnotationsLock->reader_acquire();
FactMapType::iterator I = getFactMap().find(ID.ID);
- if (I == getFactMap().end()) return 0;
+ if (I == getFactMap().end()) {
+ AnnotationsLock->reader_release();
+ return 0;
+ }
+
+ AnnotationsLock->reader_release();
return I->second.first(ID, Obj, I->second.second);
}
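AnnotationManager::getID now follows the read-then-recheck pattern: take the reader lock for the common lookup, and only when the name is missing take the writer lock and look again before inserting, since another thread may have registered the same name in between. A compact sketch of that pattern over std::shared_mutex (standard-library stand-ins, not the llvm::sys API):
    #include <map>
    #include <shared_mutex>
    #include <string>
    static std::map<std::string, unsigned> IDs;
    static std::shared_mutex IDsLock;
    static unsigned Counter = 0;
    unsigned getOrCreateID(const std::string &Name) {
      {
        std::shared_lock<std::shared_mutex> Reader(IDsLock);  // concurrent readers
        std::map<std::string, unsigned>::const_iterator I = IDs.find(Name);
        if (I != IDs.end())
          return I->second;
      }
      std::unique_lock<std::shared_mutex> Writer(IDsLock);    // exclusive
      std::map<std::string, unsigned>::const_iterator I = IDs.find(Name);
      if (I != IDs.end())      // re-check: another writer may have beaten us here
        return I->second;
      unsigned ID = Counter++;
      IDs[Name] = ID;
      return ID;
    }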
diff --git a/lib/Support/PluginLoader.cpp b/lib/Support/PluginLoader.cpp
index 5acf1d13ee9c..ef32af4b3f38 100644
--- a/lib/Support/PluginLoader.cpp
+++ b/lib/Support/PluginLoader.cpp
@@ -16,13 +16,16 @@
#include "llvm/Support/PluginLoader.h"
#include "llvm/Support/Streams.h"
#include "llvm/System/DynamicLibrary.h"
+#include "llvm/System/Mutex.h"
#include <ostream>
#include <vector>
using namespace llvm;
static ManagedStatic<std::vector<std::string> > Plugins;
+static ManagedStatic<sys::SmartMutex<true> > PluginsLock;
void PluginLoader::operator=(const std::string &Filename) {
+ sys::SmartScopedLock<true> Lock(&*PluginsLock);
std::string Error;
if (sys::DynamicLibrary::LoadLibraryPermanently(Filename.c_str(), &Error)) {
cerr << "Error opening '" << Filename << "': " << Error
@@ -33,10 +36,12 @@ void PluginLoader::operator=(const std::string &Filename) {
}
unsigned PluginLoader::getNumPlugins() {
+ sys::SmartScopedLock<true> Lock(&*PluginsLock);
return Plugins.isConstructed() ? Plugins->size() : 0;
}
std::string &PluginLoader::getPlugin(unsigned num) {
+ sys::SmartScopedLock<true> Lock(&*PluginsLock);
assert(Plugins.isConstructed() && num < Plugins->size() &&
"Asking for an out of bounds plugin");
return (*Plugins)[num];
diff --git a/lib/Support/Statistic.cpp b/lib/Support/Statistic.cpp
index 13acc1b0fa1e..6c652f8d3f16 100644
--- a/lib/Support/Statistic.cpp
+++ b/lib/Support/Statistic.cpp
@@ -25,6 +25,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Streams.h"
+#include "llvm/System/Mutex.h"
#include "llvm/ADT/StringExtras.h"
#include <algorithm>
#include <ostream>
@@ -57,13 +58,14 @@ public:
}
static ManagedStatic<StatisticInfo> StatInfo;
-
+static ManagedStatic<sys::Mutex> StatLock;
/// RegisterStatistic - The first time a statistic is bumped, this method is
/// called.
void Statistic::RegisterStatistic() {
// If stats are enabled, inform StatInfo that this statistic should be
// printed.
+ sys::ScopedLock Writer(&*StatLock);
if (Enabled)
StatInfo->addStatistic(this);
// Remember we have been registered.
diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp
index 3c8879bd06e3..c4920f0ba573 100644
--- a/lib/Support/Timer.cpp
+++ b/lib/Support/Timer.cpp
@@ -15,6 +15,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Streams.h"
+#include "llvm/System/Mutex.h"
#include "llvm/System/Process.h"
#include <algorithm>
#include <fstream>
@@ -50,25 +51,28 @@ namespace {
cl::Hidden, cl::location(getLibSupportInfoOutputFilename()));
}
-static TimerGroup *DefaultTimerGroup = 0;
+static ManagedStatic<sys::SmartMutex<true> > TimerLock;
+static ManagedStatic<TimerGroup> DefaultTimerGroup;
static TimerGroup *getDefaultTimerGroup() {
- if (DefaultTimerGroup) return DefaultTimerGroup;
- return DefaultTimerGroup = new TimerGroup("Miscellaneous Ungrouped Timers");
+ return &*DefaultTimerGroup;
}
Timer::Timer(const std::string &N)
: Elapsed(0), UserTime(0), SystemTime(0), MemUsed(0), PeakMem(0), Name(N),
Started(false), TG(getDefaultTimerGroup()) {
+ sys::SmartScopedLock<true> Lock(&*TimerLock);
TG->addTimer();
}
Timer::Timer(const std::string &N, TimerGroup &tg)
: Elapsed(0), UserTime(0), SystemTime(0), MemUsed(0), PeakMem(0), Name(N),
Started(false), TG(&tg) {
+ sys::SmartScopedLock<true> Lock(&*TimerLock);
TG->addTimer();
}
Timer::Timer(const Timer &T) {
+ sys::SmartScopedLock<true> Lock(&*TimerLock);
TG = T.TG;
if (TG) TG->addTimer();
operator=(T);
@@ -77,6 +81,7 @@ Timer::Timer(const Timer &T) {
// Copy ctor, initialize with no TG member.
Timer::Timer(bool, const Timer &T) {
+ sys::SmartScopedLock<true> Lock(&*TimerLock);
TG = T.TG; // Avoid assertion in operator=
operator=(T); // Copy contents
TG = 0;
@@ -84,6 +89,7 @@ Timer::Timer(bool, const Timer &T) {
Timer::~Timer() {
+ sys::SmartScopedLock<true> Lock(&*TimerLock);
if (TG) {
if (Started) {
Started = false;
@@ -129,8 +135,10 @@ static TimeRecord getTimeRecord(bool Start) {
}
static ManagedStatic<std::vector<Timer*> > ActiveTimers;
+static ManagedStatic<sys::SmartMutex<true> > ActiveTimerLock;
void Timer::startTimer() {
+ sys::SmartScopedLock<true> Lock(&*ActiveTimerLock);
Started = true;
ActiveTimers->push_back(this);
TimeRecord TR = getTimeRecord(true);
@@ -142,6 +150,7 @@ void Timer::startTimer() {
}
void Timer::stopTimer() {
+ sys::SmartScopedLock<true> Lock(&*ActiveTimerLock);
TimeRecord TR = getTimeRecord(false);
Elapsed += TR.Elapsed;
UserTime += TR.UserTime;
@@ -171,6 +180,7 @@ void Timer::sum(const Timer &T) {
/// currently active timers, which will be printed when the timer group prints
///
void Timer::addPeakMemoryMeasurement() {
+ sys::SmartScopedLock<true> Lock(&*ActiveTimerLock);
size_t MemUsed = getMemUsage();
for (std::vector<Timer*>::iterator I = ActiveTimers->begin(),
@@ -193,7 +203,10 @@ static ManagedStatic<Name2Timer> NamedTimers;
static ManagedStatic<Name2Pair> NamedGroupedTimers;
+static ManagedStatic<sys::SmartMutex<true> > NamedTimerLock;
+
static Timer &getNamedRegionTimer(const std::string &Name) {
+ sys::SmartScopedLock<true> Lock(&*NamedTimerLock);
Name2Timer::iterator I = NamedTimers->find(Name);
if (I != NamedTimers->end())
return I->second;
@@ -203,6 +216,7 @@ static Timer &getNamedRegionTimer(const std::string &Name) {
static Timer &getNamedRegionTimer(const std::string &Name,
const std::string &GroupName) {
+ sys::SmartScopedLock<true> Lock(&*NamedTimerLock);
Name2Pair::iterator I = NamedGroupedTimers->find(GroupName);
if (I == NamedGroupedTimers->end()) {
@@ -340,7 +354,7 @@ void TimerGroup::removeTimer() {
  // If this is not a collection of ungrouped timers, print the total time.
// Ungrouped timers don't really make sense to add up. We still print the
// TOTAL line to make the percentages make sense.
- if (this != DefaultTimerGroup) {
+ if (this != &*DefaultTimerGroup) {
*OutStream << " Total Execution Time: ";
printAlignedFP(Total.getProcessTime(), 4, 5, *OutStream);
@@ -377,11 +391,5 @@ void TimerGroup::removeTimer() {
if (OutStream != cerr.stream() && OutStream != cout.stream())
delete OutStream; // Close the file...
}
-
- // Delete default timer group!
- if (NumTimers == 0 && this == DefaultTimerGroup) {
- delete DefaultTimerGroup;
- DefaultTimerGroup = 0;
- }
}
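The Timer changes replace a hand-managed singleton (new on first use, delete when the last timer in the group went away) with a ManagedStatic, keeping lazy construction while dropping the manual ownership tracking that is awkward to make thread-safe. The shape of that change in plain C++, with an illustrative stand-in type:
    struct MiscTimerGroup { /* stand-in for TimerGroup */ };
    // Construct on first use; normal static teardown destroys it, so no
    // caller ever has to decide when to delete it.
    static MiscTimerGroup &getDefaultGroup() {
      static MiscTimerGroup TG;
      return TG;
    }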
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 47151e667c40..8a4c741faf95 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -24,19 +24,29 @@ def CC_ARM_APCS : CallingConv<[
CCIfType<[i8, i16], CCPromoteToType<i32>>,
- // f64 is passed in pairs of GPRs, possibly split onto the stack
- CCIfType<[f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
+ CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
CCIfType<[i32], CCAssignToStack<4, 4>>,
- CCIfType<[f64], CCAssignToStack<8, 4>>
+ CCIfType<[f64], CCAssignToStack<8, 4>>,
+ CCIfType<[v2f64], CCAssignToStack<16, 4>>
]>;
def RetCC_ARM_APCS : CallingConv<[
CCIfType<[f32], CCBitConvertToType<i32>>,
- CCIfType<[f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
+
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
@@ -59,7 +69,8 @@ def CC_ARM_AAPCS_Common : CallingConv<[
CCAssignToReg<[R0, R1, R2, R3]>>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
- CCIfType<[f64], CCAssignToStack<8, 8>>
+ CCIfType<[f64], CCAssignToStack<8, 8>>,
+ CCIfType<[v2f64], CCAssignToStack<16, 8>>
]>;
def RetCC_ARM_AAPCS_Common : CallingConv<[
@@ -72,13 +83,21 @@ def RetCC_ARM_AAPCS_Common : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_ARM_AAPCS : CallingConv<[
- CCIfType<[f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<CC_ARM_AAPCS_Common>
]>;
def RetCC_ARM_AAPCS : CallingConv<[
- CCIfType<[f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+ CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
@@ -88,6 +107,10 @@ def RetCC_ARM_AAPCS : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_ARM_AAPCS_VFP : CallingConv<[
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
@@ -95,6 +118,10 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
]>;
def RetCC_ARM_AAPCS_VFP : CallingConv<[
+ // Handle all vector types as either f64 or v2f64.
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 1ed9e8080a20..ee9dadff151b 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -32,6 +32,9 @@
#include "llvm/Support/Debug.h"
using namespace llvm;
+static const unsigned arm_dsubreg_0 = 5;
+static const unsigned arm_dsubreg_1 = 6;
+
//===--------------------------------------------------------------------===//
/// ARMDAGToDAGISel - ARM specific code to select ARM machine
/// instructions for SelectionDAG operations.
@@ -579,17 +582,18 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
switch (N->getOpcode()) {
default: break;
case ISD::Constant: {
- // ARMv6T2 and later should materialize imms via MOV / MOVT pair.
- if (Subtarget->hasV6T2Ops() || Subtarget->hasThumb2())
- break;
-
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
bool UseCP = true;
- if (Subtarget->isThumb())
- UseCP = (Val > 255 && // MOV
- ~Val > 255 && // MOV + MVN
- !ARM_AM::isThumbImmShiftedVal(Val)); // MOV + LSL
- else
+ if (Subtarget->isThumb()) {
+ if (Subtarget->hasThumb2())
+ // Thumb2 has the MOVT instruction, so all immediates can
+ // be done with MOV + MOVT, at worst.
+        UseCP = false;
+ else
+ UseCP = (Val > 255 && // MOV
+ ~Val > 255 && // MOV + MVN
+ !ARM_AM::isThumbImmShiftedVal(Val)); // MOV + LSL
+ } else
UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV
ARM_AM::getSOImmVal(~Val) == -1 && // MVN
!ARM_AM::isSOImmTwoPartVal(Val)); // two instrs.
@@ -917,6 +921,65 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl,
MVT::Other, Ops, 3);
}
+
+ case ISD::CONCAT_VECTORS: {
+ MVT VT = Op.getValueType();
+ assert(VT.is128BitVector() && Op.getNumOperands() == 2 &&
+ "unexpected CONCAT_VECTORS");
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDNode *Result =
+ CurDAG->getTargetNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT);
+ if (N0.getOpcode() != ISD::UNDEF)
+ Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
+ SDValue(Result, 0), N0,
+ CurDAG->getTargetConstant(arm_dsubreg_0,
+ MVT::i32));
+ if (N1.getOpcode() != ISD::UNDEF)
+ Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
+ SDValue(Result, 0), N1,
+ CurDAG->getTargetConstant(arm_dsubreg_1,
+ MVT::i32));
+ return Result;
+ }
+
+ case ISD::VECTOR_SHUFFLE: {
+ MVT VT = Op.getValueType();
+
+ // Match 128-bit splat to VDUPLANEQ. (This could be done with a Pat in
+ // ARMInstrNEON.td but it is awkward because the shuffle mask needs to be
+ // transformed first into a lane number and then to both a subregister
+ // index and an adjusted lane number.) If the source operand is a
+ // SCALAR_TO_VECTOR, leave it so it will be matched later as a VDUP.
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ if (VT.is128BitVector() && SVOp->isSplat() &&
+ Op.getOperand(0).getOpcode() != ISD::SCALAR_TO_VECTOR &&
+ Op.getOperand(1).getOpcode() == ISD::UNDEF) {
+ unsigned LaneVal = SVOp->getSplatIndex();
+
+ MVT HalfVT;
+ unsigned Opc = 0;
+ switch (VT.getVectorElementType().getSimpleVT()) {
+ default: assert(false && "unhandled VDUP splat type");
+ case MVT::i8: Opc = ARM::VDUPLN8q; HalfVT = MVT::v8i8; break;
+ case MVT::i16: Opc = ARM::VDUPLN16q; HalfVT = MVT::v4i16; break;
+ case MVT::i32: Opc = ARM::VDUPLN32q; HalfVT = MVT::v2i32; break;
+ case MVT::f32: Opc = ARM::VDUPLNfq; HalfVT = MVT::v2f32; break;
+ }
+
+ // The source operand needs to be changed to a subreg of the original
+ // 128-bit operand, and the lane number needs to be adjusted accordingly.
+ unsigned NumElts = VT.getVectorNumElements() / 2;
+ unsigned SRVal = (LaneVal < NumElts ? arm_dsubreg_0 : arm_dsubreg_1);
+ SDValue SR = CurDAG->getTargetConstant(SRVal, MVT::i32);
+ SDValue NewLane = CurDAG->getTargetConstant(LaneVal % NumElts, MVT::i32);
+ SDNode *SubReg = CurDAG->getTargetNode(TargetInstrInfo::EXTRACT_SUBREG,
+ dl, HalfVT, N->getOperand(0), SR);
+ return CurDAG->SelectNodeTo(N, Opc, VT, SDValue(SubReg, 0), NewLane);
+ }
+
+ break;
+ }
}
return SelectCode(Op);
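For a 128-bit splat, the selector above turns the shuffle's lane number into a D-register subregister index plus a lane within that half: lanes in the low half use arm_dsubreg_0, lanes in the high half use arm_dsubreg_1, and the lane is reduced modulo half the element count. The arithmetic on its own, with constants mirroring the hard-coded 5 and 6 above:
    #include <utility>
    // Map a Q-register lane to (D subregister index, lane within that half).
    static std::pair<unsigned, unsigned> splitQLane(unsigned Lane,
                                                    unsigned NumQElts) {
      const unsigned DSub0 = 5, DSub1 = 6;   // arm_dsubreg_0 / arm_dsubreg_1
      unsigned HalfElts = NumQElts / 2;
      unsigned SubReg = (Lane < HalfElts) ? DSub0 : DSub1;
      return std::make_pair(SubReg, Lane % HalfElts);
    }
    // e.g. lane 5 of a v8i16 splat maps to (arm_dsubreg_1, lane 1).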
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 2443625d4bdb..29d3da2b79c1 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -56,6 +56,52 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
+void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
+ MVT PromotedBitwiseVT) {
+ if (VT != PromotedLdStVT) {
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
+
+ setOperationAction(ISD::STORE, VT, Promote);
+ AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
+ }
+
+ MVT ElemTy = VT.getVectorElementType();
+ if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
+ setOperationAction(ISD::VSETCC, VT, Custom);
+ if (ElemTy == MVT::i8 || ElemTy == MVT::i16)
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ if (VT.isInteger()) {
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ }
+
+ // Promote all bit-wise operations.
+ if (VT.isInteger() && VT != PromotedBitwiseVT) {
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
+ }
+}
+
+void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
+ addRegisterClass(VT, ARM::DPRRegisterClass);
+ addTypeForNEON(VT, MVT::f64, MVT::v2i32);
+}
+
+void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
+ addRegisterClass(VT, ARM::QPRRegisterClass);
+ addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
+}
+
ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
: TargetLowering(TM), ARMPCLabelIndex(0) {
Subtarget = &TM.getSubtarget<ARMSubtarget>();
@@ -152,6 +198,30 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
}
+
+ if (Subtarget->hasNEON()) {
+ addDRTypeForNEON(MVT::v2f32);
+ addDRTypeForNEON(MVT::v8i8);
+ addDRTypeForNEON(MVT::v4i16);
+ addDRTypeForNEON(MVT::v2i32);
+ addDRTypeForNEON(MVT::v1i64);
+
+ addQRTypeForNEON(MVT::v4f32);
+ addQRTypeForNEON(MVT::v2f64);
+ addQRTypeForNEON(MVT::v16i8);
+ addQRTypeForNEON(MVT::v8i16);
+ addQRTypeForNEON(MVT::v4i32);
+ addQRTypeForNEON(MVT::v2i64);
+
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ }
+
computeRegisterProperties();
// ARM does not have f32 extending load.
@@ -352,6 +422,36 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::FMDRR: return "ARMISD::FMDRR";
case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
+
+ case ARMISD::VCEQ: return "ARMISD::VCEQ";
+ case ARMISD::VCGE: return "ARMISD::VCGE";
+ case ARMISD::VCGEU: return "ARMISD::VCGEU";
+ case ARMISD::VCGT: return "ARMISD::VCGT";
+ case ARMISD::VCGTU: return "ARMISD::VCGTU";
+ case ARMISD::VTST: return "ARMISD::VTST";
+
+ case ARMISD::VSHL: return "ARMISD::VSHL";
+ case ARMISD::VSHRs: return "ARMISD::VSHRs";
+ case ARMISD::VSHRu: return "ARMISD::VSHRu";
+ case ARMISD::VSHLLs: return "ARMISD::VSHLLs";
+ case ARMISD::VSHLLu: return "ARMISD::VSHLLu";
+ case ARMISD::VSHLLi: return "ARMISD::VSHLLi";
+ case ARMISD::VSHRN: return "ARMISD::VSHRN";
+ case ARMISD::VRSHRs: return "ARMISD::VRSHRs";
+ case ARMISD::VRSHRu: return "ARMISD::VRSHRu";
+ case ARMISD::VRSHRN: return "ARMISD::VRSHRN";
+ case ARMISD::VQSHLs: return "ARMISD::VQSHLs";
+ case ARMISD::VQSHLu: return "ARMISD::VQSHLu";
+ case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu";
+ case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs";
+ case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu";
+ case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu";
+ case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs";
+ case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu";
+ case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu";
+ case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu";
+ case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs";
+ case ARMISD::VDUPLANEQ: return "ARMISD::VDUPLANEQ";
}
}
@@ -423,63 +523,93 @@ static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
#include "ARMGenCallingConv.inc"
// APCS f64 is in register pairs, possibly split to stack
-static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- static const unsigned HiRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
- static const unsigned LoRegList[] = { ARM::R1,
- ARM::R2,
- ARM::R3,
- ARM::NoRegister };
-
- unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 4);
- if (Reg == 0)
- return false; // we didn't handle it
+static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ CCState &State, bool CanFail) {
+ static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+ // Try to get the first register.
+ if (unsigned Reg = State.AllocateReg(RegList, 4))
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ else {
+ // For the 2nd half of a v2f64, do not fail.
+ if (CanFail)
+ return false;
- unsigned i;
- for (i = 0; i < 4; ++i)
- if (HiRegList[i] == Reg)
- break;
+ // Put the whole thing on the stack.
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+ State.AllocateStack(8, 4),
+ LocVT, LocInfo));
+ return true;
+ }
- State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
- if (LoRegList[i] != ARM::NoRegister)
- State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
- MVT::i32, LocInfo));
+ // Try to get the second register.
+ if (unsigned Reg = State.AllocateReg(RegList, 4))
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
else
State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
State.AllocateStack(4, 4),
- MVT::i32, LocInfo));
+ LocVT, LocInfo));
+ return true;
+}
+
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+ return false;
+ if (LocVT == MVT::v2f64 &&
+ !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+ return false;
return true; // we handled it
}
// AAPCS f64 is in aligned register pairs
-static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
+static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ CCState &State, bool CanFail) {
static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2);
- if (Reg == 0)
- return false; // we didn't handle it
+ if (Reg == 0) {
+ // For the 2nd half of a v2f64, do not just fail.
+ if (CanFail)
+ return false;
+
+ // Put the whole thing on the stack.
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+ State.AllocateStack(8, 8),
+ LocVT, LocInfo));
+ return true;
+ }
unsigned i;
for (i = 0; i < 2; ++i)
if (HiRegList[i] == Reg)
break;
- State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
- MVT::i32, LocInfo));
+ LocVT, LocInfo));
+ return true;
+}
+
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+ return false;
+ if (LocVT == MVT::v2f64 &&
+ !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+ return false;
return true; // we handled it
}
-static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
+static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo, CCState &State) {
static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
@@ -492,9 +622,20 @@ static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
if (HiRegList[i] == Reg)
break;
- State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
- MVT::i32, LocInfo));
+ LocVT, LocInfo));
+ return true;
+}
+
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+ if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+ return false;
+ if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+ return false;
return true; // we handled it
}
@@ -558,7 +699,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
SDValue Val;
if (VA.needsCustom()) {
- // Handle f64 as custom.
+ // Handle f64 or half of a v2f64.
SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
InFlag);
Chain = Lo.getValue(1);
@@ -569,6 +710,24 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
Chain = Hi.getValue(1);
InFlag = Hi.getValue(2);
Val = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, Lo, Hi);
+
+ if (VA.getLocVT() == MVT::v2f64) {
+ SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+ DAG.getConstant(0, MVT::i32));
+
+ VA = RVLocs[++i]; // skip ahead to next loc
+ Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+ Chain = Lo.getValue(1);
+ InFlag = Lo.getValue(2);
+ VA = RVLocs[++i]; // skip ahead to next loc
+ Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+ Chain = Hi.getValue(1);
+ InFlag = Hi.getValue(2);
+ Val = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, Lo, Hi);
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+ DAG.getConstant(1, MVT::i32));
+ }
} else {
Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
InFlag);
@@ -625,6 +784,31 @@ ARMTargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
PseudoSourceValue::getStack(), LocMemOffset);
}
+void ARMTargetLowering::PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
+ SDValue Chain, SDValue &Arg,
+ RegsToPassVector &RegsToPass,
+ CCValAssign &VA, CCValAssign &NextVA,
+ SDValue &StackPtr,
+ SmallVector<SDValue, 8> &MemOpChains,
+ ISD::ArgFlagsTy Flags) {
+ DebugLoc dl = TheCall->getDebugLoc();
+
+ SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
+ DAG.getVTList(MVT::i32, MVT::i32), Arg);
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
+
+ if (NextVA.isRegLoc())
+ RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1)));
+ else {
+ assert(NextVA.isMemLoc());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+ MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, NextVA,
+ Chain, fmrrd.getValue(1), Flags));
+ }
+}
+
/// LowerCALL - Lowering a ISD::CALL node into a callseq_start <-
/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
@@ -651,7 +835,7 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
SDValue StackPtr = DAG.getRegister(ARM::SP, MVT::i32);
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ RegsToPassVector RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
// Walk the register/memloc assignments, inserting copies/loads. In the case
@@ -681,22 +865,32 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
break;
}
- // f64 is passed in i32 pairs and must be combined
+ // f64 and v2f64 are passed in i32 pairs and must be split into pieces
if (VA.needsCustom()) {
- SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
- DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
- RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
- VA = ArgLocs[++i]; // skip ahead to next loc
- if (VA.isRegLoc())
- RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(1)));
- else {
- assert(VA.isMemLoc());
- if (StackPtr.getNode() == 0)
- StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
-
- MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
- Chain, fmrrd.getValue(1),
- Flags));
+ if (VA.getLocVT() == MVT::v2f64) {
+ SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+ DAG.getConstant(0, MVT::i32));
+ SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+ DAG.getConstant(1, MVT::i32));
+
+ PassF64ArgInRegs(TheCall, DAG, Chain, Op0, RegsToPass,
+ VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+
+ VA = ArgLocs[++i]; // skip ahead to next loc
+ if (VA.isRegLoc()) {
+ PassF64ArgInRegs(TheCall, DAG, Chain, Op1, RegsToPass,
+ VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+ } else {
+ assert(VA.isMemLoc());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+ MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
+ Chain, Op1, Flags));
+ }
+ } else {
+ PassF64ArgInRegs(TheCall, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+ StackPtr, MemOpChains, Flags);
}
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
@@ -864,9 +1058,28 @@ SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
break;
}
- // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
- // available.
if (VA.needsCustom()) {
+ if (VA.getLocVT() == MVT::v2f64) {
+ // Extract the first half and return it in two registers.
+ SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+ DAG.getConstant(0, MVT::i32));
+ SDValue HalfGPRs = DAG.getNode(ARMISD::FMRRD, dl,
+ DAG.getVTList(MVT::i32, MVT::i32), Half);
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
+ Flag = Chain.getValue(1);
+ VA = RVLocs[++i]; // skip ahead to next loc
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ HalfGPRs.getValue(1), Flag);
+ Flag = Chain.getValue(1);
+ VA = RVLocs[++i]; // skip ahead to next loc
+
+ // Extract the 2nd half and fall through to handle it as an f64 value.
+ Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+ DAG.getConstant(1, MVT::i32));
+ }
+ // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
+ // available.
SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
@@ -1117,6 +1330,40 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
}
SDValue
+ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
+ SDValue &Root, SelectionDAG &DAG,
+ DebugLoc dl) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+ TargetRegisterClass *RC;
+ if (AFI->isThumbFunction())
+ RC = ARM::tGPRRegisterClass;
+ else
+ RC = ARM::GPRRegisterClass;
+
+ // Transform the arguments stored in physical registers into virtual ones.
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+
+ SDValue ArgValue2;
+ if (NextVA.isMemLoc()) {
+ unsigned ArgSize = NextVA.getLocVT().getSizeInBits()/8;
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ int FI = MFI->CreateFixedObject(ArgSize, NextVA.getLocMemOffset());
+
+ // Create load node to retrieve arguments from the stack.
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, NULL, 0);
+ } else {
+ Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
+ ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+ }
+
+ return DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, ArgValue, ArgValue2);
+}
+
+SDValue
ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -1141,47 +1388,45 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
// Arguments stored in registers.
if (VA.isRegLoc()) {
MVT RegVT = VA.getLocVT();
- TargetRegisterClass *RC;
- if (AFI->isThumbFunction())
- RC = ARM::tGPRRegisterClass;
- else
- RC = ARM::GPRRegisterClass;
- if (FloatABIType == FloatABI::Hard) {
- if (RegVT == MVT::f32)
- RC = ARM::SPRRegisterClass;
- else if (RegVT == MVT::f64)
- RC = ARM::DPRRegisterClass;
- } else if (RegVT == MVT::f64) {
- // f64 is passed in pairs of GPRs and must be combined.
+ SDValue ArgValue;
+ if (VA.needsCustom()) {
+ // f64 and vector types are split up into multiple registers or
+ // combinations of registers and stack slots.
RegVT = MVT::i32;
- } else if (!((RegVT == MVT::i32) || (RegVT == MVT::f32)))
- assert(0 && "RegVT not supported by FORMAL_ARGUMENTS Lowering");
- // Transform the arguments stored in physical registers into virtual ones.
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
- SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
+ if (VA.getLocVT() == MVT::v2f64) {
+ SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
+ Root, DAG, dl);
+ VA = ArgLocs[++i]; // skip ahead to next loc
+ SDValue ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
+ Root, DAG, dl);
+ ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+ ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+ ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
+ ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+ ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
+ } else
+ ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Root, DAG, dl);
- // f64 is passed in i32 pairs and must be combined.
- if (VA.needsCustom()) {
- SDValue ArgValue2;
+ } else {
+ TargetRegisterClass *RC;
+ if (FloatABIType == FloatABI::Hard && RegVT == MVT::f32)
+ RC = ARM::SPRRegisterClass;
+ else if (FloatABIType == FloatABI::Hard && RegVT == MVT::f64)
+ RC = ARM::DPRRegisterClass;
+ else if (AFI->isThumbFunction())
+ RC = ARM::tGPRRegisterClass;
+ else
+ RC = ARM::GPRRegisterClass;
- VA = ArgLocs[++i]; // skip ahead to next loc
- if (VA.isMemLoc()) {
- // must be APCS to split like this
- unsigned ArgSize = VA.getLocVT().getSizeInBits()/8;
- int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset());
-
- // Create load node to retrieve arguments from the stack.
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
- ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, NULL, 0);
- } else {
- Reg = MF.addLiveIn(VA.getLocReg(), RC);
- ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
- }
+ assert((RegVT == MVT::i32 || RegVT == MVT::f32 ||
+ (FloatABIType == FloatABI::Hard && RegVT == MVT::f64)) &&
+ "RegVT not supported by FORMAL_ARGUMENTS Lowering");
- ArgValue = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64,
- ArgValue, ArgValue2);
+ // Transform the arguments in physical registers into virtual ones.
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
}
// If this is an 8 or 16-bit value, it is really passed promoted
@@ -1638,8 +1883,78 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
}
-static SDValue ExpandSRx(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) {
- assert(N->getValueType(0) == MVT::i64 &&
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDValue getZeroVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+ assert(VT.isVector() && "Expected a vector type");
+
+ // Zero vectors are used to represent vector negation and in those cases
+ // will be implemented with the NEON VNEG instruction. However, VNEG does
+ // not support i64 elements, so sometimes the zero vectors will need to be
+ // explicitly constructed. For those cases, and potentially other uses in
+ // the future, always build zero vectors as <4 x i32> or <2 x i32> bitcasted
+ // to their dest type. This ensures they get CSE'd.
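+  // For example, with VT = v8i8 the zero vector built here is
+  // (bit_convert v8i8 (build_vector v2i32 0, 0)), so all 64-bit zero vectors
+  // share one canonical v2i32 node regardless of element type.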
+ SDValue Vec;
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ if (VT.getSizeInBits() == 64)
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
+ else
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+///
+static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+ assert(VT.isVector() && "Expected a vector type");
+
+ // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+ // type. This ensures they get CSE'd.
+ SDValue Vec;
+ SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+ if (VT.getSizeInBits() == 64)
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
+ else
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ MVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+
+ // Lower vector shifts on NEON to use VSHL.
+ if (VT.isVector()) {
+ assert(ST->hasNEON() && "unexpected vector shift");
+
+ // Left shifts translate directly to the vshiftu intrinsic.
+ if (N->getOpcode() == ISD::SHL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
+ N->getOperand(0), N->getOperand(1));
+
+ assert((N->getOpcode() == ISD::SRA ||
+ N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
+
+ // NEON uses the same intrinsics for both left and right shifts. For
+ // right shifts, the shift amounts are negative, so negate the vector of
+ // shift amounts.
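+    // For example, an SRL of a v4i16 value by a splat of 3 becomes an
+    // arm_neon_vshiftu of that value by a splat of -3; the negative count
+    // selects a right shift under the NEON VSHL semantics.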
+ MVT ShiftVT = N->getOperand(1).getValueType();
+ SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
+ getZeroVector(ShiftVT, DAG, dl),
+ N->getOperand(1));
+ Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
+ Intrinsic::arm_neon_vshifts :
+ Intrinsic::arm_neon_vshiftu);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(vshiftInt, MVT::i32),
+ N->getOperand(0), NegatedCount);
+ }
+
+ assert(VT == MVT::i64 &&
(N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
"Unknown shift to lower!");
@@ -1652,7 +1967,6 @@ static SDValue ExpandSRx(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) {
if (ST->isThumb()) return SDValue();
// Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
- DebugLoc dl = N->getDebugLoc();
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
DAG.getConstant(0, MVT::i32));
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
@@ -1670,6 +1984,273 @@ static SDValue ExpandSRx(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) {
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
}
+static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
+ SDValue TmpOp0, TmpOp1;
+ bool Invert = false;
+ bool Swap = false;
+ unsigned Opc = 0;
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue CC = Op.getOperand(2);
+ MVT VT = Op.getValueType();
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (Op.getOperand(1).getValueType().isFloatingPoint()) {
+ switch (SetCCOpcode) {
+ default: assert(0 && "Illegal FP comparison"); break;
+ case ISD::SETUNE:
+ case ISD::SETNE: Invert = true; // Fallthrough
+ case ISD::SETOEQ:
+ case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
+ case ISD::SETOLT:
+ case ISD::SETLT: Swap = true; // Fallthrough
+ case ISD::SETOGT:
+ case ISD::SETGT: Opc = ARMISD::VCGT; break;
+ case ISD::SETOLE:
+ case ISD::SETLE: Swap = true; // Fallthrough
+ case ISD::SETOGE:
+ case ISD::SETGE: Opc = ARMISD::VCGE; break;
+ case ISD::SETUGE: Swap = true; // Fallthrough
+ case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
+ case ISD::SETUGT: Swap = true; // Fallthrough
+ case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
+ case ISD::SETUEQ: Invert = true; // Fallthrough
+ case ISD::SETONE:
+ // Expand this to (OLT | OGT).
+ TmpOp0 = Op0;
+ TmpOp1 = Op1;
+ Opc = ISD::OR;
+ Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
+ Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1);
+ break;
+ case ISD::SETUO: Invert = true; // Fallthrough
+ case ISD::SETO:
+ // Expand this to (OLT | OGE).
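+      // For ordered operands exactly one of (op0 < op1) and (op0 >= op1)
+      // holds, while neither holds when either operand is NaN, so the OR
+      // yields the "ordered" predicate (SETUO uses the inverted result).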
+ TmpOp0 = Op0;
+ TmpOp1 = Op1;
+ Opc = ISD::OR;
+ Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
+ Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1);
+ break;
+ }
+ } else {
+ // Integer comparisons.
+ switch (SetCCOpcode) {
+ default: assert(0 && "Illegal integer comparison"); break;
+ case ISD::SETNE: Invert = true;
+ case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
+ case ISD::SETLT: Swap = true;
+ case ISD::SETGT: Opc = ARMISD::VCGT; break;
+ case ISD::SETLE: Swap = true;
+ case ISD::SETGE: Opc = ARMISD::VCGE; break;
+ case ISD::SETULT: Swap = true;
+ case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
+ case ISD::SETULE: Swap = true;
+ case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
+ }
+
+ // Detect VTST (Vector Test Bits) = vicmp ne (and (op0, op1), zero).
+ if (Opc == ARMISD::VCEQ) {
+
+ SDValue AndOp;
+ if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+ AndOp = Op0;
+ else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
+ AndOp = Op1;
+
+ // Ignore bitconvert.
+ if (AndOp.getNode() && AndOp.getOpcode() == ISD::BIT_CONVERT)
+ AndOp = AndOp.getOperand(0);
+
+ if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
+ Opc = ARMISD::VTST;
+ Op0 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(0));
+ Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(1));
+ Invert = !Invert;
+ }
+ }
+ }
+
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+
+ return Result;
+}
+
+/// isVMOVSplat - Check if the specified splat value corresponds to an immediate
+/// VMOV instruction, and if so, return the constant being splatted.
+static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef,
+ unsigned SplatBitSize, SelectionDAG &DAG) {
+ switch (SplatBitSize) {
+ case 8:
+ // Any 1-byte value is OK.
+ assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
+ return DAG.getTargetConstant(SplatBits, MVT::i8);
+
+ case 16:
+ // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
+ if ((SplatBits & ~0xff) == 0 ||
+ (SplatBits & ~0xff00) == 0)
+ return DAG.getTargetConstant(SplatBits, MVT::i16);
+ break;
+
+ case 32:
+ // NEON's 32-bit VMOV supports splat values where:
+ // * only one byte is nonzero, or
+ // * the least significant byte is 0xff and the second byte is nonzero, or
+ // * the least significant 2 bytes are 0xff and the third is nonzero.
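+    // For example, 0x00ab0000 (one nonzero byte) and 0x0000abff (low byte
+    // 0xff, second byte nonzero) are representable; 0x00ffff00 is not
+    // (see the note below).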
+ if ((SplatBits & ~0xff) == 0 ||
+ (SplatBits & ~0xff00) == 0 ||
+ (SplatBits & ~0xff0000) == 0 ||
+ (SplatBits & ~0xff000000) == 0)
+ return DAG.getTargetConstant(SplatBits, MVT::i32);
+
+ if ((SplatBits & ~0xffff) == 0 &&
+ ((SplatBits | SplatUndef) & 0xff) == 0xff)
+ return DAG.getTargetConstant(SplatBits | 0xff, MVT::i32);
+
+ if ((SplatBits & ~0xffffff) == 0 &&
+ ((SplatBits | SplatUndef) & 0xffff) == 0xffff)
+ return DAG.getTargetConstant(SplatBits | 0xffff, MVT::i32);
+
+ // Note: there are a few 32-bit splat values (specifically: 00ffff00,
+ // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
+ // VMOV.I32. A (very) minor optimization would be to replicate the value
+ // and fall through here to test for a valid 64-bit splat. But, then the
+ // caller would also need to check and handle the change in size.
+ break;
+
+ case 64: {
+ // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
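+    // For example, 0x00ff00ff00ff00ff is representable but
+    // 0x00ff00ff00ff00fe is not, since its low byte is neither 0 nor 0xff.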
+ uint64_t BitMask = 0xff;
+ uint64_t Val = 0;
+ for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
+ if (((SplatBits | SplatUndef) & BitMask) == BitMask)
+ Val |= BitMask;
+ else if ((SplatBits & BitMask) != 0)
+ return SDValue();
+ BitMask <<= 8;
+ }
+ return DAG.getTargetConstant(Val, MVT::i64);
+ }
+
+ default:
+ assert(0 && "unexpected size for isVMOVSplat");
+ break;
+ }
+
+ return SDValue();
+}
+
+/// getVMOVImm - If this is a build_vector of constants which can be
+/// formed by using a VMOV instruction of the specified element size,
+/// return the constant being splatted. The ByteSize field indicates the
+/// number of bytes of each element (1, 2, 4, or 8).
+SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+ HasAnyUndefs, ByteSize * 8))
+ return SDValue();
+
+ if (SplatBitSize > ByteSize * 8)
+ return SDValue();
+
+ return isVMOVSplat(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
+ SplatBitSize, DAG);
+}
+
+static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+ // Canonicalize all-zeros and all-ones vectors.
+ ConstantSDNode *ConstVal = dyn_cast<ConstantSDNode>(Val.getNode());
+ if (ConstVal->isNullValue())
+ return getZeroVector(VT, DAG, dl);
+ if (ConstVal->isAllOnesValue())
+ return getOnesVector(VT, DAG, dl);
+
+ MVT CanonicalVT;
+ if (VT.is64BitVector()) {
+ switch (Val.getValueType().getSizeInBits()) {
+ case 8: CanonicalVT = MVT::v8i8; break;
+ case 16: CanonicalVT = MVT::v4i16; break;
+ case 32: CanonicalVT = MVT::v2i32; break;
+ case 64: CanonicalVT = MVT::v1i64; break;
+ default: assert(0 && "unexpected splat element type"); break;
+ }
+ } else {
+ assert(VT.is128BitVector() && "unknown splat vector size");
+ switch (Val.getValueType().getSizeInBits()) {
+ case 8: CanonicalVT = MVT::v16i8; break;
+ case 16: CanonicalVT = MVT::v8i16; break;
+ case 32: CanonicalVT = MVT::v4i32; break;
+ case 64: CanonicalVT = MVT::v2i64; break;
+ default: assert(0 && "unexpected splat element type"); break;
+ }
+ }
+
+ // Build a canonical splat for this value.
+ SmallVector<SDValue, 8> Ops;
+ Ops.assign(CanonicalVT.getVectorNumElements(), Val);
+ SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, &Ops[0],
+ Ops.size());
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Res);
+}
+
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.
+static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+ assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
+ DebugLoc dl = Op.getDebugLoc();
+
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+ SDValue Val = isVMOVSplat(SplatBits.getZExtValue(),
+ SplatUndef.getZExtValue(), SplatBitSize, DAG);
+ if (Val.getNode())
+ return BuildSplat(Val, Op.getValueType(), DAG, dl);
+ }
+
+ return SDValue();
+}
+
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+ return Op;
+}
+
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+ return Op;
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ assert((VT == MVT::i8 || VT == MVT::i16) &&
+ "unexpected type for custom-lowering vector extract");
+ SDValue Vec = Op.getOperand(0);
+ SDValue Lane = Op.getOperand(1);
+ Op = DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
+ Op = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Op, DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op) {
+ if (Op.getValueType().is128BitVector() && Op.getNumOperands() == 2)
+ return Op;
+ return SDValue();
+}
+
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
switch (Op.getOpcode()) {
default: assert(0 && "Don't know how to custom lower this!"); abort();
@@ -1695,8 +2276,15 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::BIT_CONVERT: return ExpandBIT_CONVERT(Op.getNode(), DAG);
+ case ISD::SHL:
case ISD::SRL:
- case ISD::SRA: return ExpandSRx(Op.getNode(), DAG,Subtarget);
+ case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
+ case ISD::VSETCC: return LowerVSETCC(Op, DAG);
+ case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op);
}
return SDValue();
}
@@ -1715,7 +2303,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
return;
case ISD::SRL:
case ISD::SRA: {
- SDValue Res = ExpandSRx(N, DAG, Subtarget);
+ SDValue Res = LowerShift(N, DAG, Subtarget);
if (Res.getNode())
Results.push_back(Res);
return;
@@ -1900,6 +2488,294 @@ static SDValue PerformFMRRDCombine(SDNode *N,
return SDValue();
}
+/// getVShiftImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+ // Ignore bit_converts.
+ while (Op.getOpcode() == ISD::BIT_CONVERT)
+ Op = Op.getOperand(0);
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+ HasAnyUndefs, ElementBits) ||
+ SplatBitSize > ElementBits)
+ return false;
+ Cnt = SplatBits.getSExtValue();
+ return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation. That value must be in the range:
+/// 0 <= Value < ElementBits for a left shift; or
+/// 0 <= Value <= ElementBits for a long left shift.
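+/// For example, with 16-bit elements a splatted count of 0-15 is accepted
+/// (0-16 for the long forms).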
+static bool isVShiftLImm(SDValue Op, MVT VT, bool isLong, int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ if (! getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation. For a shift opcode, the value
+/// is positive, but for an intrinsic the shift count must be negative. The
+/// absolute value must be in the range:
+/// 1 <= |Value| <= ElementBits for a right shift; or
+/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
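+/// For example, with 16-bit elements a count of magnitude 1-16 is accepted
+/// (1-8 for the narrowing forms).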
+static bool isVShiftRImm(SDValue Op, MVT VT, bool isNarrow, bool isIntrinsic,
+ int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ if (! getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ if (isIntrinsic)
+ Cnt = -Cnt;
+ return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
+}
+
+/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
+static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default:
+ // Don't do anything for most intrinsics.
+ break;
+
+ // Vector shifts: check for immediate versions and lower them.
+ // Note: This is done during DAG combining instead of DAG legalizing because
+ // the build_vectors for 64-bit vector element shift counts are generally
+ // not legal, and it is hard to see their values after they get legalized to
+ // loads from a constant pool.
+ case Intrinsic::arm_neon_vshifts:
+ case Intrinsic::arm_neon_vshiftu:
+ case Intrinsic::arm_neon_vshiftls:
+ case Intrinsic::arm_neon_vshiftlu:
+ case Intrinsic::arm_neon_vshiftn:
+ case Intrinsic::arm_neon_vrshifts:
+ case Intrinsic::arm_neon_vrshiftu:
+ case Intrinsic::arm_neon_vrshiftn:
+ case Intrinsic::arm_neon_vqshifts:
+ case Intrinsic::arm_neon_vqshiftu:
+ case Intrinsic::arm_neon_vqshiftsu:
+ case Intrinsic::arm_neon_vqshiftns:
+ case Intrinsic::arm_neon_vqshiftnu:
+ case Intrinsic::arm_neon_vqshiftnsu:
+ case Intrinsic::arm_neon_vqrshiftns:
+ case Intrinsic::arm_neon_vqrshiftnu:
+ case Intrinsic::arm_neon_vqrshiftnsu: {
+ MVT VT = N->getOperand(1).getValueType();
+ int64_t Cnt;
+ unsigned VShiftOpc = 0;
+
+ switch (IntNo) {
+ case Intrinsic::arm_neon_vshifts:
+ case Intrinsic::arm_neon_vshiftu:
+ if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
+ VShiftOpc = ARMISD::VSHL;
+ break;
+ }
+ if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
+ VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
+ ARMISD::VSHRs : ARMISD::VSHRu);
+ break;
+ }
+ return SDValue();
+
+ case Intrinsic::arm_neon_vshiftls:
+ case Intrinsic::arm_neon_vshiftlu:
+ if (isVShiftLImm(N->getOperand(2), VT, true, Cnt))
+ break;
+ assert(0 && "invalid shift count for vshll intrinsic");
+ abort();
+
+ case Intrinsic::arm_neon_vrshifts:
+ case Intrinsic::arm_neon_vrshiftu:
+ if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
+ break;
+ return SDValue();
+
+ case Intrinsic::arm_neon_vqshifts:
+ case Intrinsic::arm_neon_vqshiftu:
+ if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
+ break;
+ return SDValue();
+
+ case Intrinsic::arm_neon_vqshiftsu:
+ if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
+ break;
+ assert(0 && "invalid shift count for vqshlu intrinsic");
+ abort();
+
+ case Intrinsic::arm_neon_vshiftn:
+ case Intrinsic::arm_neon_vrshiftn:
+ case Intrinsic::arm_neon_vqshiftns:
+ case Intrinsic::arm_neon_vqshiftnu:
+ case Intrinsic::arm_neon_vqshiftnsu:
+ case Intrinsic::arm_neon_vqrshiftns:
+ case Intrinsic::arm_neon_vqrshiftnu:
+ case Intrinsic::arm_neon_vqrshiftnsu:
+ // Narrowing shifts require an immediate right shift.
+ if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
+ break;
+ assert(0 && "invalid shift count for narrowing vector shift intrinsic");
+ abort();
+
+ default:
+ assert(0 && "unhandled vector shift");
+ }
+
+ switch (IntNo) {
+ case Intrinsic::arm_neon_vshifts:
+ case Intrinsic::arm_neon_vshiftu:
+ // Opcode already set above.
+ break;
+ case Intrinsic::arm_neon_vshiftls:
+ case Intrinsic::arm_neon_vshiftlu:
+ if (Cnt == VT.getVectorElementType().getSizeInBits())
+ VShiftOpc = ARMISD::VSHLLi;
+ else
+ VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ?
+ ARMISD::VSHLLs : ARMISD::VSHLLu);
+ break;
+ case Intrinsic::arm_neon_vshiftn:
+ VShiftOpc = ARMISD::VSHRN; break;
+ case Intrinsic::arm_neon_vrshifts:
+ VShiftOpc = ARMISD::VRSHRs; break;
+ case Intrinsic::arm_neon_vrshiftu:
+ VShiftOpc = ARMISD::VRSHRu; break;
+ case Intrinsic::arm_neon_vrshiftn:
+ VShiftOpc = ARMISD::VRSHRN; break;
+ case Intrinsic::arm_neon_vqshifts:
+ VShiftOpc = ARMISD::VQSHLs; break;
+ case Intrinsic::arm_neon_vqshiftu:
+ VShiftOpc = ARMISD::VQSHLu; break;
+ case Intrinsic::arm_neon_vqshiftsu:
+ VShiftOpc = ARMISD::VQSHLsu; break;
+ case Intrinsic::arm_neon_vqshiftns:
+ VShiftOpc = ARMISD::VQSHRNs; break;
+ case Intrinsic::arm_neon_vqshiftnu:
+ VShiftOpc = ARMISD::VQSHRNu; break;
+ case Intrinsic::arm_neon_vqshiftnsu:
+ VShiftOpc = ARMISD::VQSHRNsu; break;
+ case Intrinsic::arm_neon_vqrshiftns:
+ VShiftOpc = ARMISD::VQRSHRNs; break;
+ case Intrinsic::arm_neon_vqrshiftnu:
+ VShiftOpc = ARMISD::VQRSHRNu; break;
+ case Intrinsic::arm_neon_vqrshiftnsu:
+ VShiftOpc = ARMISD::VQRSHRNsu; break;
+ }
+
+ return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
+ N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
+ }
+
+ case Intrinsic::arm_neon_vshiftins: {
+ MVT VT = N->getOperand(1).getValueType();
+ int64_t Cnt;
+ unsigned VShiftOpc = 0;
+
+ if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
+ VShiftOpc = ARMISD::VSLI;
+ else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
+ VShiftOpc = ARMISD::VSRI;
+ else {
+ assert(0 && "invalid shift count for vsli/vsri intrinsic");
+ abort();
+ }
+
+ return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2),
+ DAG.getConstant(Cnt, MVT::i32));
+ }
+
+ case Intrinsic::arm_neon_vqrshifts:
+ case Intrinsic::arm_neon_vqrshiftu:
+ // No immediate versions of these to check for.
+ break;
+ }
+
+ return SDValue();
+}
+
+/// PerformShiftCombine - Checks for immediate versions of vector shifts and
+/// lowers them. As with the vector shift intrinsics, this is done during DAG
+/// combining instead of DAG legalizing because the build_vectors for 64-bit
+/// vector element shift counts are generally not legal, and it is hard to see
+/// their values after they get legalized to loads from a constant pool.
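+/// For example, (shl v4i32:$x, (build_vector 1, 1, 1, 1)) becomes
+/// (ARMISD::VSHL $x, (i32 1)) once the splatted count is recognized.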
+static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ MVT VT = N->getValueType(0);
+
+ // Nothing to be done for scalar shifts.
+ if (! VT.isVector())
+ return SDValue();
+
+ assert(ST->hasNEON() && "unexpected vector shift");
+ int64_t Cnt;
+
+ switch (N->getOpcode()) {
+ default: assert(0 && "unexpected shift opcode");
+
+ case ISD::SHL:
+ if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
+ return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0),
+ DAG.getConstant(Cnt, MVT::i32));
+ break;
+
+ case ISD::SRA:
+ case ISD::SRL:
+ if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
+ unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
+ ARMISD::VSHRs : ARMISD::VSHRu);
+ return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0),
+ DAG.getConstant(Cnt, MVT::i32));
+ }
+ }
+ return SDValue();
+}
+
+/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
+/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
+static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ SDValue N0 = N->getOperand(0);
+
+ // Check for sign- and zero-extensions of vector extract operations of 8-
+ // and 16-bit vector elements. NEON supports these directly. They are
+ // handled during DAG combining because type legalization will promote them
+ // to 32-bit types and it is messy to recognize the operations after that.
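+  // For example, (zero_extend (extract_vector_elt v8i16:$vec, 2)) to i32
+  // becomes (ARMISD::VGETLANEu $vec, 2), a single lane move that already
+  // zero-extends into the core register.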
+ if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ SDValue Vec = N0.getOperand(0);
+ SDValue Lane = N0.getOperand(1);
+ MVT VT = N->getValueType(0);
+ MVT EltVT = N0.getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (VT == MVT::i32 &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16) &&
+ TLI.isTypeLegal(Vec.getValueType())) {
+
+ unsigned Opc = 0;
+ switch (N->getOpcode()) {
+ default: assert(0 && "unexpected opcode");
+ case ISD::SIGN_EXTEND:
+ Opc = ARMISD::VGETLANEs;
+ break;
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ Opc = ARMISD::VGETLANEu;
+ break;
+ }
+ return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane);
+ }
+ }
+
+ return SDValue();
+}
+
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
@@ -1907,8 +2783,17 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ADD: return PerformADDCombine(N, DCI);
case ISD::SUB: return PerformSUBCombine(N, DCI);
case ARMISD::FMRRD: return PerformFMRRDCombine(N, DCI);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return PerformIntrinsicCombine(N, DCI.DAG);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ return PerformShiftCombine(N, DCI.DAG, Subtarget);
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ return PerformExtendCombine(N, DCI.DAG, Subtarget);
}
-
return SDValue();
}
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 8f53e396eaaf..631e37f90b45 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -67,10 +67,65 @@ namespace llvm {
EH_SJLJ_SETJMP, // SjLj exception handling setjmp
EH_SJLJ_LONGJMP, // SjLj exception handling longjmp
- THREAD_POINTER
+ THREAD_POINTER,
+
+ VCEQ, // Vector compare equal.
+ VCGE, // Vector compare greater than or equal.
+ VCGEU, // Vector compare unsigned greater than or equal.
+ VCGT, // Vector compare greater than.
+ VCGTU, // Vector compare unsigned greater than.
+ VTST, // Vector test bits.
+
+ // Vector shift by immediate:
+ VSHL, // ...left
+ VSHRs, // ...right (signed)
+ VSHRu, // ...right (unsigned)
+ VSHLLs, // ...left long (signed)
+ VSHLLu, // ...left long (unsigned)
+ VSHLLi, // ...left long (with maximum shift count)
+ VSHRN, // ...right narrow
+
+ // Vector rounding shift by immediate:
+ VRSHRs, // ...right (signed)
+ VRSHRu, // ...right (unsigned)
+ VRSHRN, // ...right narrow
+
+ // Vector saturating shift by immediate:
+ VQSHLs, // ...left (signed)
+ VQSHLu, // ...left (unsigned)
+ VQSHLsu, // ...left (signed to unsigned)
+ VQSHRNs, // ...right narrow (signed)
+ VQSHRNu, // ...right narrow (unsigned)
+ VQSHRNsu, // ...right narrow (signed to unsigned)
+
+ // Vector saturating rounding shift by immediate:
+ VQRSHRNs, // ...right narrow (signed)
+ VQRSHRNu, // ...right narrow (unsigned)
+ VQRSHRNsu, // ...right narrow (signed to unsigned)
+
+ // Vector shift and insert:
+ VSLI, // ...left
+ VSRI, // ...right
+
+ // Vector get lane (VMOV scalar to ARM core register)
+ // (These are used for 8- and 16-bit element types only.)
+ VGETLANEu, // zero-extend vector extract element
+ VGETLANEs, // sign-extend vector extract element
+
+ // Vector duplicate lane (128-bit result only; 64-bit is a shuffle)
+ VDUPLANEQ // splat a lane from a 64-bit vector to a 128-bit vector
};
}
+ /// Define some predicates that are used for node matching.
+ namespace ARM {
+ /// getVMOVImm - If this is a build_vector of constants which can be
+ /// formed by using a VMOV instruction of the specified element size,
+ /// return the constant being splatted. The ByteSize field indicates the
+ /// number of bytes of each element (1, 2, 4, or 8).
+ SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+ }
+
//===--------------------------------------------------------------------===//
// ARMTargetLowering - ARM Implementation of the TargetLowering interface
@@ -151,6 +206,21 @@ namespace llvm {
///
unsigned ARMPCLabelIndex;
+ void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
+ void addDRTypeForNEON(MVT VT);
+ void addQRTypeForNEON(MVT VT);
+
+ typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
+ void PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
+ SDValue Chain, SDValue &Arg,
+ RegsToPassVector &RegsToPass,
+ CCValAssign &VA, CCValAssign &NextVA,
+ SDValue &StackPtr,
+ SmallVector<SDValue, 8> &MemOpChains,
+ ISD::ArgFlagsTy Flags);
+ SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
+ SDValue &Root, SelectionDAG &DAG, DebugLoc dl);
+
CCAssignFn *CCAssignFnForNode(unsigned CC, bool Return) const;
SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
const SDValue &StackPtr, const CCValAssign &VA,
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 9a1e1c2bb756..14cca7a6e5cb 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -49,6 +49,11 @@ def VFPMiscFrm : Format<22>;
def ThumbFrm : Format<23>;
+def NEONFrm : Format<24>;
+def NEONGetLnFrm : Format<25>;
+def NEONSetLnFrm : Format<26>;
+def NEONDupFrm : Format<27>;
+
// Misc flag for data processing instructions that indicates whether
// the instruction has a Rn register operand.
class UnaryDP { bit isUnaryDataProc = 1; }
@@ -737,6 +742,14 @@ class TIx2<dag outs, dag ins, string asm, list<dag> pattern>
class TJTI<dag outs, dag ins, string asm, list<dag> pattern>
: ThumbI<outs, ins, AddrModeNone, SizeSpecial, asm, "", pattern>;
+// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
+class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsThumb];
+}
+
+class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsThumb, HasV5T];
+}
//===----------------------------------------------------------------------===//
@@ -857,12 +870,102 @@ class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// ARM NEON Instruction templates.
+//
-// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
-class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
- list<Predicate> Predicates = [IsThumb];
-}
-
-class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
- list<Predicate> Predicates = [IsThumb, HasV5T];
-}
+class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, string asm,
+ string cstr, list<dag> pattern>
+ : InstARM<am, Size4Bytes, im, NEONFrm, cstr> {
+ let OutOperandList = oops;
+ let InOperandList = iops;
+ let AsmString = asm;
+ let Pattern = pattern;
+ list<Predicate> Predicates = [HasNEON];
+}
+
+class NI<dag oops, dag iops, string asm, list<dag> pattern>
+ : NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, "", pattern> {
+}
+
+class NDataI<dag oops, dag iops, string asm, string cstr, list<dag> pattern>
+ : NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, cstr, pattern> {
+ let Inst{31-25} = 0b1111001;
+}
+
+// NEON "one register and a modified immediate" format.
+class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
+ bit op5, bit op4,
+ dag oops, dag iops, string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, asm, cstr, pattern> {
+ let Inst{23} = op23;
+ let Inst{21-19} = op21_19;
+ let Inst{11-8} = op11_8;
+ let Inst{7} = op7;
+ let Inst{6} = op6;
+ let Inst{5} = op5;
+ let Inst{4} = op4;
+}
+
+// NEON 2 vector register format.
+class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
+ bits<5> op11_7, bit op6, bit op4,
+ dag oops, dag iops, string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, asm, cstr, pattern> {
+ let Inst{24-23} = op24_23;
+ let Inst{21-20} = op21_20;
+ let Inst{19-18} = op19_18;
+ let Inst{17-16} = op17_16;
+ let Inst{11-7} = op11_7;
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+}
+
+// NEON 2 vector register with immediate.
+class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op6, bit op4,
+ dag oops, dag iops, string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, asm, cstr, pattern> {
+ let Inst{24} = op24;
+ let Inst{23} = op23;
+ let Inst{21-16} = op21_16;
+ let Inst{11-8} = op11_8;
+ let Inst{7} = op7;
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+}
+
+// NEON 3 vector register format.
+class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
+ dag oops, dag iops, string asm, string cstr, list<dag> pattern>
+ : NDataI<oops, iops, asm, cstr, pattern> {
+ let Inst{24} = op24;
+ let Inst{23} = op23;
+ let Inst{21-20} = op21_20;
+ let Inst{11-8} = op11_8;
+ let Inst{6} = op6;
+ let Inst{4} = op4;
+}
+
+// NEON VMOVs between scalar and core registers.
+class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+ dag oops, dag iops, Format f, string opc, string asm,
+ list<dag> pattern>
+ : AI<oops, iops, f, opc, asm, pattern> {
+ let Inst{27-20} = opcod1;
+ let Inst{11-8} = opcod2;
+ let Inst{6-5} = opcod3;
+ let Inst{4} = 1;
+ list<Predicate> Predicates = [HasNEON];
+}
+class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+ dag oops, dag iops, string opc, string asm, list<dag> pattern>
+ : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, opc, asm,
+ pattern>;
+class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+ dag oops, dag iops, string opc, string asm, list<dag> pattern>
+ : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, opc, asm,
+ pattern>;
+class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+ dag oops, dag iops, string opc, string asm, list<dag> pattern>
+ : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, opc, asm, pattern>;
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index d19fb8eea87f..e8da9276243f 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -59,6 +59,8 @@ bool ARMInstrInfo::isMoveInstr(const MachineInstr &MI,
return false;
case ARM::FCPYS:
case ARM::FCPYD:
+ case ARM::VMOVD:
+ case ARM::VMOVQ:
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
return true;
@@ -528,6 +530,8 @@ bool ARMInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
else if (DestRC == ARM::DPRRegisterClass)
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg)
.addReg(SrcReg));
+ else if (DestRC == ARM::QPRRegisterClass)
+ BuildMI(MBB, I, DL, get(ARM::VMOVQ), DestReg).addReg(SrcReg);
else
return false;
@@ -844,6 +848,10 @@ canFoldMemoryOperand(const MachineInstr *MI,
case ARM::FCPYS:
case ARM::FCPYD:
return true;
+
+ case ARM::VMOVD:
+ case ARM::VMOVQ:
+ return false; // FIXME
}
return false;
diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
index 13ff3fea84be..9658f3bcba8b 100644
--- a/lib/Target/ARM/ARMInstrInfo.h
+++ b/lib/Target/ARM/ARMInstrInfo.h
@@ -114,6 +114,12 @@ namespace ARMII {
// Thumb format
ThumbFrm = 23 << FormShift,
+ // NEON format
+ NEONFrm = 24 << FormShift,
+ NEONGetLnFrm = 25 << FormShift,
+ NEONSetLnFrm = 26 << FormShift,
+ NEONDupFrm = 27 << FormShift,
+
//===------------------------------------------------------------------===//
// Field shifts - such shifts are used to set field while generating
// machine instructions.
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 4707e3b7a97f..44e67e9fc524 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -93,9 +93,15 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp>;
def HasV5T : Predicate<"Subtarget->hasV5TOps()">;
def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">;
def HasV6 : Predicate<"Subtarget->hasV6Ops()">;
+def HasV7 : Predicate<"Subtarget->hasV7Ops()">;
+def HasVFP2 : Predicate<"Subtarget->hasVFP2()">;
+def HasVFP3 : Predicate<"Subtarget->hasVFP3()">;
+def HasNEON : Predicate<"Subtarget->hasNEON()">;
def IsThumb : Predicate<"Subtarget->isThumb()">;
def HasThumb2 : Predicate<"Subtarget->hasThumb2()">;
def IsARM : Predicate<"!Subtarget->isThumb()">;
+def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
+def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">;
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
@@ -518,6 +524,24 @@ def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
}
} // isNotDuplicable = 1
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), Pseudo,
+ !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
+ "${:private}PCRELL${:uid}+8))\n"),
+ !strconcat("${:private}PCRELL${:uid}:\n\t",
+ "add$p $dst, pc, #PCRELV${:uid}")),
+ []>;
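+// The expansion above emits, roughly:
+//   .set PCRELV<uid>, (<label> - (PCRELL<uid> + 8))
+//   PCRELL<uid>:
+//       add<p> <dst>, pc, #PCRELV<uid>
+// so the assembler resolves the pc-relative offset; the +8 accounts for the
+// ARM pipeline's pc read-ahead.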
+
+def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, i32imm:$id, pred:$p),
+ Pseudo,
+ !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
+ "${:private}PCRELL${:uid}+8))\n"),
+ !strconcat("${:private}PCRELL${:uid}:\n\t",
+ "add$p $dst, pc, #PCRELV${:uid}")),
+ []>;
+
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
//
@@ -539,21 +563,22 @@ let isReturn = 1, isTerminator = 1 in
LdStMulFrm, "ldm${p}${addr:submode} $addr, $dst1",
[]>;
+// On non-Darwin platforms R9 is callee-saved.
let isCall = 1, Itinerary = IIC_Br,
Defs = [R0, R1, R2, R3, R12, LR,
D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in {
def BL : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
"bl ${func:call}",
- [(ARMcall tglobaladdr:$func)]>;
+ [(ARMcall tglobaladdr:$func)]>, Requires<[IsNotDarwin]>;
def BL_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
"bl", " ${func:call}",
- [(ARMcall_pred tglobaladdr:$func)]>;
+ [(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsNotDarwin]>;
// ARMv5T and above
def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
"blx $func",
- [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T]> {
+ [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsNotDarwin]> {
let Inst{7-4} = 0b0011;
let Inst{19-8} = 0b111111111111;
let Inst{27-20} = 0b00010010;
@@ -563,7 +588,36 @@ let isCall = 1, Itinerary = IIC_Br,
// ARMv4T
def BX : ABXIx2<(outs), (ins GPR:$func, variable_ops),
"mov lr, pc\n\tbx $func",
- [(ARMcall_nolink GPR:$func)]>;
+ [(ARMcall_nolink GPR:$func)]>, Requires<[IsNotDarwin]>;
+ }
+}
+
+// On Darwin R9 is call-clobbered.
+let isCall = 1, Itinerary = IIC_Br,
+ Defs = [R0, R1, R2, R3, R9, R12, LR,
+ D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in {
+ def BLr9 : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
+ "bl ${func:call}",
+ [(ARMcall tglobaladdr:$func)]>, Requires<[IsDarwin]>;
+
+ def BLr9_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
+ "bl", " ${func:call}",
+ [(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsDarwin]>;
+
+ // ARMv5T and above
+ def BLXr9 : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
+ "blx $func",
+ [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]> {
+ let Inst{7-4} = 0b0011;
+ let Inst{19-8} = 0b111111111111;
+ let Inst{27-20} = 0b00010010;
+ }
+
+ let Uses = [LR] in {
+ // ARMv4T
+ def BXr9 : ABXIx2<(outs), (ins GPR:$func, variable_ops),
+ "mov lr, pc\n\tbx $func",
+ [(ARMcall_nolink GPR:$func)]>, Requires<[IsDarwin]>;
}
}
@@ -823,9 +877,9 @@ defm UXTH : AI_unary_rrot<0b01101111,
defm UXTB16 : AI_unary_rrot<0b01101100,
"uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
-def : ARMV6Pat<(and (shl GPR:$Src, 8), 0xFF00FF),
+def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF),
(UXTB16r_rot GPR:$Src, 24)>;
-def : ARMV6Pat<(and (srl GPR:$Src, 8), 0xFF00FF),
+def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
(UXTB16r_rot GPR:$Src, 8)>;
defm UXTAB : AI_bin_rrot<0b01101110, "uxtab",
@@ -1006,7 +1060,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
def BT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
!strconcat(opc, "bt"), " $dst, $a, $b",
[(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
- (sra GPR:$b, 16)))]>,
+ (sra GPR:$b, (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 0;
let Inst{6} = 1;
@@ -1014,7 +1068,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
def TB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
!strconcat(opc, "tb"), " $dst, $a, $b",
- [(set GPR:$dst, (opnode (sra GPR:$a, 16),
+ [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
(sext_inreg GPR:$b, i16)))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 1;
@@ -1023,8 +1077,8 @@ multiclass AI_smul<string opc, PatFrag opnode> {
def TT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
!strconcat(opc, "tt"), " $dst, $a, $b",
- [(set GPR:$dst, (opnode (sra GPR:$a, 16),
- (sra GPR:$b, 16)))]>,
+ [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
+ (sra GPR:$b, (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 1;
let Inst{6} = 1;
@@ -1033,7 +1087,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
!strconcat(opc, "wb"), " $dst, $a, $b",
[(set GPR:$dst, (sra (opnode GPR:$a,
- (sext_inreg GPR:$b, i16)), 16))]>,
+ (sext_inreg GPR:$b, i16)), (i32 16)))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 1;
let Inst{6} = 0;
@@ -1042,7 +1096,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
!strconcat(opc, "wt"), " $dst, $a, $b",
[(set GPR:$dst, (sra (opnode GPR:$a,
- (sra GPR:$b, 16)), 16))]>,
+ (sra GPR:$b, (i32 16))), (i32 16)))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 1;
let Inst{6} = 1;
@@ -1064,7 +1118,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
def BT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
!strconcat(opc, "bt"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
- (sra GPR:$b, 16))))]>,
+ (sra GPR:$b, (i32 16)))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 0;
let Inst{6} = 1;
@@ -1072,7 +1126,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
def TB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
!strconcat(opc, "tb"), " $dst, $a, $b, $acc",
- [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, 16),
+ [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
(sext_inreg GPR:$b, i16))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 1;
@@ -1081,8 +1135,8 @@ multiclass AI_smla<string opc, PatFrag opnode> {
def TT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
!strconcat(opc, "tt"), " $dst, $a, $b, $acc",
- [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, 16),
- (sra GPR:$b, 16))))]>,
+ [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
+ (sra GPR:$b, (i32 16)))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 1;
let Inst{6} = 1;
@@ -1091,7 +1145,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
!strconcat(opc, "wb"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
- (sext_inreg GPR:$b, i16)), 16)))]>,
+ (sext_inreg GPR:$b, i16)), (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 0;
let Inst{6} = 0;
@@ -1100,7 +1154,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
!strconcat(opc, "wt"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
- (sra GPR:$b, 16)), 16)))]>,
+ (sra GPR:$b, (i32 16))), (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 0;
let Inst{6} = 1;
@@ -1136,10 +1190,10 @@ def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
"rev16", " $dst, $src",
[(set GPR:$dst,
- (or (and (srl GPR:$src, 8), 0xFF),
- (or (and (shl GPR:$src, 8), 0xFF00),
- (or (and (srl GPR:$src, 8), 0xFF0000),
- (and (shl GPR:$src, 8), 0xFF000000)))))]>,
+ (or (and (srl GPR:$src, (i32 8)), 0xFF),
+ (or (and (shl GPR:$src, (i32 8)), 0xFF00),
+ (or (and (srl GPR:$src, (i32 8)), 0xFF0000),
+ (and (shl GPR:$src, (i32 8)), 0xFF000000)))))]>,
Requires<[IsARM, HasV6]> {
let Inst{7-4} = 0b1011;
let Inst{11-8} = 0b1111;
@@ -1150,8 +1204,8 @@ def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src),
"revsh", " $dst, $src",
[(set GPR:$dst,
(sext_inreg
- (or (srl (and GPR:$src, 0xFF00), 8),
- (shl GPR:$src, 8)), i16))]>,
+ (or (srl (and GPR:$src, 0xFF00), (i32 8)),
+ (shl GPR:$src, (i32 8))), i16))]>,
Requires<[IsARM, HasV6]> {
let Inst{7-4} = 0b1011;
let Inst{11-8} = 0b1111;
@@ -1186,7 +1240,7 @@ def PKHTB : AMiscA1I<0b01101000, (outs GPR:$dst),
// Alternate cases for PKHTB where identities eliminate some nodes. Note that
// a shift amount of 0 is *not legal* here, it is PKHBT instead.
-def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, 16)),
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, (i32 16))),
(PKHTB GPR:$src1, GPR:$src2, 16)>;
def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000),
(and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)),
@@ -1240,23 +1294,6 @@ def MOVCCi : AI1<0b1101, (outs GPR:$dst),
RegConstraint<"$false = $dst">, UnaryDP;
-// LEApcrel - Load a pc-relative address into a register without offending the
-// assembler.
-def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), Pseudo,
- !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
- "${:private}PCRELL${:uid}+8))\n"),
- !strconcat("${:private}PCRELL${:uid}:\n\t",
- "add$p $dst, pc, #PCRELV${:uid}")),
- []>;
-
-def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, i32imm:$id, pred:$p),
- Pseudo,
- !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
- "${:private}PCRELL${:uid}+8))\n"),
- !strconcat("${:private}PCRELL${:uid}:\n\t",
- "add$p $dst, pc, #PCRELV${:uid}")),
- []>;
-
//===----------------------------------------------------------------------===//
// TLS Instructions
//
@@ -1321,7 +1358,10 @@ def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS),
// Direct calls
-def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>;
+def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>,
+ Requires<[IsNotDarwin]>;
+def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>,
+ Requires<[IsDarwin]>;
// zextload i1 -> zextload i8
def : ARMPat<(zextloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>;
@@ -1335,47 +1375,54 @@ def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>;
def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>;
// smul* and smla*
-def : ARMV5TEPat<(mul (sra (shl GPR:$a, 16), 16), (sra (shl GPR:$b, 16), 16)),
+def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+ (sra (shl GPR:$b, (i32 16)), (i32 16))),
(SMULBB GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
(SMULBB GPR:$a, GPR:$b)>;
-def : ARMV5TEPat<(mul (sra (shl GPR:$a, 16), 16), (sra GPR:$b, 16)),
+def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+ (sra GPR:$b, (i32 16))),
(SMULBT GPR:$a, GPR:$b)>;
-def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, 16)),
+def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))),
(SMULBT GPR:$a, GPR:$b)>;
-def : ARMV5TEPat<(mul (sra GPR:$a, 16), (sra (shl GPR:$b, 16), 16)),
+def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)),
+ (sra (shl GPR:$b, (i32 16)), (i32 16))),
(SMULTB GPR:$a, GPR:$b)>;
-def : ARMV5TEPat<(mul (sra GPR:$a, 16), sext_16_node:$b),
+def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b),
(SMULTB GPR:$a, GPR:$b)>;
-def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, 16), 16)), 16),
+def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
+ (i32 16)),
(SMULWB GPR:$a, GPR:$b)>;
-def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), 16),
+def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)),
(SMULWB GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(add GPR:$acc,
- (mul (sra (shl GPR:$a, 16), 16),
- (sra (shl GPR:$b, 16), 16))),
+ (mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+ (sra (shl GPR:$b, (i32 16)), (i32 16)))),
(SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
(mul sext_16_node:$a, sext_16_node:$b)),
(SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
- (mul (sra (shl GPR:$a, 16), 16), (sra GPR:$b, 16))),
+ (mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+ (sra GPR:$b, (i32 16)))),
(SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
- (mul sext_16_node:$a, (sra GPR:$b, 16))),
+ (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
(SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
- (mul (sra GPR:$a, 16), (sra (shl GPR:$b, 16), 16))),
+ (mul (sra GPR:$a, (i32 16)),
+ (sra (shl GPR:$b, (i32 16)), (i32 16)))),
(SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
- (mul (sra GPR:$a, 16), sext_16_node:$b)),
+ (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
(SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
- (sra (mul GPR:$a, (sra (shl GPR:$b, 16), 16)), 16)),
+ (sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
+ (i32 16))),
(SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
- (sra (mul GPR:$a, sext_16_node:$b), 16)),
+ (sra (mul GPR:$a, sext_16_node:$b), (i32 16))),
(SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
//===----------------------------------------------------------------------===//
@@ -1395,3 +1442,9 @@ include "ARMInstrThumb2.td"
//
include "ARMInstrVFP.td"
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD (NEON) Support
+//
+
+include "ARMInstrNEON.td"
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
new file mode 100644
index 000000000000..a62597bad840
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -0,0 +1,1665 @@
+//===- ARMInstrNEON.td - NEON support for ARM -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM NEON instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// NEON-specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTARMVCMP : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
+
+def NEONvceq : SDNode<"ARMISD::VCEQ", SDTARMVCMP>;
+def NEONvcge : SDNode<"ARMISD::VCGE", SDTARMVCMP>;
+def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>;
+def NEONvcgt : SDNode<"ARMISD::VCGT", SDTARMVCMP>;
+def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
+def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>;
+
+// Types for vector shift by immediates. The "SHX" version is for long and
+// narrow operations where the source and destination vectors have different
+// types. The "SHINS" version is for shift and insert operations.
+def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>]>;
+def SDTARMVSHX : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisVT<2, i32>]>;
+def SDTARMVSHINS : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
+
+def NEONvshl : SDNode<"ARMISD::VSHL", SDTARMVSH>;
+def NEONvshrs : SDNode<"ARMISD::VSHRs", SDTARMVSH>;
+def NEONvshru : SDNode<"ARMISD::VSHRu", SDTARMVSH>;
+def NEONvshlls : SDNode<"ARMISD::VSHLLs", SDTARMVSHX>;
+def NEONvshllu : SDNode<"ARMISD::VSHLLu", SDTARMVSHX>;
+def NEONvshlli : SDNode<"ARMISD::VSHLLi", SDTARMVSHX>;
+def NEONvshrn : SDNode<"ARMISD::VSHRN", SDTARMVSHX>;
+
+def NEONvrshrs : SDNode<"ARMISD::VRSHRs", SDTARMVSH>;
+def NEONvrshru : SDNode<"ARMISD::VRSHRu", SDTARMVSH>;
+def NEONvrshrn : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>;
+
+def NEONvqshls : SDNode<"ARMISD::VQSHLs", SDTARMVSH>;
+def NEONvqshlu : SDNode<"ARMISD::VQSHLu", SDTARMVSH>;
+def NEONvqshlsu : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>;
+def NEONvqshrns : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>;
+def NEONvqshrnu : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>;
+def NEONvqshrnsu : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>;
+
+def NEONvqrshrns : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>;
+def NEONvqrshrnu : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>;
+def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>;
+
+def NEONvsli : SDNode<"ARMISD::VSLI", SDTARMVSHINS>;
+def NEONvsri : SDNode<"ARMISD::VSRI", SDTARMVSHINS>;
+
+def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisVT<2, i32>]>;
+def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
+def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
+
+def NEONvduplaneq : SDNode<"ARMISD::VDUPLANEQ",
+ SDTypeProfile<1, 2, [SDTCisVT<2, i32>]>>;
+
+//===----------------------------------------------------------------------===//
+// NEON operand definitions
+//===----------------------------------------------------------------------===//
+
+// addrmode_neonldstm := reg
+//
+/* TODO: Take advantage of vldm.
+def addrmode_neonldstm : Operand<i32>,
+ ComplexPattern<i32, 2, "SelectAddrModeNeonLdStM", []> {
+ let PrintMethod = "printAddrNeonLdStMOperand";
+ let MIOperandInfo = (ops GPR, i32imm);
+}
+*/
+
+//===----------------------------------------------------------------------===//
+// NEON load / store instructions
+//===----------------------------------------------------------------------===//
+
+/* TODO: Take advantage of vldm.
+let mayLoad = 1 in {
+def VLDMD : NI<(outs),
+ (ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops),
+ "vldm${addr:submode} ${addr:base}, $dst1",
+ []>;
+
+def VLDMS : NI<(outs),
+ (ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops),
+ "vldm${addr:submode} ${addr:base}, $dst1",
+ []>;
+}
+*/
+
+// Use vldmia to load a Q register as a D register pair.
+def VLDRQ : NI<(outs QPR:$dst), (ins GPR:$addr),
+ "vldmia $addr, ${dst:dregpair}",
+ [(set QPR:$dst, (v2f64 (load GPR:$addr)))]>;
+
+// Use vstmia to store a Q register as a D register pair.
+def VSTRQ : NI<(outs), (ins QPR:$src, GPR:$addr),
+ "vstmia $addr, ${src:dregpair}",
+ [(store (v2f64 QPR:$src), GPR:$addr)]>;
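+// The ${dst:dregpair} and ${src:dregpair} modifiers print the Q register as
+// the pair of D sub-registers that alias it (Q1, for instance, covers d2 and
+// d3), giving vldmia/vstmia a two-register list.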
+
+
+//===----------------------------------------------------------------------===//
+// NEON pattern fragments
+//===----------------------------------------------------------------------===//
+
+// Extract D sub-registers of Q registers.
+// (arm_dsubreg_0 is 5; arm_dsubreg_1 is 6)
+def SubReg_i8_reg : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(5 + N->getZExtValue() / 8, MVT::i32);
+}]>;
+def SubReg_i16_reg : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(5 + N->getZExtValue() / 4, MVT::i32);
+}]>;
+def SubReg_i32_reg : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(5 + N->getZExtValue() / 2, MVT::i32);
+}]>;
+def SubReg_f64_reg : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(5 + N->getZExtValue(), MVT::i32);
+}]>;
+
+// Translate lane numbers from Q registers to D subregs.
+def SubReg_i8_lane : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 7, MVT::i32);
+}]>;
+def SubReg_i16_lane : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 3, MVT::i32);
+}]>;
+def SubReg_i32_lane : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 1, MVT::i32);
+}]>;
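+// For example, element 11 of a v16i8 Q register lives in the second D
+// sub-register (5 + 11/8 = 6, i.e. arm_dsubreg_1) at lane 11 & 7 = 3.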
+
+//===----------------------------------------------------------------------===//
+// Instruction Classes
+//===----------------------------------------------------------------------===//
+
+// Basic 2-register operations, both double- and quad-register.
+class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, SDNode OpNode>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
+ (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>;
+class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, SDNode OpNode>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
+ (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>;
+
+// Basic 2-register intrinsics, both double- and quad-register.
+class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
+ (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>;
+class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
+ (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
+
+// Narrow 2-register intrinsics.
+class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+ string OpcodeStr, ValueType TyD, ValueType TyQ, Intrinsic IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$dst),
+ (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>;
+
+// Long 2-register intrinsics. (This is currently only used for VMOVL and is
+// derived from N2VImm instead of N2V because of the way the size is encoded.)
+class N2VLInt<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op6, bit op4, string OpcodeStr, ValueType TyQ, ValueType TyD,
+ Intrinsic IntOp>
+ : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4, (outs QPR:$dst),
+ (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src))))]>;
+
+// Basic 3-register operations, both double- and quad-register.
+class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ SDNode OpNode, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2),
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
+ [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
+ let isCommutable = Commutable;
+}
+class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ SDNode OpNode, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2),
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
+ [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
+ let isCommutable = Commutable;
+}
+
+// Basic 3-register intrinsics, both double- and quad-register.
+class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ Intrinsic IntOp, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2),
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
+ [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
+ let isCommutable = Commutable;
+}
+class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ Intrinsic IntOp, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2),
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
+ [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
+ let isCommutable = Commutable;
+}
+
+// Multiply-Add/Sub operations, both double- and quad-register.
+class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3),
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
+ [(set DPR:$dst, (Ty (OpNode DPR:$src1,
+ (Ty (MulOp DPR:$src2, DPR:$src3)))))]>;
+class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3),
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
+ [(set QPR:$dst, (Ty (OpNode QPR:$src1,
+ (Ty (MulOp QPR:$src2, QPR:$src3)))))]>;
+
+// Neon 3-argument intrinsics, both double- and quad-register.
+// The destination register is also used as the first source operand register.
+class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ Intrinsic IntOp>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3),
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
+ [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1),
+ (OpTy DPR:$src2), (OpTy DPR:$src3))))]>;
+class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ Intrinsic IntOp>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3),
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
+ [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1),
+ (OpTy QPR:$src2), (OpTy QPR:$src3))))]>;
+
+// Neon Long 3-argument intrinsic. The destination register is
+// a quad-register and is also used as the first source operand register.
+class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType TyQ, ValueType TyD, Intrinsic IntOp>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3),
+ !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
+ [(set QPR:$dst,
+ (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>;
+
+// Narrowing 3-register intrinsics.
+class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType TyD, ValueType TyQ,
+ Intrinsic IntOp, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$dst), (ins QPR:$src1, QPR:$src2),
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
+ [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> {
+ let isCommutable = Commutable;
+}
+
+// Long 3-register intrinsics.
+class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType TyQ, ValueType TyD,
+ Intrinsic IntOp, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$dst), (ins DPR:$src1, DPR:$src2),
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
+ [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> {
+ let isCommutable = Commutable;
+}
+
+// Wide 3-register intrinsics.
+class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ string OpcodeStr, ValueType TyQ, ValueType TyD,
+ Intrinsic IntOp, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs QPR:$dst), (ins QPR:$src1, DPR:$src2),
+ !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
+ [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> {
+ let isCommutable = Commutable;
+}
+
+// Pairwise long 2-register intrinsics, both double- and quad-register.
+class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
+ (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>;
+class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
+ (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
+
+// Pairwise long 2-register accumulate intrinsics,
+// both double- and quad-register.
+// The destination register is also used as the first source operand register.
+class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2),
+ !strconcat(OpcodeStr, "\t$dst, $src2"), "$src1 = $dst",
+ [(set DPR:$dst, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$src2))))]>;
+class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+ bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+ : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4,
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2),
+ !strconcat(OpcodeStr, "\t$dst, $src2"), "$src1 = $dst",
+ [(set QPR:$dst, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$src2))))]>;
+
+// Shift by immediate,
+// both double- and quad-register.
+class N2VDSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op4, string OpcodeStr, ValueType Ty, SDNode OpNode>
+ : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
+ (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM),
+ !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
+ [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>;
+class N2VQSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op4, string OpcodeStr, ValueType Ty, SDNode OpNode>
+ : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
+ (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM),
+ !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
+ [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>;
+
+// Long shift by immediate.
+class N2VLSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op6, bit op4, string OpcodeStr, ValueType ResTy,
+ ValueType OpTy, SDNode OpNode>
+ : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4,
+ (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM),
+ !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
+ [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src),
+ (i32 imm:$SIMM))))]>;
+
+// Narrow shift by immediate.
+class N2VNSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op6, bit op4, string OpcodeStr, ValueType ResTy,
+ ValueType OpTy, SDNode OpNode>
+ : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4,
+ (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM),
+ !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
+ [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src),
+ (i32 imm:$SIMM))))]>;
+
+// Shift right by immediate and accumulate,
+// both double- and quad-register.
+class N2VDShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
+ : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM),
+ !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
+ [(set DPR:$dst, (Ty (add DPR:$src1,
+ (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>;
+class N2VQShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
+ : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM),
+ !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
+ [(set QPR:$dst, (Ty (add QPR:$src1,
+ (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>;
+
+// Shift by immediate and insert,
+// both double- and quad-register.
+class N2VDShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
+ : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
+ (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM),
+ !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
+ [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>;
+class N2VQShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
+ : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
+ (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM),
+ !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
+ [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>;
+
+// Convert, with fractional bits immediate,
+// both double- and quad-register.
+class N2VCvtD<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ Intrinsic IntOp>
+ : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
+ (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM),
+ !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
+ [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>;
+class N2VCvtQ<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+ bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy,
+ Intrinsic IntOp>
+ : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
+ (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM),
+ !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
+ [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>;
+
+//===----------------------------------------------------------------------===//
+// Multiclasses
+//===----------------------------------------------------------------------===//
+
+// Neon 3-register vector operations.
+
+// First with only element sizes of 8, 16 and 32 bits:
+multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, SDNode OpNode, bit Commutable = 0> {
+ // 64-bit vector types.
+ def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
+ v8i8, v8i8, OpNode, Commutable>;
+ def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr, "16"),
+ v4i16, v4i16, OpNode, Commutable>;
+ def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr, "32"),
+ v2i32, v2i32, OpNode, Commutable>;
+
+ // 128-bit vector types.
+ def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
+ v16i8, v16i8, OpNode, Commutable>;
+ def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr, "16"),
+ v8i16, v8i16, OpNode, Commutable>;
+ def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr, "32"),
+ v4i32, v4i32, OpNode, Commutable>;
+}
+
+// ....then also with element size of 64 bits:
+multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, SDNode OpNode, bit Commutable = 0>
+ : N3V_QHS<op24, op23, op11_8, op4, OpcodeStr, OpNode, Commutable> {
+ def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr, "64"),
+ v1i64, v1i64, OpNode, Commutable>;
+ def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr, "64"),
+ v2i64, v2i64, OpNode, Commutable>;
+}
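+
+// As a sketch of how these multiclasses expand (the concrete VADD definition
+// appears further below): a single
+//   defm VADD : N3V_QHSD<0, 0, 0b1000, 0, "vadd.i", add, 1>;
+// produces the records VADDv8i8, VADDv4i16, VADDv2i32, VADDv16i8, VADDv8i16,
+// VADDv4i32, VADDv1i64 and VADDv2i64, i.e. one instruction per element size
+// for both the 64-bit (D) and 128-bit (Q) register classes.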
+
+
+// Neon Narrowing 2-register vector intrinsics,
+// source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+ bits<5> op11_7, bit op6, bit op4, string OpcodeStr,
+ Intrinsic IntOp> {
+ def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
+ !strconcat(OpcodeStr, "16"), v8i8, v8i16, IntOp>;
+ def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
+ !strconcat(OpcodeStr, "32"), v4i16, v4i32, IntOp>;
+ def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
+ !strconcat(OpcodeStr, "64"), v2i32, v2i64, IntOp>;
+}
+
+
+// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL),
+// source operand element sizes of 8, 16 and 32 bits:
+multiclass N2VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+ bit op4, string OpcodeStr, Intrinsic IntOp> {
+ def v8i16 : N2VLInt<op24, op23, 0b001000, op11_8, op7, op6, op4,
+ !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
+ def v4i32 : N2VLInt<op24, op23, 0b010000, op11_8, op7, op6, op4,
+ !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
+ def v2i64 : N2VLInt<op24, op23, 0b100000, op11_8, op7, op6, op4,
+ !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
+}
+
+
+// Neon 3-register vector intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> {
+ // 64-bit vector types.
+ def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"),
+ v4i16, v4i16, IntOp, Commutable>;
+ def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"),
+ v2i32, v2i32, IntOp, Commutable>;
+
+ // 128-bit vector types.
+ def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"),
+ v8i16, v8i16, IntOp, Commutable>;
+ def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"),
+ v4i32, v4i32, IntOp, Commutable>;
+}
+
+// ....then also with element size of 8 bits:
+multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
+ : N3VInt_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> {
+ def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
+ v8i8, v8i8, IntOp, Commutable>;
+ def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
+ v16i8, v16i8, IntOp, Commutable>;
+}
+
+// ....then also with element size of 64 bits:
+multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
+ : N3VInt_QHS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> {
+ def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr,"64"),
+ v1i64, v1i64, IntOp, Commutable>;
+ def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr,"64"),
+ v2i64, v2i64, IntOp, Commutable>;
+}
+
+
+// Neon Narrowing 3-register vector intrinsics,
+// source operand element sizes of 16, 32 and 64 bits:
+multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> {
+ def v8i8 : N3VNInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr,"16"),
+ v8i8, v8i16, IntOp, Commutable>;
+ def v4i16 : N3VNInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"32"),
+ v4i16, v4i32, IntOp, Commutable>;
+ def v2i32 : N3VNInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"64"),
+ v2i32, v2i64, IntOp, Commutable>;
+}
+
+
+// Neon Long 3-register vector intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> {
+ def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"),
+ v4i32, v4i16, IntOp, Commutable>;
+ def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"),
+ v2i64, v2i32, IntOp, Commutable>;
+}
+
+// ....then also with element size of 8 bits:
+multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
+ : N3VLInt_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> {
+ def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
+ v8i16, v8i8, IntOp, Commutable>;
+}
+
+
+// Neon Wide 3-register vector intrinsics,
+// source operand element sizes of 8, 16 and 32 bits:
+multiclass N3VWInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> {
+ def v8i16 : N3VWInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"),
+ v8i16, v8i8, IntOp, Commutable>;
+ def v4i32 : N3VWInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"),
+ v4i32, v4i16, IntOp, Commutable>;
+ def v2i64 : N3VWInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"),
+ v2i64, v2i32, IntOp, Commutable>;
+}
+
+
+// Neon Multiply-Op vector operations,
+// element sizes of 8, 16 and 32 bits:
+multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, SDNode OpNode> {
+ // 64-bit vector types.
+ def v8i8 : N3VDMulOp<op24, op23, 0b00, op11_8, op4,
+ !strconcat(OpcodeStr, "8"), v8i8, mul, OpNode>;
+ def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4,
+ !strconcat(OpcodeStr, "16"), v4i16, mul, OpNode>;
+ def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4,
+ !strconcat(OpcodeStr, "32"), v2i32, mul, OpNode>;
+
+ // 128-bit vector types.
+ def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4,
+ !strconcat(OpcodeStr, "8"), v16i8, mul, OpNode>;
+ def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4,
+ !strconcat(OpcodeStr, "16"), v8i16, mul, OpNode>;
+ def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4,
+ !strconcat(OpcodeStr, "32"), v4i32, mul, OpNode>;
+}
+
+
+// Neon 3-argument intrinsics,
+// element sizes of 8, 16 and 32 bits:
+multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, Intrinsic IntOp> {
+ // 64-bit vector types.
+ def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4,
+ !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>;
+ def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4,
+ !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>;
+ def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4,
+ !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>;
+
+ // 128-bit vector types.
+ def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4,
+ !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>;
+ def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4,
+ !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>;
+ def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4,
+ !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>;
+}
+
+
+// Neon Long 3-argument intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, Intrinsic IntOp> {
+ def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4,
+ !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
+ def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4,
+ !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, Intrinsic IntOp>
+ : N3VLInt3_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp> {
+ def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4,
+ !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
+}
+
+
+// Neon 2-register vector intrinsics,
+// element sizes of 8, 16 and 32 bits:
+multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+ bits<5> op11_7, bit op4, string OpcodeStr,
+ Intrinsic IntOp> {
+ // 64-bit vector types.
+ def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>;
+ def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>;
+ def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>;
+
+ // 128-bit vector types.
+ def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>;
+ def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>;
+ def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>;
+}
+
+
+// Neon Pairwise long 2-register intrinsics,
+// element sizes of 8, 16 and 32 bits:
+multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+ bits<5> op11_7, bit op4,
+ string OpcodeStr, Intrinsic IntOp> {
+ // 64-bit vector types.
+ def v8i8 : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "8"), v4i16, v8i8, IntOp>;
+ def v4i16 : N2VDPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "16"), v2i32, v4i16, IntOp>;
+ def v2i32 : N2VDPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "32"), v1i64, v2i32, IntOp>;
+
+ // 128-bit vector types.
+ def v16i8 : N2VQPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "8"), v8i16, v16i8, IntOp>;
+ def v8i16 : N2VQPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "16"), v4i32, v8i16, IntOp>;
+ def v4i32 : N2VQPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "32"), v2i64, v4i32, IntOp>;
+}
+
+
+// Neon Pairwise long 2-register accumulate intrinsics,
+// element sizes of 8, 16 and 32 bits:
+multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+ bits<5> op11_7, bit op4,
+ string OpcodeStr, Intrinsic IntOp> {
+ // 64-bit vector types.
+ def v8i8 : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "8"), v4i16, v8i8, IntOp>;
+ def v4i16 : N2VDPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "16"), v2i32, v4i16, IntOp>;
+ def v2i32 : N2VDPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "32"), v1i64, v2i32, IntOp>;
+
+ // 128-bit vector types.
+ def v16i8 : N2VQPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "8"), v8i16, v16i8, IntOp>;
+ def v8i16 : N2VQPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "16"), v4i32, v8i16, IntOp>;
+ def v4i32 : N2VQPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+ !strconcat(OpcodeStr, "32"), v2i64, v4i32, IntOp>;
+}
+
+
+// Neon 2-register vector shift by immediate,
+// element sizes of 8, 16, 32 and 64 bits:
+multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, SDNode OpNode> {
+ // 64-bit vector types.
+ def v8i8 : N2VDSh<op24, op23, 0b001000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "8"), v8i8, OpNode>;
+ def v4i16 : N2VDSh<op24, op23, 0b010000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "16"), v4i16, OpNode>;
+ def v2i32 : N2VDSh<op24, op23, 0b100000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "32"), v2i32, OpNode>;
+ def v1i64 : N2VDSh<op24, op23, 0b000000, op11_8, 1, op4,
+ !strconcat(OpcodeStr, "64"), v1i64, OpNode>;
+
+ // 128-bit vector types.
+ def v16i8 : N2VQSh<op24, op23, 0b001000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "8"), v16i8, OpNode>;
+ def v8i16 : N2VQSh<op24, op23, 0b010000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "16"), v8i16, OpNode>;
+ def v4i32 : N2VQSh<op24, op23, 0b100000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "32"), v4i32, OpNode>;
+ def v2i64 : N2VQSh<op24, op23, 0b000000, op11_8, 1, op4,
+ !strconcat(OpcodeStr, "64"), v2i64, OpNode>;
+}
+
+
+// Neon Shift-Accumulate vector operations,
+// element sizes of 8, 16, 32 and 64 bits:
+multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, SDNode ShOp> {
+ // 64-bit vector types.
+ def v8i8 : N2VDShAdd<op24, op23, 0b001000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "8"), v8i8, ShOp>;
+ def v4i16 : N2VDShAdd<op24, op23, 0b010000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "16"), v4i16, ShOp>;
+ def v2i32 : N2VDShAdd<op24, op23, 0b100000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "32"), v2i32, ShOp>;
+ def v1i64 : N2VDShAdd<op24, op23, 0b000000, op11_8, 1, op4,
+ !strconcat(OpcodeStr, "64"), v1i64, ShOp>;
+
+ // 128-bit vector types.
+ def v16i8 : N2VQShAdd<op24, op23, 0b001000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "8"), v16i8, ShOp>;
+ def v8i16 : N2VQShAdd<op24, op23, 0b010000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "16"), v8i16, ShOp>;
+ def v4i32 : N2VQShAdd<op24, op23, 0b100000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "32"), v4i32, ShOp>;
+ def v2i64 : N2VQShAdd<op24, op23, 0b000000, op11_8, 1, op4,
+ !strconcat(OpcodeStr, "64"), v2i64, ShOp>;
+}
+
+
+// Neon Shift-Insert vector operations,
+// element sizes of 8, 16, 32 and 64 bits:
+multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+ string OpcodeStr, SDNode ShOp> {
+ // 64-bit vector types.
+ def v8i8 : N2VDShIns<op24, op23, 0b001000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "8"), v8i8, ShOp>;
+ def v4i16 : N2VDShIns<op24, op23, 0b010000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "16"), v4i16, ShOp>;
+ def v2i32 : N2VDShIns<op24, op23, 0b100000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "32"), v2i32, ShOp>;
+ def v1i64 : N2VDShIns<op24, op23, 0b000000, op11_8, 1, op4,
+ !strconcat(OpcodeStr, "64"), v1i64, ShOp>;
+
+ // 128-bit vector types.
+ def v16i8 : N2VQShIns<op24, op23, 0b001000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "8"), v16i8, ShOp>;
+ def v8i16 : N2VQShIns<op24, op23, 0b010000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "16"), v8i16, ShOp>;
+ def v4i32 : N2VQShIns<op24, op23, 0b100000, op11_8, 0, op4,
+ !strconcat(OpcodeStr, "32"), v4i32, ShOp>;
+ def v2i64 : N2VQShIns<op24, op23, 0b000000, op11_8, 1, op4,
+ !strconcat(OpcodeStr, "64"), v2i64, ShOp>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+//===----------------------------------------------------------------------===//
+
+// Vector Add Operations.
+
+// VADD : Vector Add (integer and floating-point)
+defm VADD : N3V_QHSD<0, 0, 0b1000, 0, "vadd.i", add, 1>;
+def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, "vadd.f32", v2f32, v2f32, fadd, 1>;
+def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, "vadd.f32", v4f32, v4f32, fadd, 1>;
+// VADDL : Vector Add Long (Q = D + D)
+defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, "vaddl.s", int_arm_neon_vaddls, 1>;
+defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, "vaddl.u", int_arm_neon_vaddlu, 1>;
+// VADDW : Vector Add Wide (Q = Q + D)
+defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw.s", int_arm_neon_vaddws, 0>;
+defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw.u", int_arm_neon_vaddwu, 0>;
+// VHADD : Vector Halving Add
+defm VHADDs : N3VInt_QHS<0,0,0b0000,0, "vhadd.s", int_arm_neon_vhadds, 1>;
+defm VHADDu : N3VInt_QHS<1,0,0b0000,0, "vhadd.u", int_arm_neon_vhaddu, 1>;
+// VRHADD : Vector Rounding Halving Add
+defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, "vrhadd.s", int_arm_neon_vrhadds, 1>;
+defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, "vrhadd.u", int_arm_neon_vrhaddu, 1>;
+// VQADD : Vector Saturating Add
+defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, "vqadd.s", int_arm_neon_vqadds, 1>;
+defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, "vqadd.u", int_arm_neon_vqaddu, 1>;
+// VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q)
+defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn.i", int_arm_neon_vaddhn, 1>;
+// VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
+defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn.i", int_arm_neon_vraddhn, 1>;
+
+// Vector Multiply Operations.
+
+// VMUL : Vector Multiply (integer, polynomial and floating-point)
+defm VMUL : N3V_QHS<0, 0, 0b1001, 1, "vmul.i", mul, 1>;
+def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v8i8, v8i8,
+ int_arm_neon_vmulp, 1>;
+def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v16i8, v16i8,
+ int_arm_neon_vmulp, 1>;
+def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul, 1>;
+def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, "vmul.f32", v4f32, v4f32, fmul, 1>;
+// VQDMULH : Vector Saturating Doubling Multiply Returning High Half
+defm VQDMULH : N3VInt_HS<0,0,0b1011,0, "vqdmulh.s", int_arm_neon_vqdmulh, 1>;
+// VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
+defm VQRDMULH : N3VInt_HS<1,0,0b1011,0, "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>;
+// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D)
+defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, "vmull.s", int_arm_neon_vmulls, 1>;
+defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, "vmull.u", int_arm_neon_vmullu, 1>;
+def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, "vmull.p8", v8i16, v8i8,
+ int_arm_neon_vmullp, 1>;
+// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
+defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, "vqdmull.s", int_arm_neon_vqdmull, 1>;
+
+// Vector Multiply-Accumulate and Multiply-Subtract Operations.
+
+// VMLA : Vector Multiply Accumulate (integer and floating-point)
+defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmla.i", add>;
+def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v2f32, fmul, fadd>;
+def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v4f32, fmul, fadd>;
+// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
+defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, "vmlal.s", int_arm_neon_vmlals>;
+defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, "vmlal.u", int_arm_neon_vmlalu>;
+// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
+defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, "vqdmlal.s", int_arm_neon_vqdmlal>;
+// VMLS : Vector Multiply Subtract (integer and floating-point)
+defm VMLS : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmls.i", sub>;
+def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v2f32, fmul, fsub>;
+def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v4f32, fmul, fsub>;
+// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
+defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, "vmlsl.s", int_arm_neon_vmlsls>;
+defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl.u", int_arm_neon_vmlslu>;
+// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
+defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl.s", int_arm_neon_vqdmlsl>;
+
+// Vector Subtract Operations.
+
+// VSUB : Vector Subtract (integer and floating-point)
+defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, "vsub.i", sub, 0>;
+def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, "vsub.f32", v2f32, v2f32, fsub, 0>;
+def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, "vsub.f32", v4f32, v4f32, fsub, 0>;
+// VSUBL : Vector Subtract Long (Q = D - D)
+defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, "vsubl.s", int_arm_neon_vsubls, 1>;
+defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, "vsubl.u", int_arm_neon_vsublu, 1>;
+// VSUBW : Vector Subtract Wide (Q = Q - D)
+defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw.s", int_arm_neon_vsubws, 0>;
+defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw.u", int_arm_neon_vsubwu, 0>;
+// VHSUB : Vector Halving Subtract
+defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, "vhsub.s", int_arm_neon_vhsubs, 0>;
+defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, "vhsub.u", int_arm_neon_vhsubu, 0>;
+// VQSUB : Vector Saturating Subtract
+defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, "vqsub.s", int_arm_neon_vqsubs, 0>;
+defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, "vqsub.u", int_arm_neon_vqsubu, 0>;
+// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q)
+defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn.i", int_arm_neon_vsubhn, 0>;
+// VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
+defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn.i", int_arm_neon_vrsubhn, 0>;
+
+// Vector Comparisons.
+
+// VCEQ : Vector Compare Equal
+defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, "vceq.i", NEONvceq, 1>;
+def VCEQfd : N3VD<0,0,0b00,0b1110,0, "vceq.f32", v2i32, v2f32, NEONvceq, 1>;
+def VCEQfq : N3VQ<0,0,0b00,0b1110,0, "vceq.f32", v4i32, v4f32, NEONvceq, 1>;
+// VCGE : Vector Compare Greater Than or Equal
+defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, "vcge.s", NEONvcge, 0>;
+defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, "vcge.u", NEONvcgeu, 0>;
+def VCGEfd : N3VD<1,0,0b00,0b1110,0, "vcge.f32", v2i32, v2f32, NEONvcge, 0>;
+def VCGEfq : N3VQ<1,0,0b00,0b1110,0, "vcge.f32", v4i32, v4f32, NEONvcge, 0>;
+// VCGT : Vector Compare Greater Than
+defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, "vcgt.s", NEONvcgt, 0>;
+defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, "vcgt.u", NEONvcgtu, 0>;
+def VCGTfd : N3VD<1,0,0b10,0b1110,0, "vcgt.f32", v2i32, v2f32, NEONvcgt, 0>;
+def VCGTfq : N3VQ<1,0,0b10,0b1110,0, "vcgt.f32", v4i32, v4f32, NEONvcgt, 0>;
+// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
+def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, "vacge.f32", v2i32, v2f32,
+ int_arm_neon_vacged, 0>;
+def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, "vacge.f32", v4i32, v4f32,
+ int_arm_neon_vacgeq, 0>;
+// VACGT : Vector Absolute Compare Greater Than (aka VCAGT)
+def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, "vacgt.f32", v2i32, v2f32,
+ int_arm_neon_vacgtd, 0>;
+def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, "vacgt.f32", v4i32, v4f32,
+ int_arm_neon_vacgtq, 0>;
+// VTST : Vector Test Bits
+defm VTST : N3V_QHS<0, 0, 0b1000, 1, "vtst.i", NEONvtst, 1>;
+
+// Vector Bitwise Operations.
+
+// VAND : Vector Bitwise AND
+def VANDd : N3VD<0, 0, 0b00, 0b0001, 1, "vand", v2i32, v2i32, and, 1>;
+def VANDq : N3VQ<0, 0, 0b00, 0b0001, 1, "vand", v4i32, v4i32, and, 1>;
+
+// VEOR : Vector Bitwise Exclusive OR
+def VEORd : N3VD<1, 0, 0b00, 0b0001, 1, "veor", v2i32, v2i32, xor, 1>;
+def VEORq : N3VQ<1, 0, 0b00, 0b0001, 1, "veor", v4i32, v4i32, xor, 1>;
+
+// VORR : Vector Bitwise OR
+def VORRd : N3VD<0, 0, 0b10, 0b0001, 1, "vorr", v2i32, v2i32, or, 1>;
+def VORRq : N3VQ<0, 0, 0b10, 0b0001, 1, "vorr", v4i32, v4i32, or, 1>;
+
+// VBIC : Vector Bitwise Bit Clear (AND NOT)
+def VBICd : N3V<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
+ (ins DPR:$src1, DPR:$src2), "vbic\t$dst, $src1, $src2", "",
+ [(set DPR:$dst, (v2i32 (and DPR:$src1,(vnot DPR:$src2))))]>;
+def VBICq : N3V<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
+ (ins QPR:$src1, QPR:$src2), "vbic\t$dst, $src1, $src2", "",
+ [(set QPR:$dst, (v4i32 (and QPR:$src1,(vnot QPR:$src2))))]>;
+
+// VORN : Vector Bitwise OR NOT
+def VORNd : N3V<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst),
+ (ins DPR:$src1, DPR:$src2), "vorn\t$dst, $src1, $src2", "",
+ [(set DPR:$dst, (v2i32 (or DPR:$src1, (vnot DPR:$src2))))]>;
+def VORNq : N3V<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst),
+ (ins QPR:$src1, QPR:$src2), "vorn\t$dst, $src1, $src2", "",
+ [(set QPR:$dst, (v4i32 (or QPR:$src1, (vnot QPR:$src2))))]>;
+
+// VMVN : Vector Bitwise NOT
+def VMVNd : N2V<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0,
+ (outs DPR:$dst), (ins DPR:$src), "vmvn\t$dst, $src", "",
+ [(set DPR:$dst, (v2i32 (vnot DPR:$src)))]>;
+def VMVNq : N2V<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
+ (outs QPR:$dst), (ins QPR:$src), "vmvn\t$dst, $src", "",
+ [(set QPR:$dst, (v4i32 (vnot QPR:$src)))]>;
+def : Pat<(v2i32 (vnot_conv DPR:$src)), (VMVNd DPR:$src)>;
+def : Pat<(v4i32 (vnot_conv QPR:$src)), (VMVNq QPR:$src)>;
+
+// VBSL : Vector Bitwise Select
+def VBSLd : N3V<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
+ (ins DPR:$src1, DPR:$src2, DPR:$src3),
+ "vbsl\t$dst, $src2, $src3", "$src1 = $dst",
+ [(set DPR:$dst,
+ (v2i32 (or (and DPR:$src2, DPR:$src1),
+ (and DPR:$src3, (vnot DPR:$src1)))))]>;
+def VBSLq : N3V<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
+ (ins QPR:$src1, QPR:$src2, QPR:$src3),
+ "vbsl\t$dst, $src2, $src3", "$src1 = $dst",
+ [(set QPR:$dst,
+ (v4i32 (or (and QPR:$src2, QPR:$src1),
+ (and QPR:$src3, (vnot QPR:$src1)))))]>;
+
+// VBIF : Vector Bitwise Insert if False
+// like VBSL but with: "vbif\t$dst, $src3, $src1", "$src2 = $dst",
+// VBIT : Vector Bitwise Insert if True
+// like VBSL but with: "vbit\t$dst, $src2, $src1", "$src3 = $dst",
+// These are not yet implemented. The TwoAddress pass will not go looking
+// for equivalent operations with different register constraints; it just
+// inserts copies.
+
+// Vector Absolute Differences.
+
+// VABD : Vector Absolute Difference
+defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, "vabd.s", int_arm_neon_vabds, 0>;
+defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, "vabd.u", int_arm_neon_vabdu, 0>;
+def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v2f32, v2f32,
+ int_arm_neon_vabdf, 0>;
+def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v4f32, v4f32,
+ int_arm_neon_vabdf, 0>;
+
+// VABDL : Vector Absolute Difference Long (Q = | D - D |)
+defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, "vabdl.s", int_arm_neon_vabdls, 0>;
+defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, "vabdl.u", int_arm_neon_vabdlu, 0>;
+
+// VABA : Vector Absolute Difference and Accumulate
+defm VABAs : N3VInt3_QHS<0,1,0b0101,0, "vaba.s", int_arm_neon_vabas>;
+defm VABAu : N3VInt3_QHS<1,1,0b0101,0, "vaba.u", int_arm_neon_vabau>;
+
+// VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
+defm VABALs : N3VLInt3_QHS<0,1,0b0101,0, "vabal.s", int_arm_neon_vabals>;
+defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal.u", int_arm_neon_vabalu>;
+
+// Vector Maximum and Minimum.
+
+// VMAX : Vector Maximum
+defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, "vmax.s", int_arm_neon_vmaxs, 1>;
+defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, "vmax.u", int_arm_neon_vmaxu, 1>;
+def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v2f32, v2f32,
+ int_arm_neon_vmaxf, 1>;
+def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v4f32, v4f32,
+ int_arm_neon_vmaxf, 1>;
+
+// VMIN : Vector Minimum
+defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, "vmin.s", int_arm_neon_vmins, 1>;
+defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, "vmin.u", int_arm_neon_vminu, 1>;
+def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v2f32, v2f32,
+ int_arm_neon_vminf, 1>;
+def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v4f32, v4f32,
+ int_arm_neon_vminf, 1>;
+
+// Vector Pairwise Operations.
+
+// VPADD : Vector Pairwise Add
+def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, "vpadd.i8", v8i8, v8i8,
+ int_arm_neon_vpaddi, 0>;
+def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, "vpadd.i16", v4i16, v4i16,
+ int_arm_neon_vpaddi, 0>;
+def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, "vpadd.i32", v2i32, v2i32,
+ int_arm_neon_vpaddi, 0>;
+def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, "vpadd.f32", v2f32, v2f32,
+ int_arm_neon_vpaddf, 0>;
+
+// VPADDL : Vector Pairwise Add Long
+defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl.s",
+ int_arm_neon_vpaddls>;
+defm VPADDLu : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpaddl.u",
+ int_arm_neon_vpaddlu>;
+
+// VPADAL : Vector Pairwise Add and Accumulate Long
+defm VPADALs : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpadal.s",
+ int_arm_neon_vpadals>;
+defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpadal.u",
+ int_arm_neon_vpadalu>;
+
+// VPMAX : Vector Pairwise Maximum
+def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, "vpmax.s8", v8i8, v8i8,
+ int_arm_neon_vpmaxs, 0>;
+def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, "vpmax.s16", v4i16, v4i16,
+ int_arm_neon_vpmaxs, 0>;
+def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, "vpmax.s32", v2i32, v2i32,
+ int_arm_neon_vpmaxs, 0>;
+def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, "vpmax.u8", v8i8, v8i8,
+ int_arm_neon_vpmaxu, 0>;
+def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, "vpmax.u16", v4i16, v4i16,
+ int_arm_neon_vpmaxu, 0>;
+def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, "vpmax.u32", v2i32, v2i32,
+ int_arm_neon_vpmaxu, 0>;
+def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, "vpmax.f32", v2f32, v2f32,
+ int_arm_neon_vpmaxf, 0>;
+
+// VPMIN : Vector Pairwise Minimum
+def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, "vpmin.s8", v8i8, v8i8,
+ int_arm_neon_vpmins, 0>;
+def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, "vpmin.s16", v4i16, v4i16,
+ int_arm_neon_vpmins, 0>;
+def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, "vpmin.s32", v2i32, v2i32,
+ int_arm_neon_vpmins, 0>;
+def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, "vpmin.u8", v8i8, v8i8,
+ int_arm_neon_vpminu, 0>;
+def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, "vpmin.u16", v4i16, v4i16,
+ int_arm_neon_vpminu, 0>;
+def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, "vpmin.u32", v2i32, v2i32,
+ int_arm_neon_vpminu, 0>;
+def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, "vpmin.f32", v2f32, v2f32,
+ int_arm_neon_vpminf, 0>;
+
+// Vector Reciprocal and Reciprocal Square Root Estimate and Step.
+
+// VRECPE : Vector Reciprocal Estimate
+def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32",
+ v2i32, v2i32, int_arm_neon_vrecpe>;
+def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32",
+ v4i32, v4i32, int_arm_neon_vrecpe>;
+def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
+ v2f32, v2f32, int_arm_neon_vrecpef>;
+def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
+ v4f32, v4f32, int_arm_neon_vrecpef>;
+
+// VRECPS : Vector Reciprocal Step
+def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v2f32, v2f32,
+ int_arm_neon_vrecps, 1>;
+def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v4f32, v4f32,
+ int_arm_neon_vrecps, 1>;
+
+// VRSQRTE : Vector Reciprocal Square Root Estimate
+def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32",
+ v2i32, v2i32, int_arm_neon_vrsqrte>;
+def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32",
+ v4i32, v4i32, int_arm_neon_vrsqrte>;
+def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
+ v2f32, v2f32, int_arm_neon_vrsqrtef>;
+def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
+ v4f32, v4f32, int_arm_neon_vrsqrtef>;
+
+// VRSQRTS : Vector Reciprocal Square Root Step
+def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v2f32, v2f32,
+ int_arm_neon_vrsqrts, 1>;
+def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v4f32, v4f32,
+ int_arm_neon_vrsqrts, 1>;
+
+// Vector Shifts.
+
+// VSHL : Vector Shift
+defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, "vshl.s", int_arm_neon_vshifts, 0>;
+defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, "vshl.u", int_arm_neon_vshiftu, 0>;
+// VSHL : Vector Shift Left (Immediate)
+defm VSHLi : N2VSh_QHSD<0, 1, 0b0111, 1, "vshl.i", NEONvshl>;
+// VSHR : Vector Shift Right (Immediate)
+defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, "vshr.s", NEONvshrs>;
+defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, "vshr.u", NEONvshru>;
+
+// VSHLL : Vector Shift Left Long
+def VSHLLs8 : N2VLSh<0, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.s8",
+ v8i16, v8i8, NEONvshlls>;
+def VSHLLs16 : N2VLSh<0, 1, 0b010000, 0b1010, 0, 0, 1, "vshll.s16",
+ v4i32, v4i16, NEONvshlls>;
+def VSHLLs32 : N2VLSh<0, 1, 0b100000, 0b1010, 0, 0, 1, "vshll.s32",
+ v2i64, v2i32, NEONvshlls>;
+def VSHLLu8 : N2VLSh<1, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.u8",
+ v8i16, v8i8, NEONvshllu>;
+def VSHLLu16 : N2VLSh<1, 1, 0b010000, 0b1010, 0, 0, 1, "vshll.u16",
+ v4i32, v4i16, NEONvshllu>;
+def VSHLLu32 : N2VLSh<1, 1, 0b100000, 0b1010, 0, 0, 1, "vshll.u32",
+ v2i64, v2i32, NEONvshllu>;
+
+// VSHLL : Vector Shift Left Long (with maximum shift count)
+def VSHLLi8 : N2VLSh<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll.i8",
+ v8i16, v8i8, NEONvshlli>;
+def VSHLLi16 : N2VLSh<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll.i16",
+ v4i32, v4i16, NEONvshlli>;
+def VSHLLi32 : N2VLSh<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll.i32",
+ v2i64, v2i32, NEONvshlli>;
+
+// VSHRN : Vector Shift Right and Narrow
+def VSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 0, 1, "vshrn.i16",
+ v8i8, v8i16, NEONvshrn>;
+def VSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 0, 1, "vshrn.i32",
+ v4i16, v4i32, NEONvshrn>;
+def VSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 0, 1, "vshrn.i64",
+ v2i32, v2i64, NEONvshrn>;
+
+// VRSHL : Vector Rounding Shift
+defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, "vrshl.s", int_arm_neon_vrshifts, 0>;
+defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, "vrshl.u", int_arm_neon_vrshiftu, 0>;
+// VRSHR : Vector Rounding Shift Right
+defm VRSHRs : N2VSh_QHSD<0, 1, 0b0010, 1, "vrshr.s", NEONvrshrs>;
+defm VRSHRu : N2VSh_QHSD<1, 1, 0b0010, 1, "vrshr.u", NEONvrshru>;
+
+// VRSHRN : Vector Rounding Shift Right and Narrow
+def VRSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 1, 1, "vrshrn.i16",
+ v8i8, v8i16, NEONvrshrn>;
+def VRSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 1, 1, "vrshrn.i32",
+ v4i16, v4i32, NEONvrshrn>;
+def VRSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 1, 1, "vrshrn.i64",
+ v2i32, v2i64, NEONvrshrn>;
+
+// VQSHL : Vector Saturating Shift
+defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, "vqshl.s", int_arm_neon_vqshifts, 0>;
+defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, "vqshl.u", int_arm_neon_vqshiftu, 0>;
+// VQSHL : Vector Saturating Shift Left (Immediate)
+defm VQSHLsi : N2VSh_QHSD<0, 1, 0b0111, 1, "vqshl.s", NEONvqshls>;
+defm VQSHLui : N2VSh_QHSD<1, 1, 0b0111, 1, "vqshl.u", NEONvqshlu>;
+// VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned)
+defm VQSHLsu : N2VSh_QHSD<1, 1, 0b0110, 1, "vqshlu.s", NEONvqshlsu>;
+
+// VQSHRN : Vector Saturating Shift Right and Narrow
+def VQSHRNs16 : N2VNSh<0, 1, 0b001000, 0b1001, 0, 0, 1, "vqshrn.s16",
+ v8i8, v8i16, NEONvqshrns>;
+def VQSHRNs32 : N2VNSh<0, 1, 0b010000, 0b1001, 0, 0, 1, "vqshrn.s32",
+ v4i16, v4i32, NEONvqshrns>;
+def VQSHRNs64 : N2VNSh<0, 1, 0b100000, 0b1001, 0, 0, 1, "vqshrn.s64",
+ v2i32, v2i64, NEONvqshrns>;
+def VQSHRNu16 : N2VNSh<1, 1, 0b001000, 0b1001, 0, 0, 1, "vqshrn.u16",
+ v8i8, v8i16, NEONvqshrnu>;
+def VQSHRNu32 : N2VNSh<1, 1, 0b010000, 0b1001, 0, 0, 1, "vqshrn.u32",
+ v4i16, v4i32, NEONvqshrnu>;
+def VQSHRNu64 : N2VNSh<1, 1, 0b100000, 0b1001, 0, 0, 1, "vqshrn.u64",
+ v2i32, v2i64, NEONvqshrnu>;
+
+// VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned)
+def VQSHRUN16 : N2VNSh<1, 1, 0b001000, 0b1000, 0, 0, 1, "vqshrun.s16",
+ v8i8, v8i16, NEONvqshrnsu>;
+def VQSHRUN32 : N2VNSh<1, 1, 0b010000, 0b1000, 0, 0, 1, "vqshrun.s32",
+ v4i16, v4i32, NEONvqshrnsu>;
+def VQSHRUN64 : N2VNSh<1, 1, 0b100000, 0b1000, 0, 0, 1, "vqshrun.s64",
+ v2i32, v2i64, NEONvqshrnsu>;
+
+// VQRSHL : Vector Saturating Rounding Shift
+defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, "vqrshl.s",
+ int_arm_neon_vqrshifts, 0>;
+defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, "vqrshl.u",
+ int_arm_neon_vqrshiftu, 0>;
+
+// VQRSHRN : Vector Saturating Rounding Shift Right and Narrow
+def VQRSHRNs16: N2VNSh<0, 1, 0b001000, 0b1001, 0, 1, 1, "vqrshrn.s16",
+ v8i8, v8i16, NEONvqrshrns>;
+def VQRSHRNs32: N2VNSh<0, 1, 0b010000, 0b1001, 0, 1, 1, "vqrshrn.s32",
+ v4i16, v4i32, NEONvqrshrns>;
+def VQRSHRNs64: N2VNSh<0, 1, 0b100000, 0b1001, 0, 1, 1, "vqrshrn.s64",
+ v2i32, v2i64, NEONvqrshrns>;
+def VQRSHRNu16: N2VNSh<1, 1, 0b001000, 0b1001, 0, 1, 1, "vqrshrn.u16",
+ v8i8, v8i16, NEONvqrshrnu>;
+def VQRSHRNu32: N2VNSh<1, 1, 0b010000, 0b1001, 0, 1, 1, "vqrshrn.u32",
+ v4i16, v4i32, NEONvqrshrnu>;
+def VQRSHRNu64: N2VNSh<1, 1, 0b100000, 0b1001, 0, 1, 1, "vqrshrn.u64",
+ v2i32, v2i64, NEONvqrshrnu>;
+
+// VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
+def VQRSHRUN16: N2VNSh<1, 1, 0b001000, 0b1000, 0, 1, 1, "vqrshrun.s16",
+ v8i8, v8i16, NEONvqrshrnsu>;
+def VQRSHRUN32: N2VNSh<1, 1, 0b010000, 0b1000, 0, 1, 1, "vqrshrun.s32",
+ v4i16, v4i32, NEONvqrshrnsu>;
+def VQRSHRUN64: N2VNSh<1, 1, 0b100000, 0b1000, 0, 1, 1, "vqrshrun.s64",
+ v2i32, v2i64, NEONvqrshrnsu>;
+
+// VSRA : Vector Shift Right and Accumulate
+defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra.s", NEONvshrs>;
+defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra.u", NEONvshru>;
+// VRSRA : Vector Rounding Shift Right and Accumulate
+defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra.s", NEONvrshrs>;
+defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra.u", NEONvrshru>;
+
+// VSLI : Vector Shift Left and Insert
+defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli.", NEONvsli>;
+// VSRI : Vector Shift Right and Insert
+defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri.", NEONvsri>;
+
+// Vector Absolute and Saturating Absolute.
+
+// VABS : Vector Absolute Value
+defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, "vabs.s",
+ int_arm_neon_vabs>;
+def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
+ v2f32, v2f32, int_arm_neon_vabsf>;
+def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
+ v4f32, v4f32, int_arm_neon_vabsf>;
+
+// VQABS : Vector Saturating Absolute Value
+defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, "vqabs.s",
+ int_arm_neon_vqabs>;
+
+// Vector Negate.
+
+def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>;
+def vneg_conv : PatFrag<(ops node:$in), (sub immAllZerosV_bc, node:$in)>;
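+
+// These fragments match integer negation written as a subtraction from the
+// all-zeros vector, e.g. (sub (v8i8 immAllZerosV), DPR:$x) is selected to
+// VNEGs8d below; vneg_conv covers the case where the zero vector appears
+// behind a bit_convert.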
+
+class VNEGD<bits<2> size, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src),
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set DPR:$dst, (Ty (vneg DPR:$src)))]>;
+class VNEGQ<bits<2> size, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src),
+ !strconcat(OpcodeStr, "\t$dst, $src"), "",
+ [(set QPR:$dst, (Ty (vneg QPR:$src)))]>;
+
+// VNEG : Vector Negate
+def VNEGs8d : VNEGD<0b00, "vneg.s8", v8i8>;
+def VNEGs16d : VNEGD<0b01, "vneg.s16", v4i16>;
+def VNEGs32d : VNEGD<0b10, "vneg.s32", v2i32>;
+def VNEGs8q : VNEGQ<0b00, "vneg.s8", v16i8>;
+def VNEGs16q : VNEGQ<0b01, "vneg.s16", v8i16>;
+def VNEGs32q : VNEGQ<0b10, "vneg.s32", v4i32>;
+
+// VNEG : Vector Negate (floating-point)
+def VNEGf32d : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
+ (outs DPR:$dst), (ins DPR:$src), "vneg.f32\t$dst, $src", "",
+ [(set DPR:$dst, (v2f32 (fneg DPR:$src)))]>;
+def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
+ (outs QPR:$dst), (ins QPR:$src), "vneg.f32\t$dst, $src", "",
+ [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>;
+
+def : Pat<(v8i8 (vneg_conv DPR:$src)), (VNEGs8d DPR:$src)>;
+def : Pat<(v4i16 (vneg_conv DPR:$src)), (VNEGs16d DPR:$src)>;
+def : Pat<(v2i32 (vneg_conv DPR:$src)), (VNEGs32d DPR:$src)>;
+def : Pat<(v16i8 (vneg_conv QPR:$src)), (VNEGs8q QPR:$src)>;
+def : Pat<(v8i16 (vneg_conv QPR:$src)), (VNEGs16q QPR:$src)>;
+def : Pat<(v4i32 (vneg_conv QPR:$src)), (VNEGs32q QPR:$src)>;
+
+// VQNEG : Vector Saturating Negate
+defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, "vqneg.s",
+ int_arm_neon_vqneg>;
+
+// Vector Bit Counting Operations.
+
+// VCLS : Vector Count Leading Sign Bits
+defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, "vcls.s",
+ int_arm_neon_vcls>;
+// VCLZ : Vector Count Leading Zeros
+defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, "vclz.i",
+ int_arm_neon_vclz>;
+// VCNT : Vector Count One Bits
+def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",
+ v8i8, v8i8, int_arm_neon_vcnt>;
+def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",
+ v16i8, v16i8, int_arm_neon_vcnt>;
+
+// Vector Move Operations.
+
+// VMOV : Vector Move (Register)
+
+def VMOVD : N3V<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src),
+ "vmov\t$dst, $src", "", []>;
+def VMOVQ : N3V<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src),
+ "vmov\t$dst, $src", "", []>;
+
+// VMOV : Vector Move (Immediate)
+
+// VMOV_get_imm8 xform function: convert build_vector to VMOV.i8 imm.
+def VMOV_get_imm8 : SDNodeXForm<build_vector, [{
+ return ARM::getVMOVImm(N, 1, *CurDAG);
+}]>;
+def vmovImm8 : PatLeaf<(build_vector), [{
+ return ARM::getVMOVImm(N, 1, *CurDAG).getNode() != 0;
+}], VMOV_get_imm8>;
+
+// VMOV_get_imm16 xform function: convert build_vector to VMOV.i16 imm.
+def VMOV_get_imm16 : SDNodeXForm<build_vector, [{
+ return ARM::getVMOVImm(N, 2, *CurDAG);
+}]>;
+def vmovImm16 : PatLeaf<(build_vector), [{
+ return ARM::getVMOVImm(N, 2, *CurDAG).getNode() != 0;
+}], VMOV_get_imm16>;
+
+// VMOV_get_imm32 xform function: convert build_vector to VMOV.i32 imm.
+def VMOV_get_imm32 : SDNodeXForm<build_vector, [{
+ return ARM::getVMOVImm(N, 4, *CurDAG);
+}]>;
+def vmovImm32 : PatLeaf<(build_vector), [{
+ return ARM::getVMOVImm(N, 4, *CurDAG).getNode() != 0;
+}], VMOV_get_imm32>;
+
+// VMOV_get_imm64 xform function: convert build_vector to VMOV.i64 imm.
+def VMOV_get_imm64 : SDNodeXForm<build_vector, [{
+ return ARM::getVMOVImm(N, 8, *CurDAG);
+}]>;
+def vmovImm64 : PatLeaf<(build_vector), [{
+ return ARM::getVMOVImm(N, 8, *CurDAG).getNode() != 0;
+}], VMOV_get_imm64>;
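+
+// Taken together, each vmovImmN PatLeaf only matches a build_vector whose
+// elements can be represented as a VMOV immediate of that element size
+// (ARM::getVMOVImm returns a null node otherwise), and the xform then
+// supplies the encoded immediate. For example, a v8i8 splat of such a
+// constant is selected to the VMOVv8i8 definition below.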
+
+// Note: Some of the cmode bits in the following VMOV instructions need to
+// be encoded based on the immed values.
+
+def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst),
+ (ins i8imm:$SIMM), "vmov.i8\t$dst, $SIMM", "",
+ [(set DPR:$dst, (v8i8 vmovImm8:$SIMM))]>;
+def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst),
+ (ins i8imm:$SIMM), "vmov.i8\t$dst, $SIMM", "",
+ [(set QPR:$dst, (v16i8 vmovImm8:$SIMM))]>;
+
+def VMOVv4i16 : N1ModImm<1, 0b000, 0b1000, 0, 0, 0, 1, (outs DPR:$dst),
+ (ins i16imm:$SIMM), "vmov.i16\t$dst, $SIMM", "",
+ [(set DPR:$dst, (v4i16 vmovImm16:$SIMM))]>;
+def VMOVv8i16 : N1ModImm<1, 0b000, 0b1000, 0, 1, 0, 1, (outs QPR:$dst),
+ (ins i16imm:$SIMM), "vmov.i16\t$dst, $SIMM", "",
+ [(set QPR:$dst, (v8i16 vmovImm16:$SIMM))]>;
+
+def VMOVv2i32 : N1ModImm<1, 0b000, 0b0000, 0, 0, 0, 1, (outs DPR:$dst),
+ (ins i32imm:$SIMM), "vmov.i32\t$dst, $SIMM", "",
+ [(set DPR:$dst, (v2i32 vmovImm32:$SIMM))]>;
+def VMOVv4i32 : N1ModImm<1, 0b000, 0b0000, 0, 1, 0, 1, (outs QPR:$dst),
+ (ins i32imm:$SIMM), "vmov.i32\t$dst, $SIMM", "",
+ [(set QPR:$dst, (v4i32 vmovImm32:$SIMM))]>;
+
+def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst),
+ (ins i64imm:$SIMM), "vmov.i64\t$dst, $SIMM", "",
+ [(set DPR:$dst, (v1i64 vmovImm64:$SIMM))]>;
+def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst),
+ (ins i64imm:$SIMM), "vmov.i64\t$dst, $SIMM", "",
+ [(set QPR:$dst, (v2i64 vmovImm64:$SIMM))]>;
+
+// VMOV : Vector Get Lane (move scalar to ARM core register)
+
+def VGETLNs8 : NVGetLane<0b11100101, 0b1011, 0b00,
+ (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
+ "vmov", ".s8\t$dst, $src[$lane]",
+ [(set GPR:$dst, (NEONvgetlanes (v8i8 DPR:$src),
+ imm:$lane))]>;
+def VGETLNs16 : NVGetLane<0b11100001, 0b1011, 0b01,
+ (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
+ "vmov", ".s16\t$dst, $src[$lane]",
+ [(set GPR:$dst, (NEONvgetlanes (v4i16 DPR:$src),
+ imm:$lane))]>;
+def VGETLNu8 : NVGetLane<0b11101101, 0b1011, 0b00,
+ (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
+ "vmov", ".u8\t$dst, $src[$lane]",
+ [(set GPR:$dst, (NEONvgetlaneu (v8i8 DPR:$src),
+ imm:$lane))]>;
+def VGETLNu16 : NVGetLane<0b11101001, 0b1011, 0b01,
+ (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
+ "vmov", ".u16\t$dst, $src[$lane]",
+ [(set GPR:$dst, (NEONvgetlaneu (v4i16 DPR:$src),
+ imm:$lane))]>;
+def VGETLNi32 : NVGetLane<0b11100001, 0b1011, 0b00,
+ (outs GPR:$dst), (ins DPR:$src, i32imm:$lane),
+ "vmov", ".32\t$dst, $src[$lane]",
+ [(set GPR:$dst, (extractelt (v2i32 DPR:$src),
+ imm:$lane))]>;
+// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
+def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
+ (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+ (SubReg_i8_reg imm:$lane))),
+ (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane),
+ (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+ (SubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane),
+ (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+ (SubReg_i8_reg imm:$lane))),
+ (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
+ (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+ (SubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane))>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+ (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
+ (SubReg_i32_reg imm:$lane))),
+ (SubReg_i32_lane imm:$lane))>;
+//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2),
+// (EXTRACT_SUBREG QPR:$src1, (SubReg_f64_reg imm:$src2))>;
+def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
+ (EXTRACT_SUBREG QPR:$src1, (SubReg_f64_reg imm:$src2))>;
+
+
+// VMOV : Vector Set Lane (move ARM core register to scalar)
+
+let Constraints = "$src1 = $dst" in {
+def VSETLNi8 : NVSetLane<0b11100100, 0b1011, 0b00, (outs DPR:$dst),
+ (ins DPR:$src1, GPR:$src2, i32imm:$lane),
+ "vmov", ".8\t$dst[$lane], $src2",
+ [(set DPR:$dst, (vector_insert (v8i8 DPR:$src1),
+ GPR:$src2, imm:$lane))]>;
+def VSETLNi16 : NVSetLane<0b11100000, 0b1011, 0b01, (outs DPR:$dst),
+ (ins DPR:$src1, GPR:$src2, i32imm:$lane),
+ "vmov", ".16\t$dst[$lane], $src2",
+ [(set DPR:$dst, (vector_insert (v4i16 DPR:$src1),
+ GPR:$src2, imm:$lane))]>;
+def VSETLNi32 : NVSetLane<0b11100000, 0b1011, 0b00, (outs DPR:$dst),
+ (ins DPR:$src1, GPR:$src2, i32imm:$lane),
+ "vmov", ".32\t$dst[$lane], $src2",
+ [(set DPR:$dst, (insertelt (v2i32 DPR:$src1),
+ GPR:$src2, imm:$lane))]>;
+}
+def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
+ (v16i8 (INSERT_SUBREG QPR:$src1,
+ (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
+ (SubReg_i8_reg imm:$lane))),
+ GPR:$src2, (SubReg_i8_lane imm:$lane)),
+ (SubReg_i8_reg imm:$lane)))>;
+def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
+ (v8i16 (INSERT_SUBREG QPR:$src1,
+ (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
+ (SubReg_i16_reg imm:$lane))),
+ GPR:$src2, (SubReg_i16_lane imm:$lane)),
+ (SubReg_i16_reg imm:$lane)))>;
+def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
+ (v4i32 (INSERT_SUBREG QPR:$src1,
+ (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
+ (SubReg_i32_reg imm:$lane))),
+ GPR:$src2, (SubReg_i32_lane imm:$lane)),
+ (SubReg_i32_reg imm:$lane)))>;
+
+//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+// (INSERT_SUBREG QPR:$src1, DPR:$src2, (SubReg_f64_reg imm:$src3))>;
+def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+ (INSERT_SUBREG QPR:$src1, DPR:$src2, (SubReg_f64_reg imm:$src3))>;
+
+// VDUP : Vector Duplicate (from ARM core register to all elements)
+
+def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return SVOp->isSplat() && SVOp->getSplatIndex() == 0;
+}]>;
+
+class VDUPD<bits<8> opcod1, bits<2> opcod3, string asmSize, ValueType Ty>
+ : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$dst), (ins GPR:$src),
+ "vdup", !strconcat(asmSize, "\t$dst, $src"),
+ [(set DPR:$dst, (Ty (splat_lo (scalar_to_vector GPR:$src), undef)))]>;
+class VDUPQ<bits<8> opcod1, bits<2> opcod3, string asmSize, ValueType Ty>
+ : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$dst), (ins GPR:$src),
+ "vdup", !strconcat(asmSize, "\t$dst, $src"),
+ [(set QPR:$dst, (Ty (splat_lo (scalar_to_vector GPR:$src), undef)))]>;
+
+def VDUP8d : VDUPD<0b11101100, 0b00, ".8", v8i8>;
+def VDUP16d : VDUPD<0b11101000, 0b01, ".16", v4i16>;
+def VDUP32d : VDUPD<0b11101000, 0b00, ".32", v2i32>;
+def VDUP8q : VDUPQ<0b11101110, 0b00, ".8", v16i8>;
+def VDUP16q : VDUPQ<0b11101010, 0b01, ".16", v8i16>;
+def VDUP32q : VDUPQ<0b11101010, 0b00, ".32", v4i32>;
+
+def VDUPfd : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$dst), (ins GPR:$src),
+ "vdup", ".32\t$dst, $src",
+ [(set DPR:$dst, (v2f32 (splat_lo
+ (scalar_to_vector
+ (f32 (bitconvert GPR:$src))),
+ undef)))]>;
+def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src),
+ "vdup", ".32\t$dst, $src",
+ [(set QPR:$dst, (v4f32 (splat_lo
+ (scalar_to_vector
+ (f32 (bitconvert GPR:$src))),
+ undef)))]>;
+
+// VDUP : Vector Duplicate Lane (from scalar to all elements)
+
+def SHUFFLE_get_splat_lane : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getSplatIndex(), MVT::i32);
+}]>;
+
+def splat_lane : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return SVOp->isSplat();
+}], SHUFFLE_get_splat_lane>;
+
+class VDUPLND<bits<2> op19_18, bits<2> op17_16, string OpcodeStr, ValueType Ty>
+ : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 0, 0,
+ (outs DPR:$dst), (ins DPR:$src, i32imm:$lane),
+ !strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "",
+ [(set DPR:$dst, (Ty (splat_lane:$lane DPR:$src, undef)))]>;
+
+// vector_shuffle requires that the source and destination types match, so
+// VDUP to a 128-bit result uses a target-specific VDUPLANEQ node.
+class VDUPLNQ<bits<2> op19_18, bits<2> op17_16, string OpcodeStr,
+ ValueType ResTy, ValueType OpTy>
+ : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 1, 0,
+ (outs QPR:$dst), (ins DPR:$src, i32imm:$lane),
+ !strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "",
+ [(set QPR:$dst, (ResTy (NEONvduplaneq (OpTy DPR:$src), imm:$lane)))]>;
+
+def VDUPLN8d : VDUPLND<0b00, 0b01, "vdup.8", v8i8>;
+def VDUPLN16d : VDUPLND<0b00, 0b10, "vdup.16", v4i16>;
+def VDUPLN32d : VDUPLND<0b01, 0b00, "vdup.32", v2i32>;
+def VDUPLNfd : VDUPLND<0b01, 0b00, "vdup.32", v2f32>;
+def VDUPLN8q : VDUPLNQ<0b00, 0b01, "vdup.8", v16i8, v8i8>;
+def VDUPLN16q : VDUPLNQ<0b00, 0b10, "vdup.16", v8i16, v4i16>;
+def VDUPLN32q : VDUPLNQ<0b01, 0b00, "vdup.32", v4i32, v2i32>;
+def VDUPLNfq : VDUPLNQ<0b01, 0b00, "vdup.32", v4f32, v2f32>;
+
+// VMOVN : Vector Narrowing Move
+defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, "vmovn.i",
+ int_arm_neon_vmovn>;
+// VQMOVN : Vector Saturating Narrowing Move
+defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, "vqmovn.s",
+ int_arm_neon_vqmovns>;
+defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, "vqmovn.u",
+ int_arm_neon_vqmovnu>;
+defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, "vqmovun.s",
+ int_arm_neon_vqmovnsu>;
+// VMOVL : Vector Lengthening Move
+defm VMOVLs : N2VLInt_QHS<0,1,0b1010,0,0,1, "vmovl.s", int_arm_neon_vmovls>;
+defm VMOVLu : N2VLInt_QHS<1,1,0b1010,0,0,1, "vmovl.u", int_arm_neon_vmovlu>;
+
+// Vector Conversions.
+
+// VCVT : Vector Convert Between Floating-Point and Integers
+def VCVTf2sd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt.s32.f32",
+ v2i32, v2f32, fp_to_sint>;
+def VCVTf2ud : N2VD<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt.u32.f32",
+ v2i32, v2f32, fp_to_uint>;
+def VCVTs2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt.f32.s32",
+ v2f32, v2i32, sint_to_fp>;
+def VCVTu2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt.f32.u32",
+ v2f32, v2i32, uint_to_fp>;
+
+def VCVTf2sq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt.s32.f32",
+ v4i32, v4f32, fp_to_sint>;
+def VCVTf2uq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt.u32.f32",
+ v4i32, v4f32, fp_to_uint>;
+def VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt.f32.s32",
+ v4f32, v4i32, sint_to_fp>;
+def VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt.f32.u32",
+ v4f32, v4i32, uint_to_fp>;
+
+// VCVT : Vector Convert Between Floating-Point and Fixed-Point.
+// Note: Some of the opcode bits in the following VCVT instructions need to
+// be encoded based on the immed values.
+def VCVTf2xsd : N2VCvtD<0, 1, 0b000000, 0b1111, 0, 1, "vcvt.s32.f32",
+ v2i32, v2f32, int_arm_neon_vcvtfp2fxs>;
+def VCVTf2xud : N2VCvtD<1, 1, 0b000000, 0b1111, 0, 1, "vcvt.u32.f32",
+ v2i32, v2f32, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2fd : N2VCvtD<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32",
+ v2f32, v2i32, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2fd : N2VCvtD<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32",
+ v2f32, v2i32, int_arm_neon_vcvtfxu2fp>;
+
+def VCVTf2xsq : N2VCvtQ<0, 1, 0b000000, 0b1111, 0, 1, "vcvt.s32.f32",
+ v4i32, v4f32, int_arm_neon_vcvtfp2fxs>;
+def VCVTf2xuq : N2VCvtQ<1, 1, 0b000000, 0b1111, 0, 1, "vcvt.u32.f32",
+ v4i32, v4f32, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2fq : N2VCvtQ<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32",
+ v4f32, v4i32, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2fq : N2VCvtQ<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32",
+ v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// bit_convert
+def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>;
+def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>;
+def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>;
+def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>;
+def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>;
+def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>;
+def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
+def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
+def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
+def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>;
+
+def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 9297f08d800b..1def0933d6c0 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -319,7 +319,7 @@ def tAND : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
def tASRri : TI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
"asr $dst, $lhs, $rhs",
- [(set tGPR:$dst, (sra tGPR:$lhs, imm:$rhs))]>;
+ [(set tGPR:$dst, (sra tGPR:$lhs, (i32 imm:$rhs)))]>;
def tASRrr : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
"asr $dst, $rhs",
@@ -367,7 +367,7 @@ def tEOR : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
def tLSLri : TI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
"lsl $dst, $lhs, $rhs",
- [(set tGPR:$dst, (shl tGPR:$lhs, imm:$rhs))]>;
+ [(set tGPR:$dst, (shl tGPR:$lhs, (i32 imm:$rhs)))]>;
def tLSLrr : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
"lsl $dst, $rhs",
@@ -375,7 +375,7 @@ def tLSLrr : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
def tLSRri : TI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
"lsr $dst, $lhs, $rhs",
- [(set tGPR:$dst, (srl tGPR:$lhs, imm:$rhs))]>;
+ [(set tGPR:$dst, (srl tGPR:$lhs, (i32 imm:$rhs)))]>;
def tLSRrr : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
"lsr $dst, $rhs",
@@ -429,18 +429,18 @@ def tREV : TI<(outs tGPR:$dst), (ins tGPR:$src),
def tREV16 : TI<(outs tGPR:$dst), (ins tGPR:$src),
"rev16 $dst, $src",
[(set tGPR:$dst,
- (or (and (srl tGPR:$src, 8), 0xFF),
- (or (and (shl tGPR:$src, 8), 0xFF00),
- (or (and (srl tGPR:$src, 8), 0xFF0000),
- (and (shl tGPR:$src, 8), 0xFF000000)))))]>,
+ (or (and (srl tGPR:$src, (i32 8)), 0xFF),
+ (or (and (shl tGPR:$src, (i32 8)), 0xFF00),
+ (or (and (srl tGPR:$src, (i32 8)), 0xFF0000),
+ (and (shl tGPR:$src, (i32 8)), 0xFF000000)))))]>,
Requires<[IsThumb, HasV6]>;
def tREVSH : TI<(outs tGPR:$dst), (ins tGPR:$src),
"revsh $dst, $src",
[(set tGPR:$dst,
(sext_inreg
- (or (srl (and tGPR:$src, 0xFFFF), 8),
- (shl tGPR:$src, 8)), i16))]>,
+ (or (srl (and tGPR:$src, 0xFFFF), (i32 8)),
+ (shl tGPR:$src, (i32 8))), i16))]>,
Requires<[IsThumb, HasV6]>;
def tROR : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 07c71da46d62..0aba2d522809 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -160,7 +160,7 @@ def tMOVi16 : PseudoInst<(outs GPR:$dst), (ins i32imm:$src),
[(set GPR:$dst, imm0_65535:$src)]>,
Requires<[HasThumb2]>;
-let isTwoAddress = 1 in
+let Constraints = "$src = $dst" in
def tMOVTi16 : PseudoInst<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm),
"movt $dst, $imm",
[(set GPR:$dst, (or (and GPR:$src, 0xffff),
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
index bbc13001ae55..bb0cc8ff0688 100644
--- a/lib/Target/ARM/ARMRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -235,8 +235,10 @@ ARMRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
};
static const unsigned DarwinCalleeSavedRegs[] = {
+    // The Darwin ABI deviates from the standard ARM ABI: R9 is not a
+    // callee-saved register.
ARM::LR, ARM::R7, ARM::R6, ARM::R5, ARM::R4,
- ARM::R11, ARM::R10, ARM::R9, ARM::R8,
+ ARM::R11, ARM::R10, ARM::R8,
ARM::D15, ARM::D14, ARM::D13, ARM::D12,
ARM::D11, ARM::D10, ARM::D9, ARM::D8,
@@ -256,6 +258,7 @@ ARMRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
&ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
0
};
+
static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = {
&ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
&ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass,
@@ -265,7 +268,33 @@ ARMRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
&ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
0
};
- return STI.isThumb() ? ThumbCalleeSavedRegClasses : CalleeSavedRegClasses;
+
+ static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = {
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass,
+
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ 0
+ };
+
+ static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] ={
+ &ARM::GPRRegClass, &ARM::tGPRRegClass, &ARM::tGPRRegClass,
+ &ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass,
+ &ARM::GPRRegClass, &ARM::GPRRegClass,
+
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+ 0
+ };
+
+ if (STI.isThumb()) {
+ return STI.isTargetDarwin()
+ ? DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses;
+ }
+ return STI.isTargetDarwin()
+ ? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses;
}
BitVector ARMRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -497,7 +526,9 @@ ARMRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
///
bool ARMRegisterInfo::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
- return NoFramePointerElim || MFI->hasVarSizedObjects();
+ return (NoFramePointerElim ||
+ MFI->hasVarSizedObjects() ||
+ MFI->isFrameAddressTaken());
}
// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index d864079f85e1..a057e5cabf60 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -77,6 +77,34 @@ def D13 : ARMReg<13, "d13", [S26, S27]>;
def D14 : ARMReg<14, "d14", [S28, S29]>;
def D15 : ARMReg<15, "d15", [S30, S31]>;
+// VFP3 defines 16 additional double registers
+def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">;
+def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">;
+def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">;
+def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">;
+def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">;
+def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">;
+def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">;
+def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">;
+
+// Advanced SIMD (NEON) defines 16 quad-word aliases
+def Q0 : ARMReg< 0, "q0", [D0, D1]>;
+def Q1 : ARMReg< 1, "q1", [D2, D3]>;
+def Q2 : ARMReg< 2, "q2", [D4, D5]>;
+def Q3 : ARMReg< 3, "q3", [D6, D7]>;
+def Q4 : ARMReg< 4, "q4", [D8, D9]>;
+def Q5 : ARMReg< 5, "q5", [D10, D11]>;
+def Q6 : ARMReg< 6, "q6", [D12, D13]>;
+def Q7 : ARMReg< 7, "q7", [D14, D15]>;
+def Q8 : ARMReg< 8, "q8", [D16, D17]>;
+def Q9 : ARMReg< 9, "q9", [D18, D19]>;
+def Q10 : ARMReg<10, "q10", [D20, D21]>;
+def Q11 : ARMReg<11, "q11", [D22, D23]>;
+def Q12 : ARMReg<12, "q12", [D24, D25]>;
+def Q13 : ARMReg<13, "q13", [D26, D27]>;
+def Q14 : ARMReg<14, "q14", [D28, D29]>;
+def Q15 : ARMReg<15, "q15", [D30, D31]>;
+
// Current Program Status Register.
def CPSR : ARMReg<0, "cpsr">;
@@ -87,6 +115,7 @@ def CPSR : ARMReg<0, "cpsr">;
// sp == Stack Pointer
// r12 == ip (scratch)
// r7 == Frame Pointer (thumb-style backtraces)
+// r9 == May be reserved as Thread Register
// r11 == Frame Pointer (arm-style backtraces)
// r10 == Stack Limit
//
@@ -115,13 +144,13 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
ARM::R4, ARM::R5, ARM::R6, ARM::R7,
ARM::R8, ARM::R10,
ARM::R11 };
- // FP is R7, R9 is available.
+  // FP is R7, R9 is available as a non-callee-saved register.
+ // This is used by Darwin.
static const unsigned ARM_GPR_AO_3[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3,
- ARM::R12,ARM::LR,
+ ARM::R9, ARM::R12,ARM::LR,
ARM::R4, ARM::R5, ARM::R6,
- ARM::R8, ARM::R9, ARM::R10,ARM::R11,
- ARM::R7 };
+ ARM::R8, ARM::R10,ARM::R11,ARM::R7 };
// FP is R7, R9 is not available.
static const unsigned ARM_GPR_AO_4[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3,
@@ -155,17 +184,15 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
GPRClass::iterator I;
if (Subtarget.isTargetDarwin()) {
- if (Subtarget.isR9Reserved()) {
+ if (Subtarget.isR9Reserved())
I = ARM_GPR_AO_4 + (sizeof(ARM_GPR_AO_4)/sizeof(unsigned));
- } else {
+ else
I = ARM_GPR_AO_3 + (sizeof(ARM_GPR_AO_3)/sizeof(unsigned));
- }
} else {
- if (Subtarget.isR9Reserved()) {
+ if (Subtarget.isR9Reserved())
I = ARM_GPR_AO_2 + (sizeof(ARM_GPR_AO_2)/sizeof(unsigned));
- } else {
+ else
I = ARM_GPR_AO_1 + (sizeof(ARM_GPR_AO_1)/sizeof(unsigned));
- }
}
// Mac OS X requires FP not to be clobbered for backtracing purpose.
@@ -208,14 +235,67 @@ def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
}];
}
+// Scalar single-precision floating-point register class.
def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22,
S23, S24, S25, S26, S27, S28, S29, S30, S31]>;
+// Scalar double-precision floating-point / generic 64-bit vector register
+// class.
// ARM requires only word alignment for double. It's more performant if it
// is double-word aligned, though.
-def DPR : RegisterClass<"ARM", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7, D8,
- D9, D10, D11, D12, D13, D14, D15]>;
+def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+ [D0, D1, D2, D3, D4, D5, D6, D7,
+ D8, D9, D10, D11, D12, D13, D14, D15]> {
+ let SubRegClassList = [SPR, SPR];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // VFP2
+ static const unsigned ARM_DPR_VFP2[] = {
+ ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7,
+ ARM::D8, ARM::D9, ARM::D10, ARM::D11,
+ ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
+ // VFP3
+ static const unsigned ARM_DPR_VFP3[] = {
+ ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7,
+ ARM::D8, ARM::D9, ARM::D10, ARM::D11,
+ ARM::D12, ARM::D13, ARM::D14, ARM::D15,
+      ARM::D16, ARM::D17, ARM::D18, ARM::D19,
+ ARM::D20, ARM::D21, ARM::D22, ARM::D23,
+ ARM::D24, ARM::D25, ARM::D26, ARM::D27,
+ ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
+ DPRClass::iterator
+ DPRClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+ if (Subtarget.hasVFP3())
+ return ARM_DPR_VFP3;
+ return ARM_DPR_VFP2;
+ }
+
+ DPRClass::iterator
+ DPRClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+ if (Subtarget.hasVFP3())
+ return ARM_DPR_VFP3 + (sizeof(ARM_DPR_VFP3)/sizeof(unsigned));
+ else
+ return ARM_DPR_VFP2 + (sizeof(ARM_DPR_VFP2)/sizeof(unsigned));
+ }
+ }];
+}
+
+// Generic 128-bit vector register class.
+def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
+ Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> {
+ let SubRegClassList = [SPR, SPR, SPR, SPR, DPR, DPR];
+}
// Condition code registers.
def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
@@ -225,12 +305,40 @@ def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
// sub registers for each register.
//
-def : SubRegSet<1, [D0, D1, D2, D3, D4, D5, D6, D7,
- D8, D9, D10, D11, D12, D13, D14, D15],
- [S0, S2, S4, S6, S8, S10, S12, S14,
- S16, S18, S20, S22, S24, S26, S28, S30]>;
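+// Sub-register index constants. These values correspond to the SubRegSet
+// indices defined below and to the index argument of getSubReg(); e.g.
+// arm_dsubreg_0 and arm_dsubreg_1 select the low and high D register of a
+// Q register.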
+def arm_ssubreg_0 : PatLeaf<(i32 1)>;
+def arm_ssubreg_1 : PatLeaf<(i32 2)>;
+def arm_ssubreg_2 : PatLeaf<(i32 3)>;
+def arm_ssubreg_3 : PatLeaf<(i32 4)>;
+def arm_dsubreg_0 : PatLeaf<(i32 5)>;
+def arm_dsubreg_1 : PatLeaf<(i32 6)>;
-def : SubRegSet<2, [D0, D1, D2, D3, D4, D5, D6, D7,
- D8, D9, D10, D11, D12, D13, D14, D15],
- [S1, S3, S5, S7, S9, S11, S13, S15,
+// S sub-registers of D registers.
+def : SubRegSet<1, [D0, D1, D2, D3, D4, D5, D6, D7,
+ D8, D9, D10, D11, D12, D13, D14, D15],
+ [S0, S2, S4, S6, S8, S10, S12, S14,
+ S16, S18, S20, S22, S24, S26, S28, S30]>;
+def : SubRegSet<2, [D0, D1, D2, D3, D4, D5, D6, D7,
+ D8, D9, D10, D11, D12, D13, D14, D15],
+ [S1, S3, S5, S7, S9, S11, S13, S15,
S17, S19, S21, S23, S25, S27, S29, S31]>;
+
+// S sub-registers of Q registers.
+def : SubRegSet<1, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
+ [S0, S4, S8, S12, S16, S20, S24, S28]>;
+def : SubRegSet<2, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
+ [S1, S5, S9, S13, S17, S21, S25, S29]>;
+def : SubRegSet<3, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
+ [S2, S6, S10, S14, S18, S22, S26, S30]>;
+def : SubRegSet<4, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
+ [S3, S7, S11, S15, S19, S23, S27, S31]>;
+
+// D sub-registers of Q registers.
+def : SubRegSet<5, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
+ Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
+ [D0, D2, D4, D6, D8, D10, D12, D14,
+ D16, D18, D20, D22, D24, D26, D28, D30]>;
+def : SubRegSet<6, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
+ Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
+ [D1, D3, D5, D7, D9, D11, D13, D15,
+ D17, D19, D21, D23, D25, D27, D29, D31]>;
+
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 7ac7b4923d68..e61108857413 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -16,15 +16,20 @@
#include "llvm/Module.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
+static cl::opt<bool>
+ReserveR9("arm-reserve-r9", cl::Hidden,
+ cl::desc("Reserve R9, making it unavailable as GPR"));
+
ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
bool isThumb)
: ARMArchVersion(V4T)
, ARMFPUType(None)
, IsThumb(isThumb)
, ThumbMode(Thumb1)
- , IsR9Reserved(false)
+ , IsR9Reserved(ReserveR9)
, stackAlignment(4)
, CPUString("generic")
, TargetType(isELF) // Default to ELF unless otherwise specified.
@@ -46,7 +51,7 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
if (Len >= 5 && TT.substr(0, 4) == "armv")
Idx = 4;
- else if (Len >= 6 && TT.substr(0, 6) == "thumb") {
+ else if (Len >= 6 && TT.substr(0, 5) == "thumb") {
IsThumb = true;
if (Len >= 7 && TT[5] == 'v')
Idx = 6;
@@ -54,15 +59,19 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
if (Idx) {
unsigned SubVer = TT[Idx];
if (SubVer > '4' && SubVer <= '9') {
- if (SubVer >= '7')
+ if (SubVer >= '7') {
ARMArchVersion = V7A;
- else if (SubVer == '6')
+ } else if (SubVer == '6') {
ARMArchVersion = V6;
- else if (SubVer == '5') {
+ if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2')
+ ARMArchVersion = V6T2;
+ } else if (SubVer == '5') {
ARMArchVersion = V5T;
if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e')
ARMArchVersion = V5TE;
}
+ if (ARMArchVersion >= V6T2)
+ ThumbMode = Thumb2;
}
}
@@ -83,5 +92,5 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
stackAlignment = 8;
if (isTargetDarwin())
- IsR9Reserved = true;
+ IsR9Reserved = ReserveR9 | (ARMArchVersion < V6);
}
diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
index 948a10070d47..58ba50e7ed77 100644
--- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
@@ -285,12 +285,22 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
const char *Modifier) {
const MachineOperand &MO = MI->getOperand(opNum);
switch (MO.getType()) {
- case MachineOperand::MO_Register:
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
- O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
- else
+ case MachineOperand::MO_Register: {
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Modifier && strcmp(Modifier, "dregpair") == 0) {
+ unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
+ unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
+ O << '{'
+ << TRI->getAsmName(DRegLo) << "-" << TRI->getAsmName(DRegHi)
+ << '}';
+ } else {
+ O << TRI->getAsmName(Reg);
+ }
+ } else
assert(0 && "not implemented");
break;
+ }
case MachineOperand::MO_Immediate: {
if (!Modifier || strcmp(Modifier, "no_hash") != 0)
O << "#";
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index 068c441ed737..0252a4aef413 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -552,3 +552,23 @@ __Z11no_overflowjj:
//===---------------------------------------------------------------------===//
+Some of the NEON intrinsics may be appropriate for more general use, either
+as target-independent intrinsics or perhaps elsewhere in the ARM backend.
+Some of them may also be lowered to target-independent SDNodes, and perhaps
+some new SDNodes could be added.
+
+For example, maximum, minimum, and absolute value operations are well-defined
+and standard operations, both for vector and scalar types.
+
+The current NEON-specific intrinsics for count leading zeros and count one
+bits could perhaps be replaced by the target-independent ctlz and ctpop
+intrinsics. It may also make sense to add a target-independent "ctls"
+intrinsic for "count leading sign bits". Likewise, the backend could use
+the target-independent SDNodes for these operations.
+
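+As an illustration (a sketch only: the function name below is invented, and
+the NEON patterns currently use int_arm_neon_vcnt rather than this form), a
+vector population count written with the target-independent intrinsic would
+look like:
+
+  define <8 x i8> @vcnt8(<8 x i8> %a) nounwind {
+    ; hypothetical example: generic ctpop instead of int_arm_neon_vcnt
+    %tmp = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
+    ret <8 x i8> %tmp
+  }
+  declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>)
+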
+ARMv6 has scalar saturating and halving adds and subtracts. The same
+intrinsics could possibly be used for both NEON's vector implementations of
+those operations and the ARMv6 scalar versions.
+
+//===---------------------------------------------------------------------===//
+
diff --git a/lib/Target/PIC16/PIC16ISelLowering.cpp b/lib/Target/PIC16/PIC16ISelLowering.cpp
index f113a483d692..122af70546d9 100644
--- a/lib/Target/PIC16/PIC16ISelLowering.cpp
+++ b/lib/Target/PIC16/PIC16ISelLowering.cpp
@@ -702,10 +702,12 @@ void PIC16TargetLowering::LegalizeAddress(SDValue Ptr, SelectionDAG &DAG,
if (Ptr.getOpcode() == ISD::ADD) {
SDValue OperLeft = Ptr.getOperand(0);
SDValue OperRight = Ptr.getOperand(1);
- if (OperLeft.getOpcode() == ISD::Constant) {
+ if ((OperLeft.getOpcode() == ISD::Constant) &&
+        (dyn_cast<ConstantSDNode>(OperLeft)->getZExtValue() < 32)) {
Offset = dyn_cast<ConstantSDNode>(OperLeft)->getZExtValue();
Ptr = OperRight;
- } else if (OperRight.getOpcode() == ISD::Constant) {
+ } else if ((OperRight.getOpcode() == ISD::Constant) &&
+               (dyn_cast<ConstantSDNode>(OperRight)->getZExtValue() < 32)) {
Offset = dyn_cast<ConstantSDNode>(OperRight)->getZExtValue();
Ptr = OperLeft;
}
diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp
index 67fefbb70b6a..7b843df7422d 100644
--- a/lib/Target/TargetData.cpp
+++ b/lib/Target/TargetData.cpp
@@ -23,6 +23,7 @@
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/System/Mutex.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringExtras.h"
#include <algorithm>
@@ -345,11 +346,13 @@ typedef DenseMap<LayoutKey, StructLayout*, DenseMapLayoutKeyInfo> LayoutInfoTy;
}
static ManagedStatic<LayoutInfoTy> LayoutInfo;
+static ManagedStatic<sys::SmartMutex<true> > LayoutLock;
TargetData::~TargetData() {
if (!LayoutInfo.isConstructed())
return;
+ sys::SmartScopedLock<true> Lock(&*LayoutLock);
// Remove any layouts for this TD.
LayoutInfoTy &TheMap = *LayoutInfo;
for (LayoutInfoTy::iterator I = TheMap.begin(), E = TheMap.end(); I != E; ) {
@@ -366,6 +369,7 @@ TargetData::~TargetData() {
const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
LayoutInfoTy &TheMap = *LayoutInfo;
+ sys::SmartScopedLock<true> Lock(&*LayoutLock);
StructLayout *&SL = TheMap[LayoutKey(this, Ty)];
if (SL) return SL;
@@ -390,6 +394,7 @@ const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
void TargetData::InvalidateStructLayoutInfo(const StructType *Ty) const {
if (!LayoutInfo.isConstructed()) return; // No cache.
+ sys::SmartScopedLock<true> Lock(&*LayoutLock);
LayoutInfoTy::iterator I = LayoutInfo->find(LayoutKey(this, Ty));
if (I == LayoutInfo->end()) return;
diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp
index d84034b9ed4f..315118f61091 100644
--- a/lib/Target/X86/X86ELFWriterInfo.cpp
+++ b/lib/Target/X86/X86ELFWriterInfo.cpp
@@ -12,11 +12,17 @@
//===----------------------------------------------------------------------===//
#include "X86ELFWriterInfo.h"
+#include "X86Relocations.h"
#include "llvm/Function.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetMachine.h"
+
using namespace llvm;
+//===----------------------------------------------------------------------===//
+// Implementation of the X86ELFWriterInfo class
+//===----------------------------------------------------------------------===//
+
X86ELFWriterInfo::X86ELFWriterInfo(TargetMachine &TM)
: TargetELFWriterInfo(TM) {
bool is64Bit = TM.getTargetData()->getPointerSizeInBits() == 64;
@@ -25,6 +31,34 @@ X86ELFWriterInfo::X86ELFWriterInfo(TargetMachine &TM)
X86ELFWriterInfo::~X86ELFWriterInfo() {}
+unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
+ if (is64Bit) {
+ switch(MachineRelTy) {
+ case X86::reloc_pcrel_word:
+ return R_X86_64_PC32;
+ case X86::reloc_absolute_word:
+ return R_X86_64_32;
+ case X86::reloc_absolute_dword:
+ return R_X86_64_64;
+ case X86::reloc_picrel_word:
+ default:
+ assert(0 && "unknown relocation type");
+ }
+ } else {
+ switch(MachineRelTy) {
+ case X86::reloc_pcrel_word:
+ return R_386_PC32;
+ case X86::reloc_absolute_word:
+ return R_386_32;
+ case X86::reloc_absolute_dword:
+ case X86::reloc_picrel_word:
+ default:
+ assert(0 && "unknown relocation type");
+ }
+ }
+ return 0;
+}
+
unsigned X86ELFWriterInfo::getFunctionAlignment(const Function *F) const {
unsigned FnAlign = 4;
@@ -36,3 +70,15 @@ unsigned X86ELFWriterInfo::getFunctionAlignment(const Function *F) const {
return (1 << FnAlign);
}
+
+long int X86ELFWriterInfo::getAddendForRelTy(unsigned RelTy) const {
+ if (is64Bit) {
+ switch(RelTy) {
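+      // R_X86_64_PC32 resolves to S + A - P, where P is the address of the
+      // 4-byte field being relocated; the -4 addend rebases the result to
+      // the end of the field, which is what a rel32 operand expects.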
+      case R_X86_64_PC32:
+        return -4;
+ default:
+ assert(0 && "unknown x86 relocation type");
+ }
+ }
+ return 0;
+}
diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h
index e9c5bc45cba3..96485b897071 100644
--- a/lib/Target/X86/X86ELFWriterInfo.h
+++ b/lib/Target/X86/X86ELFWriterInfo.h
@@ -19,11 +19,43 @@
namespace llvm {
class X86ELFWriterInfo : public TargetELFWriterInfo {
+
+ // ELF Relocation types for X86
+ enum X86RelocationType {
+ R_386_NONE = 0,
+ R_386_32 = 1,
+ R_386_PC32 = 2
+ };
+
+ // ELF Relocation types for X86_64
+ enum X86_64RelocationType {
+ R_X86_64_NONE = 0,
+ R_X86_64_64 = 1,
+ R_X86_64_PC32 = 2,
+ R_X86_64_32 = 10,
+ R_X86_64_32S = 11,
+ R_X86_64_PC64 = 24
+ };
+
public:
X86ELFWriterInfo(TargetMachine &TM);
virtual ~X86ELFWriterInfo();
+  /// getFunctionAlignment - Returns the alignment for function 'F'. Targets
+  /// with different alignment constraints should overload this method.
virtual unsigned getFunctionAlignment(const Function *F) const;
+
+  /// getRelocationType - Returns the target-specific ELF relocation type.
+  /// 'MachineRelTy' contains the object-code-independent relocation type.
+ virtual unsigned getRelocationType(unsigned MachineRelTy) const;
+
+ /// hasRelocationAddend - True if the target uses an addend in the
+ /// ELF relocation entry.
+  virtual bool hasRelocationAddend() const { return is64Bit; }
+
+  /// getAddendForRelTy - Gets the addend value for an ELF relocation entry
+  /// based on the target relocation type.
+ virtual long int getAddendForRelTy(unsigned RelTy) const;
};
} // end llvm namespace
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 326fb38909b5..6c20e7d14074 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -96,7 +96,7 @@ namespace {
void RewriteNonIntegerIVs(Loop *L);
- ICmpInst *LinearFunctionTestReplace(Loop *L, SCEVHandle BackedgeTakenCount,
+ ICmpInst *LinearFunctionTestReplace(Loop *L, const SCEV* BackedgeTakenCount,
Value *IndVar,
BasicBlock *ExitingBlock,
BranchInst *BI,
@@ -128,7 +128,7 @@ Pass *llvm::createIndVarSimplifyPass() {
/// SCEV analysis can determine a loop-invariant trip count of the loop, which
/// is actually a much broader range than just linear tests.
ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
- SCEVHandle BackedgeTakenCount,
+ const SCEV* BackedgeTakenCount,
Value *IndVar,
BasicBlock *ExitingBlock,
BranchInst *BI,
@@ -137,13 +137,13 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
// against the preincremented value; otherwise we prefer to compare against
// the post-incremented value.
Value *CmpIndVar;
- SCEVHandle RHS = BackedgeTakenCount;
+ const SCEV* RHS = BackedgeTakenCount;
if (ExitingBlock == L->getLoopLatch()) {
// Add one to the "backedge-taken" count to get the trip count.
// If this addition may overflow, we have to be more pessimistic and
// cast the induction variable before doing the add.
- SCEVHandle Zero = SE->getIntegerSCEV(0, BackedgeTakenCount->getType());
- SCEVHandle N =
+ const SCEV* Zero = SE->getIntegerSCEV(0, BackedgeTakenCount->getType());
+ const SCEV* N =
SE->getAddExpr(BackedgeTakenCount,
SE->getIntegerSCEV(1, BackedgeTakenCount->getType()));
if ((isa<SCEVConstant>(N) && !N->isZero()) ||
@@ -278,7 +278,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L,
// Okay, this instruction has a user outside of the current loop
// and varies predictably *inside* the loop. Evaluate the value it
// contains when the loop exits, if possible.
- SCEVHandle ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+ const SCEV* ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
if (!ExitValue->isLoopInvariant(L))
continue;
@@ -348,7 +348,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
BasicBlock *Header = L->getHeader();
BasicBlock *ExitingBlock = L->getExitingBlock(); // may be null
- SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ const SCEV* BackedgeTakenCount = SE->getBackedgeTakenCount(L);
// Check to see if this loop has a computable loop-invariant execution count.
// If so, this means that we can compute the final value of any expressions
@@ -373,14 +373,14 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
NeedCannIV = true;
}
for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
- SCEVHandle Stride = IU->StrideOrder[i];
+ const SCEV* Stride = IU->StrideOrder[i];
const Type *Ty = SE->getEffectiveSCEVType(Stride->getType());
if (!LargestType ||
SE->getTypeSizeInBits(Ty) >
SE->getTypeSizeInBits(LargestType))
LargestType = Ty;
- std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[i]);
assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
@@ -473,21 +473,20 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType,
// the need for the code evaluation methods to insert induction variables
// of different sizes.
for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
- SCEVHandle Stride = IU->StrideOrder[i];
+ const SCEV* Stride = IU->StrideOrder[i];
- std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[i]);
assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
ilist<IVStrideUse> &List = SI->second->Users;
for (ilist<IVStrideUse>::iterator UI = List.begin(),
E = List.end(); UI != E; ++UI) {
- SCEVHandle Offset = UI->getOffset();
Value *Op = UI->getOperandValToReplace();
const Type *UseTy = Op->getType();
Instruction *User = UI->getUser();
// Compute the final addrec to expand into code.
- SCEVHandle AR = IU->getReplacementExpr(*UI);
+ const SCEV* AR = IU->getReplacementExpr(*UI);
Value *NewVal = 0;
if (AR->isLoopInvariant(L)) {
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 65126728c7fc..302cdec2ba4a 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -187,7 +187,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
// Don't remove loops for which we can't solve the trip count.
// They could be infinite, in which case we'd be changing program behavior.
ScalarEvolution& SE = getAnalysis<ScalarEvolution>();
- SCEVHandle S = SE.getBackedgeTakenCount(L);
+ const SCEV* S = SE.getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(S))
return false;
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 7579748bbc0a..ba600584865d 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -64,11 +64,11 @@ namespace {
/// StrengthReduceStridedIVUsers. It contains the stride, the common base, as
/// well as the PHI node and increment value created for rewrite.
struct VISIBILITY_HIDDEN IVExpr {
- SCEVHandle Stride;
- SCEVHandle Base;
+ const SCEV* Stride;
+ const SCEV* Base;
PHINode *PHI;
- IVExpr(const SCEVHandle &stride, const SCEVHandle &base, PHINode *phi)
+ IVExpr(const SCEV* const stride, const SCEV* const base, PHINode *phi)
: Stride(stride), Base(base), PHI(phi) {}
};
@@ -77,7 +77,7 @@ namespace {
struct VISIBILITY_HIDDEN IVsOfOneStride {
std::vector<IVExpr> IVs;
- void addIV(const SCEVHandle &Stride, const SCEVHandle &Base, PHINode *PHI) {
+ void addIV(const SCEV* const Stride, const SCEV* const Base, PHINode *PHI) {
IVs.push_back(IVExpr(Stride, Base, PHI));
}
};
@@ -91,11 +91,11 @@ namespace {
/// IVsByStride - Keep track of all IVs that have been inserted for a
/// particular stride.
- std::map<SCEVHandle, IVsOfOneStride> IVsByStride;
+ std::map<const SCEV*, IVsOfOneStride> IVsByStride;
/// StrideNoReuse - Keep track of all the strides whose ivs cannot be
/// reused (nor should they be rewritten to reuse other strides).
- SmallSet<SCEVHandle, 4> StrideNoReuse;
+ SmallSet<const SCEV*, 4> StrideNoReuse;
/// DeadInsts - Keep track of instructions we may have made dead, so that
/// we can remove them after we are done working.
@@ -133,7 +133,7 @@ namespace {
private:
ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond,
IVStrideUse* &CondUse,
- const SCEVHandle* &CondStride);
+ const SCEV* const * &CondStride);
void OptimizeIndvars(Loop *L);
void OptimizeLoopCountIV(Loop *L);
@@ -149,16 +149,16 @@ namespace {
IVStrideUse* &CondUse);
bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
- const SCEVHandle *&CondStride);
+ const SCEV* const * &CondStride);
bool RequiresTypeConversion(const Type *Ty, const Type *NewTy);
- SCEVHandle CheckForIVReuse(bool, bool, bool, const SCEVHandle&,
+ const SCEV* CheckForIVReuse(bool, bool, bool, const SCEV* const&,
IVExpr&, const Type*,
const std::vector<BasedUser>& UsersToProcess);
bool ValidScale(bool, int64_t,
const std::vector<BasedUser>& UsersToProcess);
bool ValidOffset(bool, int64_t, int64_t,
const std::vector<BasedUser>& UsersToProcess);
- SCEVHandle CollectIVUsers(const SCEVHandle &Stride,
+ const SCEV* CollectIVUsers(const SCEV* const &Stride,
IVUsersOfOneStride &Uses,
Loop *L,
bool &AllUsesAreAddresses,
@@ -168,11 +168,11 @@ namespace {
const std::vector<BasedUser> &UsersToProcess,
const Loop *L,
bool AllUsesAreAddresses,
- SCEVHandle Stride);
+ const SCEV* Stride);
void PrepareToStrengthReduceFully(
std::vector<BasedUser> &UsersToProcess,
- SCEVHandle Stride,
- SCEVHandle CommonExprs,
+ const SCEV* Stride,
+ const SCEV* CommonExprs,
const Loop *L,
SCEVExpander &PreheaderRewriter);
void PrepareToStrengthReduceFromSmallerStride(
@@ -182,13 +182,13 @@ namespace {
Instruction *PreInsertPt);
void PrepareToStrengthReduceWithNewPhi(
std::vector<BasedUser> &UsersToProcess,
- SCEVHandle Stride,
- SCEVHandle CommonExprs,
+ const SCEV* Stride,
+ const SCEV* CommonExprs,
Value *CommonBaseV,
Instruction *IVIncInsertPt,
const Loop *L,
SCEVExpander &PreheaderRewriter);
- void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
+ void StrengthReduceStridedIVUsers(const SCEV* const &Stride,
IVUsersOfOneStride &Uses,
Loop *L);
void DeleteTriviallyDeadInstructions();
@@ -232,7 +232,7 @@ void LoopStrengthReduce::DeleteTriviallyDeadInstructions() {
/// containsAddRecFromDifferentLoop - Determine whether expression S involves a
/// subexpression that is an AddRec from a loop other than L. An outer loop
/// of L is OK, but not an inner loop nor a disjoint loop.
-static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) {
+static bool containsAddRecFromDifferentLoop(const SCEV* S, Loop *L) {
// This is very common, put it first.
if (isa<SCEVConstant>(S))
return false;
@@ -327,7 +327,7 @@ namespace {
/// this use. As the use is processed, information gets moved from this
/// field to the Imm field (below). BasedUser values are sorted by this
/// field.
- SCEVHandle Base;
+ const SCEV* Base;
/// Inst - The instruction using the induction variable.
Instruction *Inst;
@@ -340,7 +340,7 @@ namespace {
/// before Inst, because it will be folded into the imm field of the
/// instruction. This is also sometimes used for loop-variant values that
/// must be added inside the loop.
- SCEVHandle Imm;
+ const SCEV* Imm;
/// Phi - The induction variable that performs the striding that
/// should be used for this user.
@@ -362,13 +362,13 @@ namespace {
// Once we rewrite the code to insert the new IVs we want, update the
// operands of Inst to use the new expression 'NewBase', with 'Imm' added
// to it.
- void RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
+ void RewriteInstructionToUseNewBase(const SCEV* const &NewBase,
Instruction *InsertPt,
SCEVExpander &Rewriter, Loop *L, Pass *P,
LoopInfo &LI,
SmallVectorImpl<WeakVH> &DeadInsts);
- Value *InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
+ Value *InsertCodeForBaseAtPosition(const SCEV* const &NewBase,
const Type *Ty,
SCEVExpander &Rewriter,
Instruction *IP, Loop *L,
@@ -383,7 +383,7 @@ void BasedUser::dump() const {
cerr << " Inst: " << *Inst;
}
-Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
+Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV* const &NewBase,
const Type *Ty,
SCEVExpander &Rewriter,
Instruction *IP, Loop *L,
@@ -407,7 +407,7 @@ Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
Value *Base = Rewriter.expandCodeFor(NewBase, 0, BaseInsertPt);
- SCEVHandle NewValSCEV = SE->getUnknown(Base);
+ const SCEV* NewValSCEV = SE->getUnknown(Base);
// If there is no immediate value, skip the next part.
if (!Imm->isZero()) {
@@ -430,7 +430,7 @@ Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
// value of NewBase in the case that it's a different instruction from
// the PHI that NewBase is computed from, or null otherwise.
//
-void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
+void BasedUser::RewriteInstructionToUseNewBase(const SCEV* const &NewBase,
Instruction *NewBasePt,
SCEVExpander &Rewriter, Loop *L, Pass *P,
LoopInfo &LI,
@@ -542,7 +542,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
/// fitsInAddressMode - Return true if V can be subsumed within an addressing
/// mode, and does not need to be put in a register first.
-static bool fitsInAddressMode(const SCEVHandle &V, const Type *AccessTy,
+static bool fitsInAddressMode(const SCEV* const &V, const Type *AccessTy,
const TargetLowering *TLI, bool HasBaseReg) {
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(V)) {
int64_t VC = SC->getValue()->getSExtValue();
@@ -574,12 +574,12 @@ static bool fitsInAddressMode(const SCEVHandle &V, const Type *AccessTy,
/// MoveLoopVariantsToImmediateField - Move any subexpressions from Val that are
/// loop varying to the Imm operand.
-static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm,
+static void MoveLoopVariantsToImmediateField(const SCEV* &Val, const SCEV* &Imm,
Loop *L, ScalarEvolution *SE) {
if (Val->isLoopInvariant(L)) return; // Nothing to do.
if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
- SmallVector<SCEVHandle, 4> NewOps;
+ SmallVector<const SCEV*, 4> NewOps;
NewOps.reserve(SAE->getNumOperands());
for (unsigned i = 0; i != SAE->getNumOperands(); ++i)
@@ -597,10 +597,10 @@ static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm,
Val = SE->getAddExpr(NewOps);
} else if (const SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
// Try to pull immediates out of the start value of nested addrec's.
- SCEVHandle Start = SARE->getStart();
+ const SCEV* Start = SARE->getStart();
MoveLoopVariantsToImmediateField(Start, Imm, L, SE);
- SmallVector<SCEVHandle, 4> Ops(SARE->op_begin(), SARE->op_end());
+ SmallVector<const SCEV*, 4> Ops(SARE->op_begin(), SARE->op_end());
Ops[0] = Start;
Val = SE->getAddRecExpr(Ops, SARE->getLoop());
} else {
@@ -616,15 +616,15 @@ static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm,
/// Accumulate these immediate values into the Imm value.
static void MoveImmediateValues(const TargetLowering *TLI,
const Type *AccessTy,
- SCEVHandle &Val, SCEVHandle &Imm,
+ const SCEV* &Val, const SCEV* &Imm,
bool isAddress, Loop *L,
ScalarEvolution *SE) {
if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
- SmallVector<SCEVHandle, 4> NewOps;
+ SmallVector<const SCEV*, 4> NewOps;
NewOps.reserve(SAE->getNumOperands());
for (unsigned i = 0; i != SAE->getNumOperands(); ++i) {
- SCEVHandle NewOp = SAE->getOperand(i);
+ const SCEV* NewOp = SAE->getOperand(i);
MoveImmediateValues(TLI, AccessTy, NewOp, Imm, isAddress, L, SE);
if (!NewOp->isLoopInvariant(L)) {
@@ -643,11 +643,11 @@ static void MoveImmediateValues(const TargetLowering *TLI,
return;
} else if (const SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
// Try to pull immediates out of the start value of nested addrec's.
- SCEVHandle Start = SARE->getStart();
+ const SCEV* Start = SARE->getStart();
MoveImmediateValues(TLI, AccessTy, Start, Imm, isAddress, L, SE);
if (Start != SARE->getStart()) {
- SmallVector<SCEVHandle, 4> Ops(SARE->op_begin(), SARE->op_end());
+ SmallVector<const SCEV*, 4> Ops(SARE->op_begin(), SARE->op_end());
Ops[0] = Start;
Val = SE->getAddRecExpr(Ops, SARE->getLoop());
}
@@ -658,8 +658,8 @@ static void MoveImmediateValues(const TargetLowering *TLI,
fitsInAddressMode(SME->getOperand(0), AccessTy, TLI, false) &&
SME->getNumOperands() == 2 && SME->isLoopInvariant(L)) {
- SCEVHandle SubImm = SE->getIntegerSCEV(0, Val->getType());
- SCEVHandle NewOp = SME->getOperand(1);
+ const SCEV* SubImm = SE->getIntegerSCEV(0, Val->getType());
+ const SCEV* NewOp = SME->getOperand(1);
MoveImmediateValues(TLI, AccessTy, NewOp, SubImm, isAddress, L, SE);
// If we extracted something out of the subexpressions, see if we can
@@ -694,7 +694,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
static void MoveImmediateValues(const TargetLowering *TLI,
Instruction *User,
- SCEVHandle &Val, SCEVHandle &Imm,
+ const SCEV* &Val, const SCEV* &Imm,
bool isAddress, Loop *L,
ScalarEvolution *SE) {
const Type *AccessTy = getAccessType(User);
@@ -704,19 +704,19 @@ static void MoveImmediateValues(const TargetLowering *TLI,
/// SeparateSubExprs - Decompose Expr into all of the subexpressions that are
/// added together. This is used to reassociate common addition subexprs
/// together for maximal sharing when rewriting bases.
-static void SeparateSubExprs(SmallVector<SCEVHandle, 16> &SubExprs,
- SCEVHandle Expr,
+static void SeparateSubExprs(SmallVector<const SCEV*, 16> &SubExprs,
+ const SCEV* Expr,
ScalarEvolution *SE) {
if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(Expr)) {
for (unsigned j = 0, e = AE->getNumOperands(); j != e; ++j)
SeparateSubExprs(SubExprs, AE->getOperand(j), SE);
} else if (const SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Expr)) {
- SCEVHandle Zero = SE->getIntegerSCEV(0, Expr->getType());
+ const SCEV* Zero = SE->getIntegerSCEV(0, Expr->getType());
if (SARE->getOperand(0) == Zero) {
SubExprs.push_back(Expr);
} else {
// Compute the addrec with zero as its base.
- SmallVector<SCEVHandle, 4> Ops(SARE->op_begin(), SARE->op_end());
+ SmallVector<const SCEV*, 4> Ops(SARE->op_begin(), SARE->op_end());
Ops[0] = Zero; // Start with zero base.
SubExprs.push_back(SE->getAddRecExpr(Ops, SARE->getLoop()));
@@ -740,7 +740,7 @@ struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
/// not remove anything. This looks for things like (a+b+c) and
/// (a+c+d) and computes the common (a+c) subexpression. The common expression
/// is *removed* from the Bases and returned.
-static SCEVHandle
+static const SCEV*
RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
ScalarEvolution *SE, Loop *L,
const TargetLowering *TLI) {
@@ -748,9 +748,9 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
// Only one use? This is a very common case, so we handle it specially and
// cheaply.
- SCEVHandle Zero = SE->getIntegerSCEV(0, Uses[0].Base->getType());
- SCEVHandle Result = Zero;
- SCEVHandle FreeResult = Zero;
+ const SCEV* Zero = SE->getIntegerSCEV(0, Uses[0].Base->getType());
+ const SCEV* Result = Zero;
+ const SCEV* FreeResult = Zero;
if (NumUses == 1) {
// If the use is inside the loop, use its base, regardless of what it is:
// it is clearly shared across all the IV's. If the use is outside the loop
@@ -766,13 +766,13 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
// Also track whether all uses of each expression can be moved into
// an addressing mode "for free"; such expressions are left within the loop.
// struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
- std::map<SCEVHandle, SubExprUseData> SubExpressionUseData;
+ std::map<const SCEV*, SubExprUseData> SubExpressionUseData;
// UniqueSubExprs - Keep track of all of the subexpressions we see in the
// order we see them.
- SmallVector<SCEVHandle, 16> UniqueSubExprs;
+ SmallVector<const SCEV*, 16> UniqueSubExprs;
- SmallVector<SCEVHandle, 16> SubExprs;
+ SmallVector<const SCEV*, 16> SubExprs;
unsigned NumUsesInsideLoop = 0;
for (unsigned i = 0; i != NumUses; ++i) {
// If the user is outside the loop, just ignore it for base computation.
@@ -816,7 +816,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
// Now that we know how many times each is used, build Result. Iterate over
// UniqueSubexprs so that we have a stable ordering.
for (unsigned i = 0, e = UniqueSubExprs.size(); i != e; ++i) {
- std::map<SCEVHandle, SubExprUseData>::iterator I =
+ std::map<const SCEV*, SubExprUseData>::iterator I =
SubExpressionUseData.find(UniqueSubExprs[i]);
assert(I != SubExpressionUseData.end() && "Entry not found?");
if (I->second.Count == NumUsesInsideLoop) { // Found CSE!
@@ -860,7 +860,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
if (FreeResult != Zero) {
SeparateSubExprs(SubExprs, FreeResult, SE);
for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) {
- std::map<SCEVHandle, SubExprUseData>::iterator I =
+ std::map<const SCEV*, SubExprUseData>::iterator I =
SubExpressionUseData.find(SubExprs[j]);
SubExpressionUseData.erase(I);
}
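
The hunks above only swap SCEVHandle for const SCEV*, but the logic they sit in is what matters for review: each use's base is split into subexpressions, occurrences are counted per subexpression, and whatever appears in every in-loop use becomes the common part that is stripped from the bases. Below is a minimal standalone sketch of that counting scheme, with plain strings standing in for SCEV nodes and none of the "free in an addressing mode" bookkeeping; every name in it is illustrative, not the pass's own API.

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    // Each base is a flattened sum of terms (each term assumed to appear at most
    // once per base).  Terms present in every base are removed from the bases
    // and returned as the shared expression.
    static std::vector<std::string>
    removeCommonTerms(std::vector<std::vector<std::string>> &Bases) {
      std::map<std::string, unsigned> UseCount;
      for (const std::vector<std::string> &Base : Bases)
        for (const std::string &Term : Base)
          ++UseCount[Term];                      // count occurrences across bases

      std::vector<std::string> Common;
      for (const auto &Entry : UseCount)
        if (Entry.second == Bases.size())
          Common.push_back(Entry.first);         // appears in every base

      for (std::vector<std::string> &Base : Bases) {
        std::vector<std::string> Rest;
        for (const std::string &Term : Base)
          if (UseCount[Term] != Bases.size())
            Rest.push_back(Term);                // keep only the non-common part
        Base = Rest;
      }
      return Common;
    }

    int main() {
      // (a+b+c) and (a+c+d) from the doc comment: the common part is (a+c).
      std::vector<std::vector<std::string>> Bases = {{"a", "b", "c"}, {"a", "c", "d"}};
      std::vector<std::string> Common = removeCommonTerms(Bases);
      assert((Common == std::vector<std::string>{"a", "c"}));
      assert((Bases[0] == std::vector<std::string>{"b"}));
      assert((Bases[1] == std::vector<std::string>{"d"}));
      return 0;
    }
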
@@ -989,10 +989,10 @@ bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1,
/// be folded into the addressing mode, nor even that the factor be constant;
/// a multiply (executed once) outside the loop is better than another IV
/// within. Well, usually.
-SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
+const SCEV* LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
bool AllUsesAreAddresses,
bool AllUsesAreOutsideLoop,
- const SCEVHandle &Stride,
+ const SCEV* const &Stride,
IVExpr &IV, const Type *Ty,
const std::vector<BasedUser>& UsersToProcess) {
if (StrideNoReuse.count(Stride))
@@ -1002,7 +1002,7 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
int64_t SInt = SC->getValue()->getSExtValue();
for (unsigned NewStride = 0, e = IU->StrideOrder.size();
NewStride != e; ++NewStride) {
- std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
+ std::map<const SCEV*, IVsOfOneStride>::iterator SI =
IVsByStride.find(IU->StrideOrder[NewStride]);
if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first) ||
StrideNoReuse.count(SI->first))
@@ -1055,7 +1055,7 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
// an existing IV if we can.
for (unsigned NewStride = 0, e = IU->StrideOrder.size();
NewStride != e; ++NewStride) {
- std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
+ std::map<const SCEV*, IVsOfOneStride>::iterator SI =
IVsByStride.find(IU->StrideOrder[NewStride]);
if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first))
continue;
@@ -1075,7 +1075,7 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
// -1*old.
for (unsigned NewStride = 0, e = IU->StrideOrder.size();
NewStride != e; ++NewStride) {
- std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
+ std::map<const SCEV*, IVsOfOneStride>::iterator SI =
IVsByStride.find(IU->StrideOrder[NewStride]);
if (SI == IVsByStride.end())
continue;
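
All three hunks above walk IVsByStride looking for an existing induction variable whose constant stride can serve the stride being processed, which is exactly the reuse the earlier doc comment argues is worth one multiply outside the loop. A standalone sketch of the divisibility check at the heart of it follows; the helper name is invented, plain integers replace SCEVs, and the target addressing-mode, negated, and non-constant cases are left out.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Given the constant strides of IVs that already exist, find one that evenly
    // divides the stride we need and return the scale factor (0 = no reuse).
    // The single multiply by that factor is hoisted out of the loop.
    static int64_t findReusableStride(int64_t Wanted,
                                      const std::vector<int64_t> &Existing) {
      int64_t Best = 0;
      for (int64_t S : Existing) {
        if (S == 0 || Wanted % S != 0)
          continue;
        int64_t Factor = Wanted / S;
        int64_t AbsF = Factor < 0 ? -Factor : Factor;
        int64_t AbsB = Best < 0 ? -Best : Best;
        if (Best == 0 || AbsF < AbsB)
          Best = Factor;                 // prefer the smallest scale
      }
      return Best;
    }

    int main() {
      const std::vector<int64_t> Existing = {1, 4};
      std::printf("%lld\n", (long long)findReusableStride(8, Existing));  // 2: ride on the stride-4 IV
      std::printf("%lld\n", (long long)findReusableStride(3, Existing));  // 3: only the stride-1 IV divides 3
      return 0;
    }
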
@@ -1104,7 +1104,7 @@ static bool PartitionByIsUseOfPostIncrementedValue(const BasedUser &Val) {
/// isNonConstantNegative - Return true if the specified scev is negated, but
/// not a constant.
-static bool isNonConstantNegative(const SCEVHandle &Expr) {
+static bool isNonConstantNegative(const SCEV* const &Expr) {
const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Expr);
if (!Mul) return false;
@@ -1121,7 +1121,7 @@ static bool isNonConstantNegative(const SCEVHandle &Expr) {
/// of the strided accesses, as well as the old information from Uses. We
/// progressively move information from the Base field to the Imm field, until
/// we eventually have the full access expression to rewrite the use.
-SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
+const SCEV* LoopStrengthReduce::CollectIVUsers(const SCEV* const &Stride,
IVUsersOfOneStride &Uses,
Loop *L,
bool &AllUsesAreAddresses,
@@ -1152,7 +1152,7 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
// for the strides (e.g. if we have "A+C+B" and "A+B+D" as our bases, find
// "A+B"), emit it to the preheader, then remove the expression from the
// UsersToProcess base values.
- SCEVHandle CommonExprs =
+ const SCEV* CommonExprs =
RemoveCommonExpressionsFromUseBases(UsersToProcess, SE, L, TLI);
// Next, figure out what we can represent in the immediate fields of
@@ -1218,7 +1218,7 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
const std::vector<BasedUser> &UsersToProcess,
const Loop *L,
bool AllUsesAreAddresses,
- SCEVHandle Stride) {
+ const SCEV* Stride) {
if (!EnableFullLSRMode)
return false;
@@ -1255,7 +1255,7 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
if (!Imm) Imm = SE->getIntegerSCEV(0, Stride->getType());
const Instruction *Inst = UsersToProcess[i].Inst;
const Type *AccessTy = getAccessType(Inst);
- SCEVHandle Diff = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm);
+ const SCEV* Diff = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm);
if (!Diff->isZero() &&
(!AllUsesAreAddresses ||
!fitsInAddressMode(Diff, AccessTy, TLI, /*HasBaseReg=*/true)))
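
The rejected cases above are those where a user's immediate differs from the reference immediate by something the addressing mode cannot absorb. A rough stand-in for that query is sketched below; the signed 12-bit range is an assumed ARM-style constant chosen only to make the idea concrete, whereas the real decision comes from fitsInAddressMode and TargetLowering.

    #include <cstdint>
    #include <cstdio>

    // Assumed range: a signed offset up to 4095 folds into the load/store for
    // free.  This constant is illustrative, not what any target actually reports.
    static bool foldsIntoAddressingMode(int64_t Offset) {
      return Offset >= -4095 && Offset <= 4095;
    }

    int main() {
      const int64_t Diffs[] = {0, 64, -128, 8192};   // per-use immediate deltas
      for (int64_t D : Diffs)
        std::printf("diff %lld folds for free: %s\n", (long long)D,
                    foldsIntoAddressingMode(D) ? "yes" : "no");
      return 0;
    }
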
@@ -1289,7 +1289,7 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
///
/// Return the created phi node.
///
-static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
+static PHINode *InsertAffinePhi(const SCEV* Start, const SCEV* Step,
Instruction *IVIncInsertPt,
const Loop *L,
SCEVExpander &Rewriter) {
@@ -1309,7 +1309,7 @@ static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
// If the stride is negative, insert a sub instead of an add for the
// increment.
bool isNegative = isNonConstantNegative(Step);
- SCEVHandle IncAmount = Step;
+ const SCEV* IncAmount = Step;
if (isNegative)
IncAmount = Rewriter.SE.getNegativeSCEV(Step);
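
The negative-step handling above only changes the handle type, but the behaviour it sits in is easiest to see spelled out: the computed value is the same either way, and what differs is the instruction emitted for the increment. A toy rendering follows, with strings standing in for the IR the pass would build (the real code negates the SCEV step and emits a subtract).

    #include <cstdio>
    #include <string>

    // A negative step becomes "sub iv, |step|" instead of "add iv, step".
    static std::string incrementText(const std::string &IV, long long Step) {
      if (Step < 0)
        return "sub " + IV + ", " + std::to_string(-Step);  // negate, then subtract
      return "add " + IV + ", " + std::to_string(Step);
    }

    int main() {
      std::printf("%s\n", incrementText("iv", 4).c_str());   // add iv, 4
      std::printf("%s\n", incrementText("iv", -4).c_str());  // sub iv, 4
      return 0;
    }
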
@@ -1348,13 +1348,13 @@ static void SortUsersToProcess(std::vector<BasedUser> &UsersToProcess) {
// loop before users outside of the loop with a particular base.
//
// We would like to use stable_sort here, but we can't. The problem is that
- // SCEVHandle's don't have a deterministic ordering w.r.t to each other, so
+ // const SCEV*'s don't have a deterministic ordering w.r.t. each other, so
// we don't have anything to do a '<' comparison on. Because we think the
// number of uses is small, do a horrible bubble sort which just relies on
// ==.
for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
// Get a base value.
- SCEVHandle Base = UsersToProcess[i].Base;
+ const SCEV* Base = UsersToProcess[i].Base;
// Compact everything with this base to be consecutive with this one.
for (unsigned j = i+1; j != e; ++j) {
@@ -1373,8 +1373,8 @@ static void SortUsersToProcess(std::vector<BasedUser> &UsersToProcess) {
void
LoopStrengthReduce::PrepareToStrengthReduceFully(
std::vector<BasedUser> &UsersToProcess,
- SCEVHandle Stride,
- SCEVHandle CommonExprs,
+ const SCEV* Stride,
+ const SCEV* CommonExprs,
const Loop *L,
SCEVExpander &PreheaderRewriter) {
DOUT << " Fully reducing all users\n";
@@ -1386,9 +1386,9 @@ LoopStrengthReduce::PrepareToStrengthReduceFully(
// TODO: The uses are grouped by base, but not sorted. We arbitrarily
// pick the first Imm value here to start with, and adjust it for the
// other uses.
- SCEVHandle Imm = UsersToProcess[i].Imm;
- SCEVHandle Base = UsersToProcess[i].Base;
- SCEVHandle Start = SE->getAddExpr(CommonExprs, Base, Imm);
+ const SCEV* Imm = UsersToProcess[i].Imm;
+ const SCEV* Base = UsersToProcess[i].Base;
+ const SCEV* Start = SE->getAddExpr(CommonExprs, Base, Imm);
PHINode *Phi = InsertAffinePhi(Start, Stride, IVIncInsertPt, L,
PreheaderRewriter);
// Loop over all the users with the same base.
@@ -1420,8 +1420,8 @@ static Instruction *FindIVIncInsertPt(std::vector<BasedUser> &UsersToProcess,
void
LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi(
std::vector<BasedUser> &UsersToProcess,
- SCEVHandle Stride,
- SCEVHandle CommonExprs,
+ const SCEV* Stride,
+ const SCEV* CommonExprs,
Value *CommonBaseV,
Instruction *IVIncInsertPt,
const Loop *L,
@@ -1497,7 +1497,7 @@ static bool IsImmFoldedIntoAddrMode(GlobalValue *GV, int64_t Offset,
/// StrengthReduceStridedIVUsers - Strength reduce all of the users of a single
/// stride of IV. All of the users may have different starting values, and this
/// may not be the only stride.
-void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
+void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEV* const &Stride,
IVUsersOfOneStride &Uses,
Loop *L) {
// If all the users are moved to another stride, then there is nothing to do.
@@ -1520,7 +1520,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// move information from the Base field to the Imm field, until we eventually
// have the full access expression to rewrite the use.
std::vector<BasedUser> UsersToProcess;
- SCEVHandle CommonExprs = CollectIVUsers(Stride, Uses, L, AllUsesAreAddresses,
+ const SCEV* CommonExprs = CollectIVUsers(Stride, Uses, L, AllUsesAreAddresses,
AllUsesAreOutsideLoop,
UsersToProcess);
@@ -1538,8 +1538,8 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// If all uses are addresses, consider sinking the immediate part of the
// common expression back into uses if they can fit in the immediate fields.
if (TLI && HaveCommonExprs && AllUsesAreAddresses) {
- SCEVHandle NewCommon = CommonExprs;
- SCEVHandle Imm = SE->getIntegerSCEV(0, ReplacedTy);
+ const SCEV* NewCommon = CommonExprs;
+ const SCEV* Imm = SE->getIntegerSCEV(0, ReplacedTy);
MoveImmediateValues(TLI, Type::VoidTy, NewCommon, Imm, true, L, SE);
if (!Imm->isZero()) {
bool DoSink = true;
@@ -1585,7 +1585,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
Value *CommonBaseV = Constant::getNullValue(ReplacedTy);
- SCEVHandle RewriteFactor = SE->getIntegerSCEV(0, ReplacedTy);
+ const SCEV* RewriteFactor = SE->getIntegerSCEV(0, ReplacedTy);
IVExpr ReuseIV(SE->getIntegerSCEV(0, Type::Int32Ty),
SE->getIntegerSCEV(0, Type::Int32Ty),
0);
@@ -1625,7 +1625,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// strength-reduced forms. This outer loop handles all bases, the inner
// loop handles all users of a particular base.
while (!UsersToProcess.empty()) {
- SCEVHandle Base = UsersToProcess.back().Base;
+ const SCEV* Base = UsersToProcess.back().Base;
Instruction *Inst = UsersToProcess.back().Inst;
// Emit the code for Base into the preheader.
@@ -1679,7 +1679,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
User.Inst->moveBefore(IVIncInsertPt);
}
- SCEVHandle RewriteExpr = SE->getUnknown(RewriteOp);
+ const SCEV* RewriteExpr = SE->getUnknown(RewriteOp);
if (SE->getEffectiveSCEVType(RewriteOp->getType()) !=
SE->getEffectiveSCEVType(ReplacedTy)) {
@@ -1711,7 +1711,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// The base has been used to initialize the PHI node but we don't want
// it here.
if (!ReuseIV.Base->isZero()) {
- SCEVHandle typedBase = ReuseIV.Base;
+ const SCEV* typedBase = ReuseIV.Base;
if (SE->getEffectiveSCEVType(RewriteExpr->getType()) !=
SE->getEffectiveSCEVType(ReuseIV.Base->getType())) {
// It's possible the original IV is a larger type than the new IV,
@@ -1776,10 +1776,10 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
/// set the IV user and stride information and return true, otherwise return
/// false.
bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
- const SCEVHandle *&CondStride) {
+ const SCEV* const * &CondStride) {
for (unsigned Stride = 0, e = IU->StrideOrder.size();
Stride != e && !CondUse; ++Stride) {
- std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
@@ -1806,7 +1806,7 @@ namespace {
const ScalarEvolution *SE;
explicit StrideCompare(const ScalarEvolution *se) : SE(se) {}
- bool operator()(const SCEVHandle &LHS, const SCEVHandle &RHS) {
+ bool operator()(const SCEV* const &LHS, const SCEV* const &RHS) {
const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS);
const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
if (LHSC && RHSC) {
@@ -1849,14 +1849,14 @@ namespace {
/// if (v1 < 30) goto loop
ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
IVStrideUse* &CondUse,
- const SCEVHandle* &CondStride) {
+ const SCEV* const* &CondStride) {
// If there's only one stride in the loop, there's nothing to do here.
if (IU->StrideOrder.size() < 2)
return Cond;
// If there are other users of the condition's stride, don't bother
// trying to change the condition because the stride will still
// remain.
- std::map<SCEVHandle, IVUsersOfOneStride *>::iterator I =
+ std::map<const SCEV*, IVUsersOfOneStride *>::iterator I =
IU->IVUsesByStride.find(*CondStride);
if (I == IU->IVUsesByStride.end() ||
I->second->Users.size() != 1)
@@ -1873,11 +1873,11 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
const Type *NewCmpTy = NULL;
unsigned TyBits = SE->getTypeSizeInBits(CmpTy);
unsigned NewTyBits = 0;
- SCEVHandle *NewStride = NULL;
+ const SCEV* *NewStride = NULL;
Value *NewCmpLHS = NULL;
Value *NewCmpRHS = NULL;
int64_t Scale = 1;
- SCEVHandle NewOffset = SE->getIntegerSCEV(0, CmpTy);
+ const SCEV* NewOffset = SE->getIntegerSCEV(0, CmpTy);
if (ConstantInt *C = dyn_cast<ConstantInt>(Cond->getOperand(1))) {
int64_t CmpVal = C->getValue().getSExtValue();
@@ -1889,7 +1889,7 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
// Look for a suitable stride / iv as replacement.
for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
- std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[i]);
if (!isa<SCEVConstant>(SI->first))
continue;
@@ -1969,7 +1969,7 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
bool AllUsesAreAddresses = true;
bool AllUsesAreOutsideLoop = true;
std::vector<BasedUser> UsersToProcess;
- SCEVHandle CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
+ const SCEV* CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
AllUsesAreAddresses,
AllUsesAreOutsideLoop,
UsersToProcess);
@@ -2104,13 +2104,13 @@ ICmpInst *LoopStrengthReduce::OptimizeMax(Loop *L, ICmpInst *Cond,
SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
if (!Sel || !Sel->hasOneUse()) return Cond;
- SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ const SCEV* BackedgeTakenCount = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
return Cond;
- SCEVHandle One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
+ const SCEV* One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
// Add one to the backedge-taken count to get the trip count.
- SCEVHandle IterationCount = SE->getAddExpr(BackedgeTakenCount, One);
+ const SCEV* IterationCount = SE->getAddExpr(BackedgeTakenCount, One);
// Check for a max calculation that matches the pattern.
if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount))
@@ -2123,13 +2123,13 @@ ICmpInst *LoopStrengthReduce::OptimizeMax(Loop *L, ICmpInst *Cond,
if (Max->getNumOperands() != 2)
return Cond;
- SCEVHandle MaxLHS = Max->getOperand(0);
- SCEVHandle MaxRHS = Max->getOperand(1);
+ const SCEV* MaxLHS = Max->getOperand(0);
+ const SCEV* MaxRHS = Max->getOperand(1);
if (!MaxLHS || MaxLHS != One) return Cond;
// Check the relevant induction variable for conformance to
// the pattern.
- SCEVHandle IV = SE->getSCEV(Cond->getOperand(0));
+ const SCEV* IV = SE->getSCEV(Cond->getOperand(0));
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
if (!AR || !AR->isAffine() ||
AR->getStart() != One ||
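
These hunks sit inside the max-trip-count rewrite: the backedge-taken count plus one is checked for the shape max(1, n), and when it matches, the other max operand is what the exit compare really wants. Below is a standalone shape check over a toy expression type; it is not SCEV, and it omits the induction-variable legality checks that the surrounding code performs.

    #include <cassert>

    // A toy expression node: either a constant or max(LHS, RHS).
    struct Expr {
      enum Kind { Const, Max } K;
      long long Value;                  // used when K == Const
      const Expr *LHS;                  // used when K == Max
      const Expr *RHS;
    };

    // If the trip count has the shape max(1, X), return X; otherwise null.
    // This mirrors only the shape test above (IterationCount = BTC + 1, then
    // "is it an smax/umax whose first operand is 1?"), nothing else.
    static const Expr *stripMaxOne(const Expr *TripCount) {
      if (TripCount->K != Expr::Max)
        return nullptr;
      const Expr *L = TripCount->LHS;
      if (L->K == Expr::Const && L->Value == 1)
        return TripCount->RHS;
      return nullptr;
    }

    int main() {
      const Expr One = {Expr::Const, 1, nullptr, nullptr};
      const Expr N = {Expr::Const, 30, nullptr, nullptr};  // e.g. "if (v1 < 30) goto loop"
      const Expr Trip = {Expr::Max, 0, &One, &N};          // trip count max(1, n)
      assert(stripMaxOne(&Trip) == &N);
      assert(stripMaxOne(&N) == nullptr);
      return 0;
    }
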
@@ -2175,13 +2175,13 @@ ICmpInst *LoopStrengthReduce::OptimizeMax(Loop *L, ICmpInst *Cond,
/// inside the loop then try to eliminate the cast operation.
void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
- SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ const SCEV* BackedgeTakenCount = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
return;
for (unsigned Stride = 0, e = IU->StrideOrder.size(); Stride != e;
++Stride) {
- std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
if (!isa<SCEVConstant>(SI->first))
@@ -2311,7 +2311,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
// Search IVUsesByStride to find Cond's IVUse if there is one.
IVStrideUse *CondUse = 0;
- const SCEVHandle *CondStride = 0;
+ const SCEV* const *CondStride = 0;
ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
if (!FindIVUserForCond(Cond, CondUse, CondStride))
return; // setcc doesn't use the IV.
@@ -2341,7 +2341,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
int64_t SInt = SC->getValue()->getSExtValue();
for (unsigned NewStride = 0, ee = IU->StrideOrder.size(); NewStride != ee;
++NewStride) {
- std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[NewStride]);
if (!isa<SCEVConstant>(SI->first) || SI->first == *CondStride)
continue;
@@ -2355,7 +2355,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
bool AllUsesAreAddresses = true;
bool AllUsesAreOutsideLoop = true;
std::vector<BasedUser> UsersToProcess;
- SCEVHandle CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
+ const SCEV* CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
AllUsesAreAddresses,
AllUsesAreOutsideLoop,
UsersToProcess);
@@ -2416,7 +2416,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
void LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) {
// If the number of times the loop is executed isn't computable, give up.
- SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ const SCEV* BackedgeTakenCount = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
return;
@@ -2445,9 +2445,9 @@ void LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) {
// Handle only tests for equality for the moment, and only stride 1.
if (Cond->getPredicate() != CmpInst::ICMP_EQ)
return;
- SCEVHandle IV = SE->getSCEV(Cond->getOperand(0));
+ const SCEV* IV = SE->getSCEV(Cond->getOperand(0));
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
- SCEVHandle One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
+ const SCEV* One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
if (!AR || !AR->isAffine() || AR->getStepRecurrence(*SE) != One)
return;
// If the RHS of the comparison is defined inside the loop, the rewrite
@@ -2563,7 +2563,7 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
// strides deterministic - not dependent on map order.
for (unsigned Stride = 0, e = IU->StrideOrder.size();
Stride != e; ++Stride) {
- std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
// FIXME: Generalize to non-affine IV's.
diff --git a/test/Analysis/ScalarEvolution/pointer-sign-bits.ll b/test/Analysis/ScalarEvolution/pointer-sign-bits.ll
new file mode 100644
index 000000000000..05cb81b3ba6b
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/pointer-sign-bits.ll
@@ -0,0 +1,220 @@
+; RUN: llvm-as < %s | opt -analyze -scalar-evolution -disable-output
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+ %JavaObject = type { [0 x i32 (...)*]*, i8* }
+
+define void @JnJVM_antlr_CSharpCodeGenerator_genBitSet__Lantlr_collections_impl_BitSet_2I(%JavaObject*, %JavaObject*, i32) {
+start:
+ br i1 undef, label %"stack overflow", label %"no stack overflow"
+
+"GOTO or IF*2": ; preds = %"true verifyAndComputePtr89", %verifyNullCont84
+ unreachable
+
+"GOTO or IF*5": ; preds = %"true verifyAndComputePtr127", %"GOTO or IF*6"
+ unreachable
+
+"GOTO or IF*6": ; preds = %"true verifyAndComputePtr131.GOTO or IF*6_crit_edge", %"true verifyAndComputePtr89"
+ %indvar = phi i32 [ %indvar.next, %"true verifyAndComputePtr131.GOTO or IF*6_crit_edge" ], [ 0, %"true verifyAndComputePtr89" ] ; <i32> [#uses=2]
+ %.0.in = add i32 %indvar, 0 ; <i32> [#uses=1]
+ %.0 = add i32 %.0.in, 1 ; <i32> [#uses=1]
+ %3 = icmp slt i32 %.0, %4 ; <i1> [#uses=1]
+ br i1 %3, label %verifyNullCont126, label %"GOTO or IF*5"
+
+end: ; preds = %"no exception block35"
+ ret void
+
+"stack overflow": ; preds = %start
+ ret void
+
+"no stack overflow": ; preds = %start
+ br i1 undef, label %verifyNullCont, label %"no stack overflow.end_crit_edge"
+
+"no stack overflow.end_crit_edge": ; preds = %"no stack overflow"
+ ret void
+
+verifyNullCont: ; preds = %"no stack overflow"
+ br i1 undef, label %verifyNullCont9, label %verifyNullCont.end_crit_edge
+
+verifyNullCont.end_crit_edge: ; preds = %verifyNullCont
+ ret void
+
+verifyNullCont9: ; preds = %verifyNullCont
+ br i1 undef, label %verifyNullCont12, label %verifyNullCont9.end_crit_edge
+
+verifyNullCont9.end_crit_edge: ; preds = %verifyNullCont9
+ ret void
+
+verifyNullCont12: ; preds = %verifyNullCont9
+ br i1 undef, label %"no exception block13", label %verifyNullCont12.end_crit_edge
+
+verifyNullCont12.end_crit_edge: ; preds = %verifyNullCont12
+ ret void
+
+"no exception block13": ; preds = %verifyNullCont12
+ br i1 undef, label %verifyNullExit14, label %verifyNullCont15
+
+verifyNullExit14: ; preds = %"no exception block13"
+ ret void
+
+verifyNullCont15: ; preds = %"no exception block13"
+ br i1 undef, label %"no exception block16", label %verifyNullCont15.end_crit_edge
+
+verifyNullCont15.end_crit_edge: ; preds = %verifyNullCont15
+ ret void
+
+"no exception block16": ; preds = %verifyNullCont15
+ br i1 undef, label %verifyNullExit17, label %verifyNullCont18
+
+verifyNullExit17: ; preds = %"no exception block16"
+ ret void
+
+verifyNullCont18: ; preds = %"no exception block16"
+ br i1 undef, label %"no exception block19", label %verifyNullCont18.end_crit_edge
+
+verifyNullCont18.end_crit_edge: ; preds = %verifyNullCont18
+ ret void
+
+"no exception block19": ; preds = %verifyNullCont18
+ br i1 undef, label %verifyNullExit20, label %verifyNullCont21
+
+verifyNullExit20: ; preds = %"no exception block19"
+ ret void
+
+verifyNullCont21: ; preds = %"no exception block19"
+ br i1 undef, label %verifyNullCont24, label %verifyNullCont21.end_crit_edge
+
+verifyNullCont21.end_crit_edge: ; preds = %verifyNullCont21
+ ret void
+
+verifyNullCont24: ; preds = %verifyNullCont21
+ br i1 undef, label %verifyNullCont27, label %verifyNullCont24.end_crit_edge
+
+verifyNullCont24.end_crit_edge: ; preds = %verifyNullCont24
+ ret void
+
+verifyNullCont27: ; preds = %verifyNullCont24
+ br i1 undef, label %verifyNullCont32, label %verifyNullCont27.end_crit_edge
+
+verifyNullCont27.end_crit_edge: ; preds = %verifyNullCont27
+ ret void
+
+verifyNullCont32: ; preds = %verifyNullCont27
+ br i1 undef, label %verifyNullExit33, label %verifyNullCont34
+
+verifyNullExit33: ; preds = %verifyNullCont32
+ ret void
+
+verifyNullCont34: ; preds = %verifyNullCont32
+ br i1 undef, label %"no exception block35", label %verifyNullCont34.end_crit_edge
+
+verifyNullCont34.end_crit_edge: ; preds = %verifyNullCont34
+ ret void
+
+"no exception block35": ; preds = %verifyNullCont34
+ br i1 undef, label %end, label %verifyNullCont60
+
+verifyNullCont60: ; preds = %"no exception block35"
+ br i1 undef, label %verifyNullCont63, label %verifyNullCont60.end_crit_edge
+
+verifyNullCont60.end_crit_edge: ; preds = %verifyNullCont60
+ ret void
+
+verifyNullCont63: ; preds = %verifyNullCont60
+ br i1 undef, label %"no exception block64", label %verifyNullCont63.end_crit_edge
+
+verifyNullCont63.end_crit_edge: ; preds = %verifyNullCont63
+ ret void
+
+"no exception block64": ; preds = %verifyNullCont63
+ br i1 undef, label %verifyNullExit65, label %verifyNullCont66
+
+verifyNullExit65: ; preds = %"no exception block64"
+ ret void
+
+verifyNullCont66: ; preds = %"no exception block64"
+ br i1 undef, label %"no exception block67", label %verifyNullCont66.end_crit_edge
+
+verifyNullCont66.end_crit_edge: ; preds = %verifyNullCont66
+ ret void
+
+"no exception block67": ; preds = %verifyNullCont66
+ br i1 undef, label %verifyNullExit68, label %verifyNullCont69
+
+verifyNullExit68: ; preds = %"no exception block67"
+ ret void
+
+verifyNullCont69: ; preds = %"no exception block67"
+ br i1 undef, label %"no exception block70", label %verifyNullCont69.end_crit_edge
+
+verifyNullCont69.end_crit_edge: ; preds = %verifyNullCont69
+ ret void
+
+"no exception block70": ; preds = %verifyNullCont69
+ br i1 undef, label %verifyNullExit71, label %verifyNullCont72
+
+verifyNullExit71: ; preds = %"no exception block70"
+ ret void
+
+verifyNullCont72: ; preds = %"no exception block70"
+ br i1 undef, label %verifyNullCont75, label %verifyNullCont72.end_crit_edge
+
+verifyNullCont72.end_crit_edge: ; preds = %verifyNullCont72
+ ret void
+
+verifyNullCont75: ; preds = %verifyNullCont72
+ br i1 undef, label %verifyNullCont78, label %verifyNullCont75.end_crit_edge
+
+verifyNullCont75.end_crit_edge: ; preds = %verifyNullCont75
+ ret void
+
+verifyNullCont78: ; preds = %verifyNullCont75
+ br i1 undef, label %"verifyNullCont78.GOTO or IF*4_crit_edge", label %verifyNullCont78.end_crit_edge
+
+"verifyNullCont78.GOTO or IF*4_crit_edge": ; preds = %verifyNullCont78
+ br i1 undef, label %verifyNullExit80, label %verifyNullCont81
+
+verifyNullCont78.end_crit_edge: ; preds = %verifyNullCont78
+ ret void
+
+verifyNullExit80: ; preds = %"verifyNullCont78.GOTO or IF*4_crit_edge"
+ ret void
+
+verifyNullCont81: ; preds = %"verifyNullCont78.GOTO or IF*4_crit_edge"
+ %4 = ptrtoint i8* undef to i32 ; <i32> [#uses=2]
+ %5 = icmp slt i32 0, %4 ; <i1> [#uses=1]
+ br i1 %5, label %verifyNullCont84, label %verifyNullCont172
+
+verifyNullCont84: ; preds = %verifyNullCont81
+ br i1 undef, label %"GOTO or IF*2", label %verifyNullCont86
+
+verifyNullCont86: ; preds = %verifyNullCont84
+ br i1 undef, label %"true verifyAndComputePtr", label %"false verifyAndComputePtr"
+
+"true verifyAndComputePtr": ; preds = %verifyNullCont86
+ br i1 undef, label %"true verifyAndComputePtr89", label %"false verifyAndComputePtr90"
+
+"false verifyAndComputePtr": ; preds = %verifyNullCont86
+ ret void
+
+"true verifyAndComputePtr89": ; preds = %"true verifyAndComputePtr"
+ br i1 undef, label %"GOTO or IF*6", label %"GOTO or IF*2"
+
+"false verifyAndComputePtr90": ; preds = %"true verifyAndComputePtr"
+ ret void
+
+verifyNullCont126: ; preds = %"GOTO or IF*6"
+ br i1 undef, label %"true verifyAndComputePtr127", label %"false verifyAndComputePtr128"
+
+"true verifyAndComputePtr127": ; preds = %verifyNullCont126
+ br i1 undef, label %"true verifyAndComputePtr131.GOTO or IF*6_crit_edge", label %"GOTO or IF*5"
+
+"false verifyAndComputePtr128": ; preds = %verifyNullCont126
+ ret void
+
+"true verifyAndComputePtr131.GOTO or IF*6_crit_edge": ; preds = %"true verifyAndComputePtr127"
+ %indvar.next = add i32 %indvar, 1 ; <i32> [#uses=1]
+ br label %"GOTO or IF*6"
+
+verifyNullCont172: ; preds = %verifyNullCont81
+ unreachable
+}
diff --git a/test/Analysis/ScalarEvolution/trip-count7.ll b/test/Analysis/ScalarEvolution/trip-count7.ll
new file mode 100644
index 000000000000..cea826ef1d30
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/trip-count7.ll
@@ -0,0 +1,150 @@
+; RUN: llvm-as < %s | opt -analyze -scalar-evolution -disable-output \
+; RUN: | grep {Loop bb7.i: Unpredictable backedge-taken count\\.}
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+
+ %struct.complex = type { float, float }
+ %struct.element = type { i32, i32 }
+ %struct.node = type { %struct.node*, %struct.node*, i32 }
+@seed = external global i64 ; <i64*> [#uses=0]
+@_2E_str = external constant [18 x i8], align 1 ; <[18 x i8]*> [#uses=0]
+@_2E_str1 = external constant [4 x i8], align 1 ; <[4 x i8]*> [#uses=0]
+@value = external global float ; <float*> [#uses=0]
+@fixed = external global float ; <float*> [#uses=0]
+@floated = external global float ; <float*> [#uses=0]
+@permarray = external global [11 x i32], align 32 ; <[11 x i32]*> [#uses=0]
+@pctr = external global i32 ; <i32*> [#uses=0]
+@tree = external global %struct.node* ; <%struct.node**> [#uses=0]
+@stack = external global [4 x i32], align 16 ; <[4 x i32]*> [#uses=0]
+@cellspace = external global [19 x %struct.element], align 32 ; <[19 x %struct.element]*> [#uses=0]
+@freelist = external global i32 ; <i32*> [#uses=0]
+@movesdone = external global i32 ; <i32*> [#uses=0]
+@ima = external global [41 x [41 x i32]], align 32 ; <[41 x [41 x i32]]*> [#uses=0]
+@imb = external global [41 x [41 x i32]], align 32 ; <[41 x [41 x i32]]*> [#uses=0]
+@imr = external global [41 x [41 x i32]], align 32 ; <[41 x [41 x i32]]*> [#uses=0]
+@rma = external global [41 x [41 x float]], align 32 ; <[41 x [41 x float]]*> [#uses=0]
+@rmb = external global [41 x [41 x float]], align 32 ; <[41 x [41 x float]]*> [#uses=0]
+@rmr = external global [41 x [41 x float]], align 32 ; <[41 x [41 x float]]*> [#uses=0]
+@piececount = external global [4 x i32], align 16 ; <[4 x i32]*> [#uses=0]
+@class = external global [13 x i32], align 32 ; <[13 x i32]*> [#uses=0]
+@piecemax = external global [13 x i32], align 32 ; <[13 x i32]*> [#uses=0]
+@puzzl = external global [512 x i32], align 32 ; <[512 x i32]*> [#uses=0]
+@p = external global [13 x [512 x i32]], align 32 ; <[13 x [512 x i32]]*> [#uses=0]
+@n = external global i32 ; <i32*> [#uses=0]
+@kount = external global i32 ; <i32*> [#uses=0]
+@sortlist = external global [5001 x i32], align 32 ; <[5001 x i32]*> [#uses=0]
+@biggest = external global i32 ; <i32*> [#uses=0]
+@littlest = external global i32 ; <i32*> [#uses=0]
+@top = external global i32 ; <i32*> [#uses=0]
+@z = external global [257 x %struct.complex], align 32 ; <[257 x %struct.complex]*> [#uses=0]
+@w = external global [257 x %struct.complex], align 32 ; <[257 x %struct.complex]*> [#uses=0]
+@e = external global [130 x %struct.complex], align 32 ; <[130 x %struct.complex]*> [#uses=0]
+@zr = external global float ; <float*> [#uses=0]
+@zi = external global float ; <float*> [#uses=0]
+
+declare void @Initrand() nounwind
+
+declare i32 @Rand() nounwind
+
+declare void @Try(i32, i32*, i32*, i32*, i32*, i32*) nounwind
+
+declare i32 @puts(i8* nocapture) nounwind
+
+declare void @Queens(i32) nounwind
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+declare i32 @main() nounwind
+
+declare void @Doit() nounwind
+
+declare void @Doit_bb7([15 x i32]*, [17 x i32]*, [9 x i32]*) nounwind
+
+define void @Doit_bb7_2E_i([9 x i32]* %x1, [15 x i32]* %c, [17 x i32]* %b, [9 x i32]* %a, i32* %q, i32* %x1.sub, i32* %b9, i32* %a10, i32* %c11) nounwind {
+newFuncRoot:
+ br label %bb7.i
+
+Try.exit.exitStub: ; preds = %bb7.i
+ ret void
+
+bb.i: ; preds = %bb7.i
+ %tmp = add i32 %j.0.i, 1 ; <i32> [#uses=5]
+ store i32 0, i32* %q, align 4
+ %tmp1 = sext i32 %tmp to i64 ; <i64> [#uses=1]
+ %tmp2 = getelementptr [9 x i32]* %a, i64 0, i64 %tmp1 ; <i32*> [#uses=1]
+ %tmp3 = load i32* %tmp2, align 4 ; <i32> [#uses=1]
+ %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb.i.bb7.i.backedge_crit_edge, label %bb1.i
+
+bb1.i: ; preds = %bb.i
+ %tmp5 = add i32 %j.0.i, 2 ; <i32> [#uses=1]
+ %tmp6 = sext i32 %tmp5 to i64 ; <i64> [#uses=1]
+ %tmp7 = getelementptr [17 x i32]* %b, i64 0, i64 %tmp6 ; <i32*> [#uses=1]
+ %tmp8 = load i32* %tmp7, align 4 ; <i32> [#uses=1]
+ %tmp9 = icmp eq i32 %tmp8, 0 ; <i1> [#uses=1]
+ br i1 %tmp9, label %bb1.i.bb7.i.backedge_crit_edge, label %bb2.i
+
+bb2.i: ; preds = %bb1.i
+ %tmp10 = sub i32 7, %j.0.i ; <i32> [#uses=1]
+ %tmp11 = sext i32 %tmp10 to i64 ; <i64> [#uses=1]
+ %tmp12 = getelementptr [15 x i32]* %c, i64 0, i64 %tmp11 ; <i32*> [#uses=1]
+ %tmp13 = load i32* %tmp12, align 4 ; <i32> [#uses=1]
+ %tmp14 = icmp eq i32 %tmp13, 0 ; <i1> [#uses=1]
+ br i1 %tmp14, label %bb2.i.bb7.i.backedge_crit_edge, label %bb3.i
+
+bb3.i: ; preds = %bb2.i
+ %tmp15 = getelementptr [9 x i32]* %x1, i64 0, i64 1 ; <i32*> [#uses=1]
+ store i32 %tmp, i32* %tmp15, align 4
+ %tmp16 = sext i32 %tmp to i64 ; <i64> [#uses=1]
+ %tmp17 = getelementptr [9 x i32]* %a, i64 0, i64 %tmp16 ; <i32*> [#uses=1]
+ store i32 0, i32* %tmp17, align 4
+ %tmp18 = add i32 %j.0.i, 2 ; <i32> [#uses=1]
+ %tmp19 = sext i32 %tmp18 to i64 ; <i64> [#uses=1]
+ %tmp20 = getelementptr [17 x i32]* %b, i64 0, i64 %tmp19 ; <i32*> [#uses=1]
+ store i32 0, i32* %tmp20, align 4
+ %tmp21 = sub i32 7, %j.0.i ; <i32> [#uses=1]
+ %tmp22 = sext i32 %tmp21 to i64 ; <i64> [#uses=1]
+ %tmp23 = getelementptr [15 x i32]* %c, i64 0, i64 %tmp22 ; <i32*> [#uses=1]
+ store i32 0, i32* %tmp23, align 4
+ call void @Try(i32 2, i32* %q, i32* %b9, i32* %a10, i32* %c11, i32* %x1.sub) nounwind
+ %tmp24 = load i32* %q, align 4 ; <i32> [#uses=1]
+ %tmp25 = icmp eq i32 %tmp24, 0 ; <i1> [#uses=1]
+ br i1 %tmp25, label %bb5.i, label %bb3.i.bb7.i.backedge_crit_edge
+
+bb5.i: ; preds = %bb3.i
+ %tmp26 = sext i32 %tmp to i64 ; <i64> [#uses=1]
+ %tmp27 = getelementptr [9 x i32]* %a, i64 0, i64 %tmp26 ; <i32*> [#uses=1]
+ store i32 1, i32* %tmp27, align 4
+ %tmp28 = add i32 %j.0.i, 2 ; <i32> [#uses=1]
+ %tmp29 = sext i32 %tmp28 to i64 ; <i64> [#uses=1]
+ %tmp30 = getelementptr [17 x i32]* %b, i64 0, i64 %tmp29 ; <i32*> [#uses=1]
+ store i32 1, i32* %tmp30, align 4
+ %tmp31 = sub i32 7, %j.0.i ; <i32> [#uses=1]
+ %tmp32 = sext i32 %tmp31 to i64 ; <i64> [#uses=1]
+ %tmp33 = getelementptr [15 x i32]* %c, i64 0, i64 %tmp32 ; <i32*> [#uses=1]
+ store i32 1, i32* %tmp33, align 4
+ br label %bb7.i.backedge
+
+bb7.i.backedge: ; preds = %bb3.i.bb7.i.backedge_crit_edge, %bb2.i.bb7.i.backedge_crit_edge, %bb1.i.bb7.i.backedge_crit_edge, %bb.i.bb7.i.backedge_crit_edge, %bb5.i
+ br label %bb7.i
+
+bb7.i: ; preds = %bb7.i.backedge, %newFuncRoot
+ %j.0.i = phi i32 [ 0, %newFuncRoot ], [ %tmp, %bb7.i.backedge ] ; <i32> [#uses=8]
+ %tmp34 = load i32* %q, align 4 ; <i32> [#uses=1]
+ %tmp35 = icmp eq i32 %tmp34, 0 ; <i1> [#uses=1]
+ %tmp36 = icmp ne i32 %j.0.i, 8 ; <i1> [#uses=1]
+ %tmp37 = and i1 %tmp35, %tmp36 ; <i1> [#uses=1]
+ br i1 %tmp37, label %bb.i, label %Try.exit.exitStub
+
+bb.i.bb7.i.backedge_crit_edge: ; preds = %bb.i
+ br label %bb7.i.backedge
+
+bb1.i.bb7.i.backedge_crit_edge: ; preds = %bb1.i
+ br label %bb7.i.backedge
+
+bb2.i.bb7.i.backedge_crit_edge: ; preds = %bb2.i
+ br label %bb7.i.backedge
+
+bb3.i.bb7.i.backedge_crit_edge: ; preds = %bb3.i
+ br label %bb7.i.backedge
+}
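
The RUN line asserts that ScalarEvolution reports an unpredictable backedge-taken count for bb7.i. One reading of why, as a small C++ analogue: the exit test combines the counter %j.0.i with a value loaded from %q, and the loop body can overwrite that value through the call to @Try, so the trip count is not a function of the counter alone. The mapping and every name below are my interpretation for illustration, not something stated in the patch.

    #include <cstdio>

    // Hypothetical stand-in for the call to @Try: the result depends on input,
    // not on the counter, so a static analysis cannot predict it.
    static bool opaqueTry(int J) {
      (void)J;
      return std::getchar() == 'x';
    }

    int countIterations() {
      int Q = 0;                                  // analogue of the i32 behind %q
      int Iterations = 0;
      for (int J = 0; Q == 0 && J != 8; ++J) {    // analogue of %tmp35 and %tmp36
        if (opaqueTry(J))
          Q = 1;                                  // the body can force an early exit
        ++Iterations;
      }
      return Iterations;
    }

    int main() {
      std::printf("iterations: %d\n", countIterations());
      return 0;
    }
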
diff --git a/test/CodeGen/ARM/2007-03-13-InstrSched.ll b/test/CodeGen/ARM/2007-03-13-InstrSched.ll
index 1b917f0ac2b2..07390add5538 100644
--- a/test/CodeGen/ARM/2007-03-13-InstrSched.ll
+++ b/test/CodeGen/ARM/2007-03-13-InstrSched.ll
@@ -1,5 +1,8 @@
; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic \
-; RUN: -mattr=+v6 -ifcvt-limit=0 -stats |& grep asm-printer | grep 35
+; RUN: -mattr=+v6 | grep r9
+; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic \
+; RUN: -mattr=+v6 -arm-reserve-r9 -ifcvt-limit=0 -stats |& grep asm-printer
+; | grep 35
define void @test(i32 %tmp56222, i32 %tmp36224, i32 %tmp46223, i32 %i.0196.0.ph, i32 %tmp8, i32* %tmp1011, i32** %tmp1, i32* %d2.1.out, i32* %d3.1.out, i32* %d0.1.out, i32* %d1.1.out) {
newFuncRoot:
diff --git a/test/CodeGen/ARM/2008-09-14-CoaleserBug.ll b/test/CodeGen/ARM/2008-09-14-CoalescerBug.ll
index c601b90e0710..c601b90e0710 100644
--- a/test/CodeGen/ARM/2008-09-14-CoaleserBug.ll
+++ b/test/CodeGen/ARM/2008-09-14-CoalescerBug.ll
diff --git a/test/CodeGen/ARM/2009-06-02-ISelCrash.ll b/test/CodeGen/ARM/2009-06-02-ISelCrash.ll
new file mode 100644
index 000000000000..7cd35b9557d0
--- /dev/null
+++ b/test/CodeGen/ARM/2009-06-02-ISelCrash.ll
@@ -0,0 +1,62 @@
+; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic -mattr=+v6,+vfp2
+
+@"\01LC" = external constant [15 x i8] ; <[15 x i8]*> [#uses=1]
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+define i32 @main() nounwind {
+entry:
+ br label %bb.i1.i
+
+bb.i1.i: ; preds = %Cos.exit.i.i, %entry
+ br label %bb.i.i.i
+
+bb.i.i.i: ; preds = %bb.i.i.i, %bb.i1.i
+ br i1 undef, label %Cos.exit.i.i, label %bb.i.i.i
+
+Cos.exit.i.i: ; preds = %bb.i.i.i
+ br i1 undef, label %bb2.i.i, label %bb.i1.i
+
+bb2.i.i: ; preds = %Cos.exit.i.i
+ br label %bb3.i.i
+
+bb3.i.i: ; preds = %bb5.i.i, %bb2.i.i
+ br label %bb4.i.i
+
+bb4.i.i: ; preds = %bb4.i.i, %bb3.i.i
+ br i1 undef, label %bb5.i.i, label %bb4.i.i
+
+bb5.i.i: ; preds = %bb4.i.i
+ br i1 undef, label %bb.i, label %bb3.i.i
+
+bb.i: ; preds = %bb.i, %bb5.i.i
+ br i1 undef, label %bb1.outer2.i.i.outer, label %bb.i
+
+bb1.outer2.i.i.outer: ; preds = %Fft.exit.i, %bb5.i12.i, %bb.i
+ br label %bb1.outer2.i.i
+
+bb1.outer2.i.i: ; preds = %bb2.i9.i, %bb1.outer2.i.i.outer
+ br label %bb1.i.i
+
+bb1.i.i: ; preds = %bb1.i.i, %bb1.outer2.i.i
+ br i1 undef, label %bb2.i9.i, label %bb1.i.i
+
+bb2.i9.i: ; preds = %bb1.i.i
+ br i1 undef, label %bb4.i11.i, label %bb1.outer2.i.i
+
+bb4.i11.i: ; preds = %bb4.i11.i, %bb2.i9.i
+ br i1 undef, label %bb5.i12.i, label %bb4.i11.i
+
+bb5.i12.i: ; preds = %bb4.i11.i
+ br i1 undef, label %bb7.i.i, label %bb1.outer2.i.i.outer
+
+bb7.i.i: ; preds = %bb7.i.i, %bb5.i12.i
+ br i1 undef, label %Fft.exit.i, label %bb7.i.i
+
+Fft.exit.i: ; preds = %bb7.i.i
+ br i1 undef, label %bb5.i, label %bb1.outer2.i.i.outer
+
+bb5.i: ; preds = %Fft.exit.i
+ %0 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([15 x i8]* @"\01LC", i32 0, i32 0), double undef, double undef) nounwind ; <i32> [#uses=0]
+ unreachable
+}
diff --git a/test/CodeGen/ARM/2009-06-22-CoalescerBug.ll b/test/CodeGen/ARM/2009-06-22-CoalescerBug.ll
new file mode 100644
index 000000000000..5c8d7b0f6220
--- /dev/null
+++ b/test/CodeGen/ARM/2009-06-22-CoalescerBug.ll
@@ -0,0 +1,43 @@
+; RUN: llvm-as < %s | llc -mtriple=armv6-apple-darwin
+
+ %struct.rtunion = type { i64 }
+ %struct.rtx_def = type { i16, i8, i8, [1 x %struct.rtunion] }
+
+define arm_apcscc void @simplify_unary_real(i8* nocapture %p) nounwind {
+entry:
+ %tmp121 = load i64* null, align 4 ; <i64> [#uses=1]
+ %0 = getelementptr %struct.rtx_def* null, i32 0, i32 3, i32 3, i32 0 ; <i64*> [#uses=1]
+ %tmp122 = load i64* %0, align 4 ; <i64> [#uses=1]
+ %1 = zext i64 undef to i192 ; <i192> [#uses=2]
+ %2 = zext i64 %tmp121 to i192 ; <i192> [#uses=1]
+ %3 = shl i192 %2, 64 ; <i192> [#uses=2]
+ %4 = zext i64 %tmp122 to i192 ; <i192> [#uses=1]
+ %5 = shl i192 %4, 128 ; <i192> [#uses=1]
+ %6 = or i192 %3, %1 ; <i192> [#uses=1]
+ %7 = or i192 %6, %5 ; <i192> [#uses=2]
+ switch i32 undef, label %bb82 [
+ i32 77, label %bb38
+ i32 129, label %bb21
+ i32 130, label %bb20
+ ]
+
+bb20: ; preds = %entry
+ ret void
+
+bb21: ; preds = %entry
+ br i1 undef, label %bb82, label %bb29
+
+bb29: ; preds = %bb21
+ %tmp18.i = and i192 %3, 1208907372870555465154560 ; <i192> [#uses=1]
+ %mask.i = or i192 %tmp18.i, %1 ; <i192> [#uses=1]
+ %mask41.i = or i192 %mask.i, 0 ; <i192> [#uses=1]
+ br label %bb82
+
+bb38: ; preds = %entry
+ br label %bb82
+
+bb82: ; preds = %bb38, %bb29, %bb21, %entry
+ %d.0 = phi i192 [ %mask41.i, %bb29 ], [ undef, %bb38 ], [ %7, %entry ], [ %7, %bb21 ] ; <i192> [#uses=1]
+ %tmp51 = trunc i192 %d.0 to i64 ; <i64> [#uses=0]
+ ret void
+}
diff --git a/test/CodeGen/ARM/arm-frameaddr.ll b/test/CodeGen/ARM/arm-frameaddr.ll
new file mode 100644
index 000000000000..f1e4c2aeb7fb
--- /dev/null
+++ b/test/CodeGen/ARM/arm-frameaddr.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep mov | grep r7
+; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi | grep mov | grep r11
+; PR4344
+; PR4416
+
+define arm_aapcscc i8* @t() nounwind {
+entry:
+ %0 = call i8* @llvm.frameaddress(i32 0)
+ ret i8* %0
+}
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/ARM/neon_arith1.ll b/test/CodeGen/ARM/neon_arith1.ll
new file mode 100644
index 000000000000..18b516fc1a8c
--- /dev/null
+++ b/test/CodeGen/ARM/neon_arith1.ll
@@ -0,0 +1,7 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vadd
+
+define <8 x i8> @t_i8x8(<8 x i8> %a, <8 x i8> %b) nounwind {
+entry:
+ %0 = add <8 x i8> %a, %b
+ ret <8 x i8> %0
+}
diff --git a/test/CodeGen/ARM/neon_ld1.ll b/test/CodeGen/ARM/neon_ld1.ll
new file mode 100644
index 000000000000..8901ba177dac
--- /dev/null
+++ b/test/CodeGen/ARM/neon_ld1.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fldd | count 4
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fstd
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fmrrd
+
+define void @t1(<2 x i32>* %r, <4 x i16>* %a, <4 x i16>* %b) nounwind {
+entry:
+ %0 = load <4 x i16>* %a, align 8 ; <<4 x i16>> [#uses=1]
+ %1 = load <4 x i16>* %b, align 8 ; <<4 x i16>> [#uses=1]
+ %2 = add <4 x i16> %0, %1 ; <<4 x i16>> [#uses=1]
+ %3 = bitcast <4 x i16> %2 to <2 x i32> ; <<2 x i32>> [#uses=1]
+ store <2 x i32> %3, <2 x i32>* %r, align 8
+ ret void
+}
+
+define <2 x i32> @t2(<4 x i16>* %a, <4 x i16>* %b) nounwind readonly {
+entry:
+ %0 = load <4 x i16>* %a, align 8 ; <<4 x i16>> [#uses=1]
+ %1 = load <4 x i16>* %b, align 8 ; <<4 x i16>> [#uses=1]
+ %2 = sub <4 x i16> %0, %1 ; <<4 x i16>> [#uses=1]
+ %3 = bitcast <4 x i16> %2 to <2 x i32> ; <<2 x i32>> [#uses=1]
+ ret <2 x i32> %3
+}
diff --git a/test/CodeGen/ARM/neon_ld2.ll b/test/CodeGen/ARM/neon_ld2.ll
new file mode 100644
index 000000000000..a26904afca30
--- /dev/null
+++ b/test/CodeGen/ARM/neon_ld2.ll
@@ -0,0 +1,23 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vldmia | count 4
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vstmia | count 1
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fmrrd | count 2
+
+define void @t1(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+entry:
+ %0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
+ %1 = load <2 x i64>* %b, align 16 ; <<2 x i64>> [#uses=1]
+ %2 = add <2 x i64> %0, %1 ; <<2 x i64>> [#uses=1]
+ %3 = bitcast <2 x i64> %2 to <4 x i32> ; <<4 x i32>> [#uses=1]
+ store <4 x i32> %3, <4 x i32>* %r, align 16
+ ret void
+}
+
+define <4 x i32> @t2(<2 x i64>* %a, <2 x i64>* %b) nounwind readonly {
+entry:
+ %0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
+ %1 = load <2 x i64>* %b, align 16 ; <<2 x i64>> [#uses=1]
+ %2 = sub <2 x i64> %0, %1 ; <<2 x i64>> [#uses=1]
+ %3 = bitcast <2 x i64> %2 to <4 x i32> ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %3
+}
+
diff --git a/test/CodeGen/ARM/vaba.ll b/test/CodeGen/ARM/vaba.ll
new file mode 100644
index 000000000000..98ee1e155ba8
--- /dev/null
+++ b/test/CodeGen/ARM/vaba.ll
@@ -0,0 +1,119 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vaba\\.s8} %t | count 2
+; RUN: grep {vaba\\.s16} %t | count 2
+; RUN: grep {vaba\\.s32} %t | count 2
+; RUN: grep {vaba\\.u8} %t | count 2
+; RUN: grep {vaba\\.u16} %t | count 2
+; RUN: grep {vaba\\.u32} %t | count 2
+
+define <8 x i8> @vabas8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vabas16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vabas32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i32> %tmp4
+}
+
+define <8 x i8> @vabau8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vabau16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vabau32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i32> %tmp4
+}
+
+define <16 x i8> @vabaQs8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = load <16 x i8>* %C
+ %tmp4 = call <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3)
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vabaQs16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vabaQs32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <16 x i8> @vabaQu8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = load <16 x i8>* %C
+ %tmp4 = call <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3)
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vabaQu16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vabaQu32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+declare <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vabal.ll b/test/CodeGen/ARM/vabal.ll
new file mode 100644
index 000000000000..ed9460345b34
--- /dev/null
+++ b/test/CodeGen/ARM/vabal.ll
@@ -0,0 +1,63 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vabal\\.s8} %t | count 1
+; RUN: grep {vabal\\.s16} %t | count 1
+; RUN: grep {vabal\\.s32} %t | count 1
+; RUN: grep {vabal\\.u8} %t | count 1
+; RUN: grep {vabal\\.u16} %t | count 1
+; RUN: grep {vabal\\.u32} %t | count 1
+
+define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vabd.ll b/test/CodeGen/ARM/vabd.ll
new file mode 100644
index 000000000000..0fe5ddb94ba4
--- /dev/null
+++ b/test/CodeGen/ARM/vabd.ll
@@ -0,0 +1,126 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vabd\\.s8} %t | count 2
+; RUN: grep {vabd\\.s16} %t | count 2
+; RUN: grep {vabd\\.s32} %t | count 2
+; RUN: grep {vabd\\.u8} %t | count 2
+; RUN: grep {vabd\\.u16} %t | count 2
+; RUN: grep {vabd\\.u32} %t | count 2
+; RUN: grep {vabd\\.f32} %t | count 2
+
+define <8 x i8> @vabds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vabds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vabds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vabdu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vabdu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vabdu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <2 x float> @vabdf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <16 x i8> @vabdQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vabdQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vabdQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <16 x i8> @vabdQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vabdQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vabdQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <4 x float> @vabdQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vabdl.ll b/test/CodeGen/ARM/vabdl.ll
new file mode 100644
index 000000000000..b351d0f8d3e2
--- /dev/null
+++ b/test/CodeGen/ARM/vabdl.ll
@@ -0,0 +1,57 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vabdl\\.s8} %t | count 1
+; RUN: grep {vabdl\\.s16} %t | count 1
+; RUN: grep {vabdl\\.s32} %t | count 1
+; RUN: grep {vabdl\\.u8} %t | count 1
+; RUN: grep {vabdl\\.u16} %t | count 1
+; RUN: grep {vabdl\\.u32} %t | count 1
+
+define <8 x i16> @vabdls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vabs.ll b/test/CodeGen/ARM/vabs.ll
new file mode 100644
index 000000000000..629baa762a00
--- /dev/null
+++ b/test/CodeGen/ARM/vabs.ll
@@ -0,0 +1,64 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vabs\\.s8} %t | count 2
+; RUN: grep {vabs\\.s16} %t | count 2
+; RUN: grep {vabs\\.s32} %t | count 2
+; RUN: grep {vabs\\.f32} %t | count 2
+
+define <8 x i8> @vabss8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vabss16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vabss32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @vabsf32(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = call <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float> %tmp1)
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @vabsQs8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vabsQs16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vabsQs32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @vabsQf32(<4 x float>* %A) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = call <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float> %tmp1)
+ ret <4 x float> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) nounwind readnone
+declare <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float>) nounwind readnone
+
diff --git a/test/CodeGen/ARM/vacge.ll b/test/CodeGen/ARM/vacge.ll
new file mode 100644
index 000000000000..5703dd15f388
--- /dev/null
+++ b/test/CodeGen/ARM/vacge.ll
@@ -0,0 +1,19 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vacge\\.f32} %t | count 2
+
+define <2 x i32> @vacgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @vacgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vacgt.ll b/test/CodeGen/ARM/vacgt.ll
new file mode 100644
index 000000000000..ebf590e98a1d
--- /dev/null
+++ b/test/CodeGen/ARM/vacgt.ll
@@ -0,0 +1,19 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vacgt\\.f32} %t | count 2
+
+define <2 x i32> @vacgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vadd.ll b/test/CodeGen/ARM/vadd.ll
new file mode 100644
index 000000000000..b2b0e2397c72
--- /dev/null
+++ b/test/CodeGen/ARM/vadd.ll
@@ -0,0 +1,76 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vadd\\.i8} %t | count 2
+; RUN: grep {vadd\\.i16} %t | count 2
+; RUN: grep {vadd\\.i32} %t | count 2
+; RUN: grep {vadd\\.i64} %t | count 2
+; RUN: grep {vadd\\.f32} %t | count 2
+
+define <8 x i8> @vaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = add <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = add <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = add <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vaddi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = add <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <2 x float> @vaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = add <2 x float> %tmp1, %tmp2
+ ret <2 x float> %tmp3
+}
+
+define <16 x i8> @vaddQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = add <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vaddQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = add <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vaddQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = add <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vaddQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = add <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
+
+define <4 x float> @vaddQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = add <4 x float> %tmp1, %tmp2
+ ret <4 x float> %tmp3
+}
diff --git a/test/CodeGen/ARM/vaddhn.ll b/test/CodeGen/ARM/vaddhn.ll
new file mode 100644
index 000000000000..b2d1b142f3ea
--- /dev/null
+++ b/test/CodeGen/ARM/vaddhn.ll
@@ -0,0 +1,29 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vaddhn\\.i16} %t | count 1
+; RUN: grep {vaddhn\\.i32} %t | count 1
+; RUN: grep {vaddhn\\.i64} %t | count 1
+
+define <8 x i8> @vaddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vaddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vaddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vaddl.ll b/test/CodeGen/ARM/vaddl.ll
new file mode 100644
index 000000000000..26ab17bbc070
--- /dev/null
+++ b/test/CodeGen/ARM/vaddl.ll
@@ -0,0 +1,57 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vaddl\\.s8} %t | count 1
+; RUN: grep {vaddl\\.s16} %t | count 1
+; RUN: grep {vaddl\\.s32} %t | count 1
+; RUN: grep {vaddl\\.u8} %t | count 1
+; RUN: grep {vaddl\\.u16} %t | count 1
+; RUN: grep {vaddl\\.u32} %t | count 1
+
+define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vaddw.ll b/test/CodeGen/ARM/vaddw.ll
new file mode 100644
index 000000000000..e06f94a8df61
--- /dev/null
+++ b/test/CodeGen/ARM/vaddw.ll
@@ -0,0 +1,57 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vaddw\\.s8} %t | count 1
+; RUN: grep {vaddw\\.s16} %t | count 1
+; RUN: grep {vaddw\\.s32} %t | count 1
+; RUN: grep {vaddw\\.u8} %t | count 1
+; RUN: grep {vaddw\\.u16} %t | count 1
+; RUN: grep {vaddw\\.u32} %t | count 1
+
+define <8 x i16> @vaddws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vand.ll b/test/CodeGen/ARM/vand.ll
new file mode 100644
index 000000000000..b3eaf1c5dc6c
--- /dev/null
+++ b/test/CodeGen/ARM/vand.ll
@@ -0,0 +1,59 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep vand %t | count 8
+; Note: function names do not include "vand" to allow simple grep for opcodes
+
+define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = and <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @v_andi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = and <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @v_andi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = and <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @v_andi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = and <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @v_andQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = and <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @v_andQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = and <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @v_andQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = and <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @v_andQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = and <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
diff --git a/test/CodeGen/ARM/vbic.ll b/test/CodeGen/ARM/vbic.ll
new file mode 100644
index 000000000000..dbc11ea58c6a
--- /dev/null
+++ b/test/CodeGen/ARM/vbic.ll
@@ -0,0 +1,67 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep vbic %t | count 8
+; Note: function names do not include "vbic" to allow simple grep for opcodes
+
+define <8 x i8> @v_bici8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp4 = and <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @v_bici16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp4 = and <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @v_bici32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
+ %tmp4 = and <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @v_bici64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
+ %tmp4 = and <1 x i64> %tmp1, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @v_bicQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp4 = and <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @v_bicQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp4 = and <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @v_bicQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
+ %tmp4 = and <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @v_bicQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
+ %tmp4 = and <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
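
LLVM IR has no dedicated bit-clear operation, so each function above writes a & ~b as an and with the second operand xor'ed against all ones; the expectation encoded in the RUN line is that the xor/and pair is selected as a single VBIC per function, giving the count of 8.
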
diff --git a/test/CodeGen/ARM/vbsl.ll b/test/CodeGen/ARM/vbsl.ll
new file mode 100644
index 000000000000..37ddf4de6d32
--- /dev/null
+++ b/test/CodeGen/ARM/vbsl.ll
@@ -0,0 +1,91 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep vbsl %t | count 8
+; Note: function names do not include "vbsl" to allow simple grep for opcodes
+
+define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = and <8 x i8> %tmp1, %tmp2
+ %tmp5 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp6 = and <8 x i8> %tmp5, %tmp3
+ %tmp7 = or <8 x i8> %tmp4, %tmp6
+ ret <8 x i8> %tmp7
+}
+
+define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = and <4 x i16> %tmp1, %tmp2
+ %tmp5 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp6 = and <4 x i16> %tmp5, %tmp3
+ %tmp7 = or <4 x i16> %tmp4, %tmp6
+ ret <4 x i16> %tmp7
+}
+
+define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = and <2 x i32> %tmp1, %tmp2
+ %tmp5 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
+ %tmp6 = and <2 x i32> %tmp5, %tmp3
+ %tmp7 = or <2 x i32> %tmp4, %tmp6
+ ret <2 x i32> %tmp7
+}
+
+define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = load <1 x i64>* %C
+ %tmp4 = and <1 x i64> %tmp1, %tmp2
+ %tmp5 = xor <1 x i64> %tmp1, < i64 -1 >
+ %tmp6 = and <1 x i64> %tmp5, %tmp3
+ %tmp7 = or <1 x i64> %tmp4, %tmp6
+ ret <1 x i64> %tmp7
+}
+
+define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = load <16 x i8>* %C
+ %tmp4 = and <16 x i8> %tmp1, %tmp2
+ %tmp5 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp6 = and <16 x i8> %tmp5, %tmp3
+ %tmp7 = or <16 x i8> %tmp4, %tmp6
+ ret <16 x i8> %tmp7
+}
+
+define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp4 = and <8 x i16> %tmp1, %tmp2
+ %tmp5 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp6 = and <8 x i16> %tmp5, %tmp3
+ %tmp7 = or <8 x i16> %tmp4, %tmp6
+ ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = and <4 x i32> %tmp1, %tmp2
+ %tmp5 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
+ %tmp6 = and <4 x i32> %tmp5, %tmp3
+ %tmp7 = or <4 x i32> %tmp4, %tmp6
+ ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = and <2 x i64> %tmp1, %tmp2
+ %tmp5 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
+ %tmp6 = and <2 x i64> %tmp5, %tmp3
+ %tmp7 = or <2 x i64> %tmp4, %tmp6
+ ret <2 x i64> %tmp7
+}
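
Each function above spells out the bitwise-select identity (m & x) | (~m & y), with the first loaded vector acting as the mask m; the RUN line expects the whole and/xor/and/or pattern to collapse into one VBSL per function, hence the count of 8.
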
diff --git a/test/CodeGen/ARM/vceq.ll b/test/CodeGen/ARM/vceq.ll
new file mode 100644
index 000000000000..77f1890d0865
--- /dev/null
+++ b/test/CodeGen/ARM/vceq.ll
@@ -0,0 +1,61 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vceq\\.i8} %t | count 2
+; RUN: grep {vceq\\.i16} %t | count 2
+; RUN: grep {vceq\\.i32} %t | count 2
+; RUN: grep {vceq\\.f32} %t | count 2
+
+define <8 x i8> @vceqi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = vicmp eq <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vceqi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = vicmp eq <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vceqi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = vicmp eq <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @vceqf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp oeq <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @vceqQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = vicmp eq <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vceqQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = vicmp eq <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vceqQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = vicmp eq <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <4 x i32> @vceqQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = vfcmp oeq <4 x float> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
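
Unlike icmp and fcmp, the vicmp and vfcmp forms used here yield a vector whose elements are the same width as the operands, with each lane set to all ones or all zeros; that matches what the NEON compare instructions write to their destination, which is why vceqf32 can return <2 x i32> directly from a <2 x float> comparison.
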
diff --git a/test/CodeGen/ARM/vcge.ll b/test/CodeGen/ARM/vcge.ll
new file mode 100644
index 000000000000..14c623ea082f
--- /dev/null
+++ b/test/CodeGen/ARM/vcge.ll
@@ -0,0 +1,106 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vcge\\.s8} %t | count 2
+; RUN: grep {vcge\\.s16} %t | count 2
+; RUN: grep {vcge\\.s32} %t | count 2
+; RUN: grep {vcge\\.u8} %t | count 2
+; RUN: grep {vcge\\.u16} %t | count 2
+; RUN: grep {vcge\\.u32} %t | count 2
+; RUN: grep {vcge\\.f32} %t | count 2
+
+define <8 x i8> @vcges8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = vicmp sge <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vcges16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = vicmp sge <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vcges32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = vicmp sge <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vcgeu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = vicmp uge <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vcgeu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = vicmp uge <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vcgeu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = vicmp uge <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @vcgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp oge <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @vcgeQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = vicmp sge <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vcgeQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = vicmp sge <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vcgeQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = vicmp sge <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <16 x i8> @vcgeQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = vicmp uge <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vcgeQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = vicmp uge <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vcgeQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = vicmp uge <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <4 x i32> @vcgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = vfcmp oge <4 x float> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
diff --git a/test/CodeGen/ARM/vcgt.ll b/test/CodeGen/ARM/vcgt.ll
new file mode 100644
index 000000000000..3f7e55078733
--- /dev/null
+++ b/test/CodeGen/ARM/vcgt.ll
@@ -0,0 +1,106 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vcgt\\.s8} %t | count 2
+; RUN: grep {vcgt\\.s16} %t | count 2
+; RUN: grep {vcgt\\.s32} %t | count 2
+; RUN: grep {vcgt\\.u8} %t | count 2
+; RUN: grep {vcgt\\.u16} %t | count 2
+; RUN: grep {vcgt\\.u32} %t | count 2
+; RUN: grep {vcgt\\.f32} %t | count 2
+
+define <8 x i8> @vcgts8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = vicmp sgt <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vcgts16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = vicmp sgt <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vcgts32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = vicmp sgt <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vcgtu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = vicmp ugt <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vcgtu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = vicmp ugt <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vcgtu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = vicmp ugt <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @vcgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ogt <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @vcgtQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = vicmp sgt <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vcgtQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = vicmp sgt <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vcgtQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = vicmp sgt <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <16 x i8> @vcgtQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = vicmp ugt <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vcgtQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = vicmp ugt <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vcgtQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = vicmp ugt <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <4 x i32> @vcgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = vfcmp ogt <4 x float> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
diff --git a/test/CodeGen/ARM/vcls.ll b/test/CodeGen/ARM/vcls.ll
new file mode 100644
index 000000000000..69e4bdbe163e
--- /dev/null
+++ b/test/CodeGen/ARM/vcls.ll
@@ -0,0 +1,48 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vcls\\.s8} %t | count 2
+; RUN: grep {vcls\\.s16} %t | count 2
+; RUN: grep {vcls\\.s32} %t | count 2
+
+define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vclz.ll b/test/CodeGen/ARM/vclz.ll
new file mode 100644
index 000000000000..575ea7d2e848
--- /dev/null
+++ b/test/CodeGen/ARM/vclz.ll
@@ -0,0 +1,48 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vclz\\.i8} %t | count 2
+; RUN: grep {vclz\\.i16} %t | count 2
+; RUN: grep {vclz\\.i32} %t | count 2
+
+define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vcnt.ll b/test/CodeGen/ARM/vcnt.ll
new file mode 100644
index 000000000000..981716895894
--- /dev/null
+++ b/test/CodeGen/ARM/vcnt.ll
@@ -0,0 +1,17 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vcnt\\.8} %t | count 2
+
+define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8>) nounwind readnone
diff --git a/test/CodeGen/ARM/vcvt.ll b/test/CodeGen/ARM/vcvt.ll
new file mode 100644
index 000000000000..1cb42bf155cb
--- /dev/null
+++ b/test/CodeGen/ARM/vcvt.ll
@@ -0,0 +1,53 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vcvt\\.s32\\.f32} %t | count 2
+; RUN: grep {vcvt\\.u32\\.f32} %t | count 2
+; RUN: grep {vcvt\\.f32\\.s32} %t | count 2
+; RUN: grep {vcvt\\.f32\\.u32} %t | count 2
+
+define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = fptosi <2 x float> %tmp1 to <2 x i32>
+ ret <2 x i32> %tmp2
+}
+
+define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = fptoui <2 x float> %tmp1 to <2 x i32>
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = sitofp <2 x i32> %tmp1 to <2 x float>
+ ret <2 x float> %tmp2
+}
+
+define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = uitofp <2 x i32> %tmp1 to <2 x float>
+ ret <2 x float> %tmp2
+}
+
+define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = fptosi <4 x float> %tmp1 to <4 x i32>
+ ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = fptoui <4 x float> %tmp1 to <4 x i32>
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = sitofp <4 x i32> %tmp1 to <4 x float>
+ ret <4 x float> %tmp2
+}
+
+define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = uitofp <4 x i32> %tmp1 to <4 x float>
+ ret <4 x float> %tmp2
+}
diff --git a/test/CodeGen/ARM/vcvt_n.ll b/test/CodeGen/ARM/vcvt_n.ll
new file mode 100644
index 000000000000..ac86b73d8d72
--- /dev/null
+++ b/test/CodeGen/ARM/vcvt_n.ll
@@ -0,0 +1,64 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vcvt\\.s32\\.f32} %t | count 2
+; RUN: grep {vcvt\\.u32\\.f32} %t | count 2
+; RUN: grep {vcvt\\.f32\\.s32} %t | count 2
+; RUN: grep {vcvt\\.f32\\.u32} %t | count 2
+
+define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %tmp1, i32 1)
+ ret <2 x i32> %tmp2
+}
+
+define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %tmp1, i32 1)
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
+ ret <2 x float> %tmp2
+}
+
+define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
+ ret <2 x float> %tmp2
+}
+
+declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+
+define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %tmp1, i32 1)
+ ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %tmp1, i32 1)
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
+ ret <4 x float> %tmp2
+}
+
+define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
+ ret <4 x float> %tmp2
+}
+
+declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+
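In these fixed-point conversions, the trailing i32 operand of the vcvtfp2fxs/vcvtfp2fxu and vcvtfxs2fp/vcvtfxu2fp intrinsics is the number of fraction bits (1 in every call here), selecting the #fbits forms of VCVT between floating point and fixed point.
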
diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
new file mode 100644
index 000000000000..1c0887a2492d
--- /dev/null
+++ b/test/CodeGen/ARM/vdup.ll
@@ -0,0 +1,134 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep vdup.8 %t | count 4
+; RUN: grep vdup.16 %t | count 4
+; RUN: grep vdup.32 %t | count 8
+
+define <8 x i8> @v_dup8(i8 %A) nounwind {
+ %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
+ %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
+ %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
+ %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
+ %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
+ %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
+ %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
+ %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
+ ret <8 x i8> %tmp8
+}
+
+define <4 x i16> @v_dup16(i16 %A) nounwind {
+ %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
+ %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
+ %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
+ %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @v_dup32(i32 %A) nounwind {
+ %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
+ %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @v_dupfloat(float %A) nounwind {
+ %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
+ %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @v_dupQ8(i8 %A) nounwind {
+ %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
+ %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
+ %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
+ %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
+ %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
+ %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
+ %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
+ %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
+ %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
+ %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
+ %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
+ %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
+ %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
+ %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
+ %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
+ %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
+ ret <16 x i8> %tmp16
+}
+
+define <8 x i16> @v_dupQ16(i16 %A) nounwind {
+ %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
+ %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
+ %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
+ %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
+ %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
+ %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
+ %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
+ %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
+ ret <8 x i16> %tmp8
+}
+
+define <4 x i32> @v_dupQ32(i32 %A) nounwind {
+ %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
+ %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
+ %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
+ %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
+ ret <4 x i32> %tmp4
+}
+
+define <4 x float> @v_dupQfloat(float %A) nounwind {
+ %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
+ %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
+ %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
+ %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
+ ret <4 x float> %tmp4
+}
+
+; Check to make sure it works with shuffles, too.
+
+define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
+ %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
+ %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
+ %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @v_shuffledupfloat(float %A) nounwind {
+ %tmp1 = insertelement <2 x float> undef, float %A, i32 0
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
+ %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
+ %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
+ %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
+ %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
+ %tmp1 = insertelement <4 x float> undef, float %A, i32 0
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %tmp2
+}
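
Both splat forms above are expected to become a single VDUP from a core register: the lane-by-lane insertelement chains, and the canonical idiom of one insertelement into undef followed by a shufflevector whose mask is zeroinitializer, so that every result lane copies lane 0. The vdup.32 count of 8 covers the i32 and float variants of both forms.
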
diff --git a/test/CodeGen/ARM/vdup_lane.ll b/test/CodeGen/ARM/vdup_lane.ll
new file mode 100644
index 000000000000..adadc9f5d31b
--- /dev/null
+++ b/test/CodeGen/ARM/vdup_lane.ll
@@ -0,0 +1,52 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep vdup.8 %t | count 2
+; RUN: grep vdup.16 %t | count 2
+; RUN: grep vdup.32 %t | count 4
+
+define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+ ret <4 x float> %tmp2
+}
diff --git a/test/CodeGen/ARM/veor.ll b/test/CodeGen/ARM/veor.ll
new file mode 100644
index 000000000000..47a5f3f9f7ec
--- /dev/null
+++ b/test/CodeGen/ARM/veor.ll
@@ -0,0 +1,59 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep veor %t | count 8
+; Note: function names do not include "veor" to allow simple grep for opcodes
+
+define <8 x i8> @v_eori8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = xor <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @v_eori16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = xor <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @v_eori32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = xor <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @v_eori64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = xor <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @v_eorQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = xor <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @v_eorQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = xor <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @v_eorQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = xor <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @v_eorQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = xor <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
diff --git a/test/CodeGen/ARM/vfcmp.ll b/test/CodeGen/ARM/vfcmp.ll
new file mode 100644
index 000000000000..58c2068bc8f4
--- /dev/null
+++ b/test/CodeGen/ARM/vfcmp.ll
@@ -0,0 +1,96 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vceq\\.f32} %t | count 1
+; RUN: grep {vcgt\\.f32} %t | count 9
+; RUN: grep {vcge\\.f32} %t | count 5
+; RUN: grep vorr %t | count 4
+; RUN: grep vmvn %t | count 7
+
+; This tests vfcmp operations that do not map directly to NEON instructions.
+
+; une is implemented with VCEQ/VMVN
+define <2 x i32> @vcunef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp une <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; olt is implemented with VCGT
+define <2 x i32> @vcoltf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp olt <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; ole is implemented with VCGE
+define <2 x i32> @vcolef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ole <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; uge is implemented with VCGT/VMVN
+define <2 x i32> @vcugef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp uge <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; ule is implemented with VCGT/VMVN
+define <2 x i32> @vculef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ule <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; ugt is implemented with VCGE/VMVN
+define <2 x i32> @vcugtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ugt <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; ult is implemented with VCGE/VMVN
+define <2 x i32> @vcultf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ult <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; ueq is implemented with VCGT/VCGT/VORR/VMVN
+define <2 x i32> @vcueqf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ueq <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; one is implemented with VCGT/VCGT/VORR
+define <2 x i32> @vconef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp one <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; uno is implemented with VCGT/VCGE/VORR/VMVN
+define <2 x i32> @vcunof32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp uno <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; ord is implemented with VCGT/VCGE/VORR
+define <2 x i32> @vcordf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ord <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
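
The instruction counts in the RUN lines follow from rewriting each predicate in terms of the three compares NEON provides (VCEQ, VCGE, VCGT) plus VORR and VMVN. A sketch of the identities the functions above rely on, with a and b the two operands:

  olt(a,b) = ogt(b,a)                    -> vcgt, operands swapped
  ole(a,b) = oge(b,a)                    -> vcge, operands swapped
  une(a,b) = not oeq(a,b)                -> vceq + vmvn
  uge(a,b) = not ogt(b,a)                -> vcgt + vmvn
  ule(a,b) = not ogt(a,b)                -> vcgt + vmvn
  ugt(a,b) = not oge(b,a)                -> vcge + vmvn
  ult(a,b) = not oge(a,b)                -> vcge + vmvn
  one(a,b) = ogt(a,b) or ogt(b,a)        -> vcgt, vcgt, vorr
  ueq(a,b) = not one(a,b)                -> vcgt, vcgt, vorr, vmvn
  ord(a,b) = oge(a,b) or ogt(b,a)        -> vcge, vcgt, vorr
  uno(a,b) = not ord(a,b)                -> vcge, vcgt, vorr, vmvn

Summing these over the eleven functions gives the expected counts: 9 vcgt, 5 vcge, 1 vceq, 4 vorr and 7 vmvn.
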
diff --git a/test/CodeGen/ARM/vget_lane.ll b/test/CodeGen/ARM/vget_lane.ll
new file mode 100644
index 000000000000..a361ba2ba97f
--- /dev/null
+++ b/test/CodeGen/ARM/vget_lane.ll
@@ -0,0 +1,78 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmov\\.s8} %t | count 2
+; RUN: grep {vmov\\.s16} %t | count 2
+; RUN: grep {vmov\\.u8} %t | count 2
+; RUN: grep {vmov\\.u16} %t | count 2
+; RUN: grep {vmov\\.32} %t | count 2
+
+define i32 @vget_lanes8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = extractelement <8 x i8> %tmp1, i32 1
+ %tmp3 = sext i8 %tmp2 to i32
+ ret i32 %tmp3
+}
+
+define i32 @vget_lanes16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = extractelement <4 x i16> %tmp1, i32 1
+ %tmp3 = sext i16 %tmp2 to i32
+ ret i32 %tmp3
+}
+
+define i32 @vget_laneu8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = extractelement <8 x i8> %tmp1, i32 1
+ %tmp3 = zext i8 %tmp2 to i32
+ ret i32 %tmp3
+}
+
+define i32 @vget_laneu16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = extractelement <4 x i16> %tmp1, i32 1
+ %tmp3 = zext i16 %tmp2 to i32
+ ret i32 %tmp3
+}
+
+; Do a vector add to keep the extraction from being done directly from memory.
+define i32 @vget_lanei32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = add <2 x i32> %tmp1, %tmp1
+ %tmp3 = extractelement <2 x i32> %tmp2, i32 1
+ ret i32 %tmp3
+}
+
+define i32 @vgetQ_lanes8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = extractelement <16 x i8> %tmp1, i32 1
+ %tmp3 = sext i8 %tmp2 to i32
+ ret i32 %tmp3
+}
+
+define i32 @vgetQ_lanes16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = extractelement <8 x i16> %tmp1, i32 1
+ %tmp3 = sext i16 %tmp2 to i32
+ ret i32 %tmp3
+}
+
+define i32 @vgetQ_laneu8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = extractelement <16 x i8> %tmp1, i32 1
+ %tmp3 = zext i8 %tmp2 to i32
+ ret i32 %tmp3
+}
+
+define i32 @vgetQ_laneu16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = extractelement <8 x i16> %tmp1, i32 1
+ %tmp3 = zext i16 %tmp2 to i32
+ ret i32 %tmp3
+}
+
+; Do a vector add to keep the extraction from being done directly from memory.
+define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = add <4 x i32> %tmp1, %tmp1
+ %tmp3 = extractelement <4 x i32> %tmp2, i32 1
+ ret i32 %tmp3
+}
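
The vector add in the two i32 functions keeps the extraction from being folded into the load: without it the element could be fetched with an ordinary scalar load, no VMOV.32 would be emitted, and the corresponding grep count would fail.
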
diff --git a/test/CodeGen/ARM/vhadd.ll b/test/CodeGen/ARM/vhadd.ll
new file mode 100644
index 000000000000..5e7503dc71cf
--- /dev/null
+++ b/test/CodeGen/ARM/vhadd.ll
@@ -0,0 +1,107 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vhadd\\.s8} %t | count 2
+; RUN: grep {vhadd\\.s16} %t | count 2
+; RUN: grep {vhadd\\.s32} %t | count 2
+; RUN: grep {vhadd\\.u8} %t | count 2
+; RUN: grep {vhadd\\.u16} %t | count 2
+; RUN: grep {vhadd\\.u32} %t | count 2
+
+define <8 x i8> @vhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @vhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <16 x i8> @vhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vhsub.ll b/test/CodeGen/ARM/vhsub.ll
new file mode 100644
index 000000000000..32a66e547945
--- /dev/null
+++ b/test/CodeGen/ARM/vhsub.ll
@@ -0,0 +1,107 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vhsub\\.s8} %t | count 2
+; RUN: grep {vhsub\\.s16} %t | count 2
+; RUN: grep {vhsub\\.s32} %t | count 2
+; RUN: grep {vhsub\\.u8} %t | count 2
+; RUN: grep {vhsub\\.u16} %t | count 2
+; RUN: grep {vhsub\\.u32} %t | count 2
+
+define <8 x i8> @vhsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vhsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vhsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vhsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vhsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vhsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @vhsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vhsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vhsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <16 x i8> @vhsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vhsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vhsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vicmp.ll b/test/CodeGen/ARM/vicmp.ll
new file mode 100644
index 000000000000..86858f929348
--- /dev/null
+++ b/test/CodeGen/ARM/vicmp.ll
@@ -0,0 +1,85 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vceq\\.i8} %t | count 2
+; RUN: grep {vceq\\.i16} %t | count 2
+; RUN: grep {vceq\\.i32} %t | count 2
+; RUN: grep vmvn %t | count 6
+; RUN: grep {vcgt\\.s8} %t | count 1
+; RUN: grep {vcge\\.s16} %t | count 1
+; RUN: grep {vcgt\\.u16} %t | count 1
+; RUN: grep {vcge\\.u32} %t | count 1
+
+; This tests vicmp operations that do not map directly to NEON instructions.
+; Not-equal (ne) operations are implemented by VCEQ/VMVN. Less-than (lt/ult)
+; and less-than-or-equal (le/ule) are implemented by swapping the arguments
+; to VCGT and VCGE. Test all the operand types for not-equal but only sample
+; the other operations.
+
+define <8 x i8> @vcnei8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = vicmp ne <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vcnei16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = vicmp ne <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vcnei32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = vicmp ne <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @vcneQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = vicmp ne <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vcneQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = vicmp ne <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vcneQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = vicmp ne <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <16 x i8> @vcltQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = vicmp slt <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @vcles16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = vicmp sle <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <4 x i16> @vcltu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = vicmp ult <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <4 x i32> @vcleQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = vicmp ule <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
diff --git a/test/CodeGen/ARM/vmax.ll b/test/CodeGen/ARM/vmax.ll
new file mode 100644
index 000000000000..60322f85d399
--- /dev/null
+++ b/test/CodeGen/ARM/vmax.ll
@@ -0,0 +1,126 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmax\\.s8} %t | count 2
+; RUN: grep {vmax\\.s16} %t | count 2
+; RUN: grep {vmax\\.s32} %t | count 2
+; RUN: grep {vmax\\.u8} %t | count 2
+; RUN: grep {vmax\\.u16} %t | count 2
+; RUN: grep {vmax\\.u32} %t | count 2
+; RUN: grep {vmax\\.f32} %t | count 2
+
+define <8 x i8> @vmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <2 x float> @vmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <16 x i8> @vmaxQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vmaxQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vmaxQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <16 x i8> @vmaxQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vmaxQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vmaxQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <4 x float> @vmaxQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmin.ll b/test/CodeGen/ARM/vmin.ll
new file mode 100644
index 000000000000..a6936937c7b6
--- /dev/null
+++ b/test/CodeGen/ARM/vmin.ll
@@ -0,0 +1,126 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmin\\.s8} %t | count 2
+; RUN: grep {vmin\\.s16} %t | count 2
+; RUN: grep {vmin\\.s32} %t | count 2
+; RUN: grep {vmin\\.u8} %t | count 2
+; RUN: grep {vmin\\.u16} %t | count 2
+; RUN: grep {vmin\\.u32} %t | count 2
+; RUN: grep {vmin\\.f32} %t | count 2
+
+define <8 x i8> @vmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <2 x float> @vminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <16 x i8> @vminQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vminQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vminQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <16 x i8> @vminQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vminQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vminQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <4 x float> @vminQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmla.ll b/test/CodeGen/ARM/vmla.ll
new file mode 100644
index 000000000000..ed77e11a7c47
--- /dev/null
+++ b/test/CodeGen/ARM/vmla.ll
@@ -0,0 +1,77 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmla\\.i8} %t | count 2
+; RUN: grep {vmla\\.i16} %t | count 2
+; RUN: grep {vmla\\.i32} %t | count 2
+; RUN: grep {vmla\\.f32} %t | count 2
+
+define <8 x i8> @vmlai8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8> * %C) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = mul <8 x i8> %tmp2, %tmp3
+ %tmp5 = add <8 x i8> %tmp1, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vmlai16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = mul <4 x i16> %tmp2, %tmp3
+ %tmp5 = add <4 x i16> %tmp1, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @vmlai32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = mul <2 x i32> %tmp2, %tmp3
+ %tmp5 = add <2 x i32> %tmp1, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <2 x float> @vmlaf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = load <2 x float>* %C
+ %tmp4 = mul <2 x float> %tmp2, %tmp3
+ %tmp5 = add <2 x float> %tmp1, %tmp4
+ ret <2 x float> %tmp5
+}
+
+define <16 x i8> @vmlaQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8> * %C) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = load <16 x i8>* %C
+ %tmp4 = mul <16 x i8> %tmp2, %tmp3
+ %tmp5 = add <16 x i8> %tmp1, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vmlaQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp4 = mul <8 x i16> %tmp2, %tmp3
+ %tmp5 = add <8 x i16> %tmp1, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vmlaQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = mul <4 x i32> %tmp2, %tmp3
+ %tmp5 = add <4 x i32> %tmp1, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vmlaQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = load <4 x float>* %C
+ %tmp4 = mul <4 x float> %tmp2, %tmp3
+ %tmp5 = add <4 x float> %tmp1, %tmp4
+ ret <4 x float> %tmp5
+}
diff --git a/test/CodeGen/ARM/vmlal.ll b/test/CodeGen/ARM/vmlal.ll
new file mode 100644
index 000000000000..7fd00bac5e24
--- /dev/null
+++ b/test/CodeGen/ARM/vmlal.ll
@@ -0,0 +1,63 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmlal\\.s8} %t | count 1
+; RUN: grep {vmlal\\.s16} %t | count 1
+; RUN: grep {vmlal\\.s32} %t | count 1
+; RUN: grep {vmlal\\.u8} %t | count 1
+; RUN: grep {vmlal\\.u16} %t | count 1
+; RUN: grep {vmlal\\.u32} %t | count 1
+
+define <8 x i16> @vmlals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmls.ll b/test/CodeGen/ARM/vmls.ll
new file mode 100644
index 000000000000..d519b7e70e1e
--- /dev/null
+++ b/test/CodeGen/ARM/vmls.ll
@@ -0,0 +1,77 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmls\\.i8} %t | count 2
+; RUN: grep {vmls\\.i16} %t | count 2
+; RUN: grep {vmls\\.i32} %t | count 2
+; RUN: grep {vmls\\.f32} %t | count 2
+
+define <8 x i8> @vmlsi8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8> * %C) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = mul <8 x i8> %tmp2, %tmp3
+ %tmp5 = sub <8 x i8> %tmp1, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vmlsi16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = mul <4 x i16> %tmp2, %tmp3
+ %tmp5 = sub <4 x i16> %tmp1, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @vmlsi32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = mul <2 x i32> %tmp2, %tmp3
+ %tmp5 = sub <2 x i32> %tmp1, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <2 x float> @vmlsf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = load <2 x float>* %C
+ %tmp4 = mul <2 x float> %tmp2, %tmp3
+ %tmp5 = sub <2 x float> %tmp1, %tmp4
+ ret <2 x float> %tmp5
+}
+
+define <16 x i8> @vmlsQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8> * %C) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = load <16 x i8>* %C
+ %tmp4 = mul <16 x i8> %tmp2, %tmp3
+ %tmp5 = sub <16 x i8> %tmp1, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vmlsQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp4 = mul <8 x i16> %tmp2, %tmp3
+ %tmp5 = sub <8 x i16> %tmp1, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vmlsQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = mul <4 x i32> %tmp2, %tmp3
+ %tmp5 = sub <4 x i32> %tmp1, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vmlsQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = load <4 x float>* %C
+ %tmp4 = mul <4 x float> %tmp2, %tmp3
+ %tmp5 = sub <4 x float> %tmp1, %tmp4
+ ret <4 x float> %tmp5
+}
diff --git a/test/CodeGen/ARM/vmlsl.ll b/test/CodeGen/ARM/vmlsl.ll
new file mode 100644
index 000000000000..94910dc77b0e
--- /dev/null
+++ b/test/CodeGen/ARM/vmlsl.ll
@@ -0,0 +1,63 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmlsl\\.s8} %t | count 1
+; RUN: grep {vmlsl\\.s16} %t | count 1
+; RUN: grep {vmlsl\\.s32} %t | count 1
+; RUN: grep {vmlsl\\.u8} %t | count 1
+; RUN: grep {vmlsl\\.u16} %t | count 1
+; RUN: grep {vmlsl\\.u32} %t | count 1
+
+define <8 x i16> @vmlsls8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmov.ll b/test/CodeGen/ARM/vmov.ll
new file mode 100644
index 000000000000..af9c8e25989c
--- /dev/null
+++ b/test/CodeGen/ARM/vmov.ll
@@ -0,0 +1,101 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep vmov.i8 %t | count 2
+; RUN: grep vmov.i16 %t | count 4
+; RUN: grep vmov.i32 %t | count 12
+; RUN: grep vmov.i64 %t | count 2
+; Note: function names do not include "vmov" to allow simple grep for opcodes
+
+define <8 x i8> @v_movi8() nounwind {
+ ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+}
+
+define <4 x i16> @v_movi16a() nounwind {
+ ret <4 x i16> < i16 16, i16 16, i16 16, i16 16 >
+}
+
+; 0x1000 = 4096
+define <4 x i16> @v_movi16b() nounwind {
+ ret <4 x i16> < i16 4096, i16 4096, i16 4096, i16 4096 >
+}
+
+define <2 x i32> @v_movi32a() nounwind {
+ ret <2 x i32> < i32 32, i32 32 >
+}
+
+; 0x2000 = 8192
+define <2 x i32> @v_movi32b() nounwind {
+ ret <2 x i32> < i32 8192, i32 8192 >
+}
+
+; 0x200000 = 2097152
+define <2 x i32> @v_movi32c() nounwind {
+ ret <2 x i32> < i32 2097152, i32 2097152 >
+}
+
+; 0x20000000 = 536870912
+define <2 x i32> @v_movi32d() nounwind {
+ ret <2 x i32> < i32 536870912, i32 536870912 >
+}
+
+; 0x20ff = 8447
+define <2 x i32> @v_movi32e() nounwind {
+ ret <2 x i32> < i32 8447, i32 8447 >
+}
+
+; 0x20ffff = 2162687
+define <2 x i32> @v_movi32f() nounwind {
+ ret <2 x i32> < i32 2162687, i32 2162687 >
+}
+
+; 0xff0000ff0000ffff = 18374687574888349695
+define <1 x i64> @v_movi64() nounwind {
+ ret <1 x i64> < i64 18374687574888349695 >
+}
+
+define <16 x i8> @v_movQi8() nounwind {
+ ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+}
+
+define <8 x i16> @v_movQi16a() nounwind {
+ ret <8 x i16> < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
+}
+
+; 0x1000 = 4096
+define <8 x i16> @v_movQi16b() nounwind {
+ ret <8 x i16> < i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096 >
+}
+
+define <4 x i32> @v_movQi32a() nounwind {
+ ret <4 x i32> < i32 32, i32 32, i32 32, i32 32 >
+}
+
+; 0x2000 = 8192
+define <4 x i32> @v_movQi32b() nounwind {
+ ret <4 x i32> < i32 8192, i32 8192, i32 8192, i32 8192 >
+}
+
+; 0x200000 = 2097152
+define <4 x i32> @v_movQi32c() nounwind {
+ ret <4 x i32> < i32 2097152, i32 2097152, i32 2097152, i32 2097152 >
+}
+
+; 0x20000000 = 536870912
+define <4 x i32> @v_movQi32d() nounwind {
+ ret <4 x i32> < i32 536870912, i32 536870912, i32 536870912, i32 536870912 >
+}
+
+; 0x20ff = 8447
+define <4 x i32> @v_movQi32e() nounwind {
+ ret <4 x i32> < i32 8447, i32 8447, i32 8447, i32 8447 >
+}
+
+; 0x20ffff = 2162687
+define <4 x i32> @v_movQi32f() nounwind {
+ ret <4 x i32> < i32 2162687, i32 2162687, i32 2162687, i32 2162687 >
+}
+
+; 0xff0000ff0000ffff = 18374687574888349695
+define <2 x i64> @v_movQi64() nounwind {
+ ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
+}
+
diff --git a/test/CodeGen/ARM/vmovl.ll b/test/CodeGen/ARM/vmovl.ll
new file mode 100644
index 000000000000..09a77af0be67
--- /dev/null
+++ b/test/CodeGen/ARM/vmovl.ll
@@ -0,0 +1,51 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmovl\\.s8} %t | count 1
+; RUN: grep {vmovl\\.s16} %t | count 1
+; RUN: grep {vmovl\\.s32} %t | count 1
+; RUN: grep {vmovl\\.u8} %t | count 1
+; RUN: grep {vmovl\\.u16} %t | count 1
+; RUN: grep {vmovl\\.u32} %t | count 1
+
+define <8 x i16> @vmovls8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vmovls16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vmovls32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32> %tmp1)
+ ret <2 x i64> %tmp2
+}
+
+define <8 x i16> @vmovlu8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vmovlu16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vmovlu32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32> %tmp1)
+ ret <2 x i64> %tmp2
+}
+
+declare <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmovn.ll b/test/CodeGen/ARM/vmovn.ll
new file mode 100644
index 000000000000..d4d027c26c61
--- /dev/null
+++ b/test/CodeGen/ARM/vmovn.ll
@@ -0,0 +1,26 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmovn\\.i16} %t | count 1
+; RUN: grep {vmovn\\.i32} %t | count 1
+; RUN: grep {vmovn\\.i64} %t | count 1
+
+define <8 x i8> @vmovni16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vmovni32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vmovni64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
new file mode 100644
index 000000000000..eb9ae7b95c2d
--- /dev/null
+++ b/test/CodeGen/ARM/vmul.ll
@@ -0,0 +1,79 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmul\\.i8} %t | count 2
+; RUN: grep {vmul\\.i16} %t | count 2
+; RUN: grep {vmul\\.i32} %t | count 2
+; RUN: grep {vmul\\.f32} %t | count 2
+; RUN: grep {vmul\\.p8} %t | count 2
+
+define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = mul <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vmuli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = mul <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vmuli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = mul <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <2 x float> @vmulf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = mul <2 x float> %tmp1, %tmp2
+ ret <2 x float> %tmp3
+}
+
+define <8 x i8> @vmulp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @vmulQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = mul <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vmulQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = mul <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vmulQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = mul <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <4 x float> @vmulQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = mul <4 x float> %tmp1, %tmp2
+ ret <4 x float> %tmp3
+}
+
+define <16 x i8> @vmulQp8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmull.ll b/test/CodeGen/ARM/vmull.ll
new file mode 100644
index 000000000000..2fe6f3741139
--- /dev/null
+++ b/test/CodeGen/ARM/vmull.ll
@@ -0,0 +1,67 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmull\\.s8} %t | count 1
+; RUN: grep {vmull\\.s16} %t | count 1
+; RUN: grep {vmull\\.s32} %t | count 1
+; RUN: grep {vmull\\.u8} %t | count 1
+; RUN: grep {vmull\\.u16} %t | count 1
+; RUN: grep {vmull\\.u32} %t | count 1
+; RUN: grep {vmull\\.p8} %t | count 1
+
+define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmvn.ll b/test/CodeGen/ARM/vmvn.ll
new file mode 100644
index 000000000000..9bc135241ec6
--- /dev/null
+++ b/test/CodeGen/ARM/vmvn.ll
@@ -0,0 +1,51 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep vmvn %t | count 8
+; Note: function names do not include "vmvn" to allow simple grep for opcodes
+
+define <8 x i8> @v_mvni8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @v_mvni16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @v_mvni32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @v_mvni64(<1 x i64>* %A) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = xor <1 x i64> %tmp1, < i64 -1 >
+ ret <1 x i64> %tmp2
+}
+
+define <16 x i8> @v_mvnQi8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @v_mvnQi16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @v_mvnQi32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @v_mvnQi64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
+ ret <2 x i64> %tmp2
+}
diff --git a/test/CodeGen/ARM/vneg.ll b/test/CodeGen/ARM/vneg.ll
new file mode 100644
index 000000000000..9fa527f52fcc
--- /dev/null
+++ b/test/CodeGen/ARM/vneg.ll
@@ -0,0 +1,53 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vneg\\.s8} %t | count 2
+; RUN: grep {vneg\\.s16} %t | count 2
+; RUN: grep {vneg\\.s32} %t | count 2
+; RUN: grep {vneg\\.f32} %t | count 2
+
+define <8 x i8> @vnegs8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = sub <8 x i8> zeroinitializer, %tmp1
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vnegs16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = sub <4 x i16> zeroinitializer, %tmp1
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vnegs32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = sub <2 x i32> zeroinitializer, %tmp1
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @vnegf32(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = sub <2 x float> < float -0.000000e+00, float -0.000000e+00 >, %tmp1
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @vnegQs8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = sub <16 x i8> zeroinitializer, %tmp1
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vnegQs16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = sub <8 x i16> zeroinitializer, %tmp1
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vnegQs32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = sub <4 x i32> zeroinitializer, %tmp1
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @vnegQf32(<4 x float>* %A) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = sub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %tmp1
+ ret <4 x float> %tmp2
+}
diff --git a/test/CodeGen/ARM/vorn.ll b/test/CodeGen/ARM/vorn.ll
new file mode 100644
index 000000000000..cee744ccb166
--- /dev/null
+++ b/test/CodeGen/ARM/vorn.ll
@@ -0,0 +1,67 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep vorn %t | count 8
+; Note: function names do not include "vorn" to allow simple grep for opcodes
+
+define <8 x i8> @v_orni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp4 = or <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @v_orni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp4 = or <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @v_orni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
+ %tmp4 = or <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @v_orni64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
+ %tmp4 = or <1 x i64> %tmp1, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @v_ornQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp4 = or <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @v_ornQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp4 = or <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @v_ornQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
+ %tmp4 = or <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @v_ornQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
+ %tmp4 = or <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
diff --git a/test/CodeGen/ARM/vorr.ll b/test/CodeGen/ARM/vorr.ll
new file mode 100644
index 000000000000..39f4814c66c8
--- /dev/null
+++ b/test/CodeGen/ARM/vorr.ll
@@ -0,0 +1,59 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep vorr %t | count 8
+; Note: function names do not include "vorr" to allow simple grep for opcodes
+
+define <8 x i8> @v_orri8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = or <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @v_orri16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = or <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @v_orri32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = or <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @v_orri64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = or <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @v_orrQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = or <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @v_orrQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = or <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @v_orrQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = or <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @v_orrQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = or <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
diff --git a/test/CodeGen/ARM/vpadal.ll b/test/CodeGen/ARM/vpadal.ll
new file mode 100644
index 000000000000..c41c532988e8
--- /dev/null
+++ b/test/CodeGen/ARM/vpadal.ll
@@ -0,0 +1,107 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vpadal\\.s8} %t | count 2
+; RUN: grep {vpadal\\.s16} %t | count 2
+; RUN: grep {vpadal\\.s32} %t | count 2
+; RUN: grep {vpadal\\.u8} %t | count 2
+; RUN: grep {vpadal\\.u16} %t | count 2
+; RUN: grep {vpadal\\.u32} %t | count 2
+
+define <4 x i16> @vpadals8(<4 x i16>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %tmp1, <8 x i8> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vpadals16(<2 x i32>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %tmp1, <4 x i16> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vpadals32(<1 x i64>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %tmp1, <2 x i32> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <4 x i16> @vpadalu8(<4 x i16>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %tmp1, <8 x i8> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vpadalu16(<2 x i32>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %tmp1, <4 x i16> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vpadalu32(<1 x i64>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %tmp1, <2 x i32> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i16> @vpadalQs8(<8 x i16>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %tmp1, <16 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vpadalQs16(<4 x i32>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %tmp1, <8 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vpadalQs32(<2 x i64>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %tmp1, <4 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vpadalQu8(<8 x i16>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %tmp1, <16 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vpadalQu16(<4 x i32>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %tmp1, <8 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vpadalQu32(<2 x i64>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %tmp1, <4 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16>, <8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64>, <2 x i32>) nounwind readnone
+
+declare <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16>, <8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16>, <16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32>, <8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64>, <4 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16>, <16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32>, <8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vpadd.ll b/test/CodeGen/ARM/vpadd.ll
new file mode 100644
index 000000000000..baff49227e64
--- /dev/null
+++ b/test/CodeGen/ARM/vpadd.ll
@@ -0,0 +1,39 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vpadd\\.i8} %t | count 1
+; RUN: grep {vpadd\\.i16} %t | count 1
+; RUN: grep {vpadd\\.i32} %t | count 1
+; RUN: grep {vpadd\\.f32} %t | count 1
+
+define <8 x i8> @vpaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vpaddi.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vpaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vpaddi.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vpaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vpaddi.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <2 x float> @vpaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vpaddf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vpaddi.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vpaddi.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpaddi.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vpaddf.v2f32(<2 x float>, <2 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vpaddl.ll b/test/CodeGen/ARM/vpaddl.ll
new file mode 100644
index 000000000000..babb49515a03
--- /dev/null
+++ b/test/CodeGen/ARM/vpaddl.ll
@@ -0,0 +1,95 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vpaddl\\.s8} %t | count 2
+; RUN: grep {vpaddl\\.s16} %t | count 2
+; RUN: grep {vpaddl\\.s32} %t | count 2
+; RUN: grep {vpaddl\\.u8} %t | count 2
+; RUN: grep {vpaddl\\.u16} %t | count 2
+; RUN: grep {vpaddl\\.u32} %t | count 2
+
+define <4 x i16> @vpaddls8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vpaddls16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vpaddls32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1)
+ ret <1 x i64> %tmp2
+}
+
+define <4 x i16> @vpaddlu8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vpaddlu16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vpaddlu32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1)
+ ret <1 x i64> %tmp2
+}
+
+define <8 x i16> @vpaddlQs8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vpaddlQs16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vpaddlQs32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1)
+ ret <2 x i64> %tmp2
+}
+
+define <8 x i16> @vpaddlQu8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vpaddlQu16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)
+ ret <2 x i64> %tmp2
+}
+
+declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone
+
+declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vpmax.ll b/test/CodeGen/ARM/vpmax.ll
new file mode 100644
index 000000000000..9878ca8c7ba6
--- /dev/null
+++ b/test/CodeGen/ARM/vpmax.ll
@@ -0,0 +1,67 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vpmax\\.s8} %t | count 1
+; RUN: grep {vpmax\\.s16} %t | count 1
+; RUN: grep {vpmax\\.s32} %t | count 1
+; RUN: grep {vpmax\\.u8} %t | count 1
+; RUN: grep {vpmax\\.u16} %t | count 1
+; RUN: grep {vpmax\\.u32} %t | count 1
+; RUN: grep {vpmax\\.f32} %t | count 1
+
+define <8 x i8> @vpmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vpmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vpmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vpmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vpmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vpmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <2 x float> @vpmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vpmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vpmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vpmin.ll b/test/CodeGen/ARM/vpmin.ll
new file mode 100644
index 000000000000..7b5348baa544
--- /dev/null
+++ b/test/CodeGen/ARM/vpmin.ll
@@ -0,0 +1,67 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vpmin\\.s8} %t | count 1
+; RUN: grep {vpmin\\.s16} %t | count 1
+; RUN: grep {vpmin\\.s32} %t | count 1
+; RUN: grep {vpmin\\.u8} %t | count 1
+; RUN: grep {vpmin\\.u16} %t | count 1
+; RUN: grep {vpmin\\.u32} %t | count 1
+; RUN: grep {vpmin\\.f32} %t | count 1
+
+define <8 x i8> @vpmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vpmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vpmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vpminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vpminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vpminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <2 x float> @vpminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vpminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vpminf.v2f32(<2 x float>, <2 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqabs.ll b/test/CodeGen/ARM/vqabs.ll
new file mode 100644
index 000000000000..04b8ad995b91
--- /dev/null
+++ b/test/CodeGen/ARM/vqabs.ll
@@ -0,0 +1,48 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqabs\\.s8} %t | count 2
+; RUN: grep {vqabs\\.s16} %t | count 2
+; RUN: grep {vqabs\\.s32} %t | count 2
+
+define <8 x i8> @vqabss8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqabss16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqabss32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <16 x i8> @vqabsQs8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vqabsQs16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vqabsQs32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqadd.ll b/test/CodeGen/ARM/vqadd.ll
new file mode 100644
index 000000000000..c9e235995360
--- /dev/null
+++ b/test/CodeGen/ARM/vqadd.ll
@@ -0,0 +1,141 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqadd\\.s8} %t | count 2
+; RUN: grep {vqadd\\.s16} %t | count 2
+; RUN: grep {vqadd\\.s32} %t | count 2
+; RUN: grep {vqadd\\.s64} %t | count 2
+; RUN: grep {vqadd\\.u8} %t | count 2
+; RUN: grep {vqadd\\.u16} %t | count 2
+; RUN: grep {vqadd\\.u32} %t | count 2
+; RUN: grep {vqadd\\.u64} %t | count 2
+
+define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqadds64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vqaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqaddu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @vqaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqaddQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vqaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqaddQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqdmlal.ll b/test/CodeGen/ARM/vqdmlal.ll
new file mode 100644
index 000000000000..f05bf530c4c1
--- /dev/null
+++ b/test/CodeGen/ARM/vqdmlal.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqdmlal\\.s16} %t | count 1
+; RUN: grep {vqdmlal\\.s32} %t | count 1
+
+define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqdmlsl.ll b/test/CodeGen/ARM/vqdmlsl.ll
new file mode 100644
index 000000000000..2b71ed556355
--- /dev/null
+++ b/test/CodeGen/ARM/vqdmlsl.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqdmlsl\\.s16} %t | count 1
+; RUN: grep {vqdmlsl\\.s32} %t | count 1
+
+define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqdmulh.ll b/test/CodeGen/ARM/vqdmulh.ll
new file mode 100644
index 000000000000..dd686ddd9393
--- /dev/null
+++ b/test/CodeGen/ARM/vqdmulh.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqdmulh\\.s16} %t | count 2
+; RUN: grep {vqdmulh\\.s32} %t | count 2
+; RUN: grep {vqrdmulh\\.s16} %t | count 2
+; RUN: grep {vqrdmulh\\.s32} %t | count 2
+
+define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqdmull.ll b/test/CodeGen/ARM/vqdmull.ll
new file mode 100644
index 000000000000..3ff90f795df6
--- /dev/null
+++ b/test/CodeGen/ARM/vqdmull.ll
@@ -0,0 +1,20 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqdmull\\.s16} %t | count 1
+; RUN: grep {vqdmull\\.s32} %t | count 1
+
+define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqmovn.ll b/test/CodeGen/ARM/vqmovn.ll
new file mode 100644
index 000000000000..78e6d23416b9
--- /dev/null
+++ b/test/CodeGen/ARM/vqmovn.ll
@@ -0,0 +1,76 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqmovn\\.s16} %t | count 1
+; RUN: grep {vqmovn\\.s32} %t | count 1
+; RUN: grep {vqmovn\\.s64} %t | count 1
+; RUN: grep {vqmovn\\.u16} %t | count 1
+; RUN: grep {vqmovn\\.u32} %t | count 1
+; RUN: grep {vqmovn\\.u64} %t | count 1
+; RUN: grep {vqmovun\\.s16} %t | count 1
+; RUN: grep {vqmovun\\.s32} %t | count 1
+; RUN: grep {vqmovun\\.s64} %t | count 1
+
+define <8 x i8> @vqmovns16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqmovns32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqmovns64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <8 x i8> @vqmovnu16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqmovnu32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqmovnu64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <8 x i8> @vqmovuns16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqmovuns32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqmovuns64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqneg.ll b/test/CodeGen/ARM/vqneg.ll
new file mode 100644
index 000000000000..72035f024ff9
--- /dev/null
+++ b/test/CodeGen/ARM/vqneg.ll
@@ -0,0 +1,48 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqneg\\.s8} %t | count 2
+; RUN: grep {vqneg\\.s16} %t | count 2
+; RUN: grep {vqneg\\.s32} %t | count 2
+
+define <8 x i8> @vqnegs8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqnegs16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqnegs32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <16 x i8> @vqnegQs8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vqnegQs16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vqnegQs32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqrshl.ll b/test/CodeGen/ARM/vqrshl.ll
new file mode 100644
index 000000000000..63e03c124565
--- /dev/null
+++ b/test/CodeGen/ARM/vqrshl.ll
@@ -0,0 +1,141 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqrshl\\.s8} %t | count 2
+; RUN: grep {vqrshl\\.s16} %t | count 2
+; RUN: grep {vqrshl\\.s32} %t | count 2
+; RUN: grep {vqrshl\\.s64} %t | count 2
+; RUN: grep {vqrshl\\.u8} %t | count 2
+; RUN: grep {vqrshl\\.u16} %t | count 2
+; RUN: grep {vqrshl\\.u32} %t | count 2
+; RUN: grep {vqrshl\\.u64} %t | count 2
+
+define <8 x i8> @vqrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vqrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @vqrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vqrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqrshrn.ll b/test/CodeGen/ARM/vqrshrn.ll
new file mode 100644
index 000000000000..38f98f2a2ccf
--- /dev/null
+++ b/test/CodeGen/ARM/vqrshrn.ll
@@ -0,0 +1,76 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqrshrn\\.s16} %t | count 1
+; RUN: grep {vqrshrn\\.s32} %t | count 1
+; RUN: grep {vqrshrn\\.s64} %t | count 1
+; RUN: grep {vqrshrn\\.u16} %t | count 1
+; RUN: grep {vqrshrn\\.u32} %t | count 1
+; RUN: grep {vqrshrn\\.u64} %t | count 1
+; RUN: grep {vqrshrun\\.s16} %t | count 1
+; RUN: grep {vqrshrun\\.s32} %t | count 1
+; RUN: grep {vqrshrun\\.s64} %t | count 1
+
+define <8 x i8> @vqrshrns8(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqrshrns16(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqrshrns32(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+define <8 x i8> @vqrshrnu8(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqrshrnu16(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqrshrnu32(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+define <8 x i8> @vqrshruns8(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqrshruns16(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqrshruns32(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqshl.ll b/test/CodeGen/ARM/vqshl.ll
new file mode 100644
index 000000000000..60b04bd5830e
--- /dev/null
+++ b/test/CodeGen/ARM/vqshl.ll
@@ -0,0 +1,307 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqshl\\.s8} %t | count 4
+; RUN: grep {vqshl\\.s16} %t | count 4
+; RUN: grep {vqshl\\.s32} %t | count 4
+; RUN: grep {vqshl\\.s64} %t | count 4
+; RUN: grep {vqshl\\.u8} %t | count 4
+; RUN: grep {vqshl\\.u16} %t | count 4
+; RUN: grep {vqshl\\.u32} %t | count 4
+; RUN: grep {vqshl\\.u64} %t | count 4
+; RUN: grep {vqshl\\.s8.*#7} %t | count 2
+; RUN: grep {vqshl\\.s16.*#15} %t | count 2
+; RUN: grep {vqshl\\.s32.*#31} %t | count 2
+; RUN: grep {vqshl\\.s64.*#63} %t | count 2
+; RUN: grep {vqshl\\.u8.*#7} %t | count 2
+; RUN: grep {vqshl\\.u16.*#15} %t | count 2
+; RUN: grep {vqshl\\.u32.*#31} %t | count 2
+; RUN: grep {vqshl\\.u64.*#63} %t | count 2
+; RUN: grep {vqshlu\\.s8} %t | count 2
+; RUN: grep {vqshlu\\.s16} %t | count 2
+; RUN: grep {vqshlu\\.s32} %t | count 2
+; RUN: grep {vqshlu\\.s64} %t | count 2
+
+define <8 x i8> @vqshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vqshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @vqshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vqshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i8> @vqshls_n8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqshls_n16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqshls_n32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vqshls_n64(<1 x i64>* %A) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
+ ret <1 x i64> %tmp2
+}
+
+define <8 x i8> @vqshlu_n8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqshlu_n16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqshlu_n32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vqshlu_n64(<1 x i64>* %A) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
+ ret <1 x i64> %tmp2
+}
+
+define <8 x i8> @vqshlsu_n8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqshlsu_n16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqshlsu_n32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vqshlsu_n64(<1 x i64>* %A) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
+ ret <1 x i64> %tmp2
+}
+
+define <16 x i8> @vqshlQs_n8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vqshlQs_n16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vqshlQs_n32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vqshlQs_n64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
+ ret <2 x i64> %tmp2
+}
+
+define <16 x i8> @vqshlQu_n8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vqshlQu_n16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vqshlQu_n32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vqshlQu_n64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
+ ret <2 x i64> %tmp2
+}
+
+define <16 x i8> @vqshlQsu_n8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vqshlQsu_n16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vqshlQsu_n32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vqshlQsu_n64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
+ ret <2 x i64> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqshrn.ll b/test/CodeGen/ARM/vqshrn.ll
new file mode 100644
index 000000000000..6bd607abb4d0
--- /dev/null
+++ b/test/CodeGen/ARM/vqshrn.ll
@@ -0,0 +1,76 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqshrn\\.s16} %t | count 1
+; RUN: grep {vqshrn\\.s32} %t | count 1
+; RUN: grep {vqshrn\\.s64} %t | count 1
+; RUN: grep {vqshrn\\.u16} %t | count 1
+; RUN: grep {vqshrn\\.u32} %t | count 1
+; RUN: grep {vqshrn\\.u64} %t | count 1
+; RUN: grep {vqshrun\\.s16} %t | count 1
+; RUN: grep {vqshrun\\.s32} %t | count 1
+; RUN: grep {vqshrun\\.s64} %t | count 1
+
+define <8 x i8> @vqshrns8(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqshrns16(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqshrns32(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+define <8 x i8> @vqshrnu8(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqshrnu16(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqshrnu32(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+define <8 x i8> @vqshruns8(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqshruns16(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqshruns32(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqsub.ll b/test/CodeGen/ARM/vqsub.ll
new file mode 100644
index 000000000000..07052f78dbea
--- /dev/null
+++ b/test/CodeGen/ARM/vqsub.ll
@@ -0,0 +1,141 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vqsub\\.s8} %t | count 2
+; RUN: grep {vqsub\\.s16} %t | count 2
+; RUN: grep {vqsub\\.s32} %t | count 2
+; RUN: grep {vqsub\\.s64} %t | count 2
+; RUN: grep {vqsub\\.u8} %t | count 2
+; RUN: grep {vqsub\\.u16} %t | count 2
+; RUN: grep {vqsub\\.u32} %t | count 2
+; RUN: grep {vqsub\\.u64} %t | count 2
+
+define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqsubs64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vqsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqsubu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @vqsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqsubQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vqsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqsubQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vraddhn.ll b/test/CodeGen/ARM/vraddhn.ll
new file mode 100644
index 000000000000..d69e54554c49
--- /dev/null
+++ b/test/CodeGen/ARM/vraddhn.ll
@@ -0,0 +1,29 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vraddhn\\.i16} %t | count 1
+; RUN: grep {vraddhn\\.i32} %t | count 1
+; RUN: grep {vraddhn\\.i64} %t | count 1
+
+define <8 x i8> @vraddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vraddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vraddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vrecpe.ll b/test/CodeGen/ARM/vrecpe.ll
new file mode 100644
index 000000000000..79cb595bc840
--- /dev/null
+++ b/test/CodeGen/ARM/vrecpe.ll
@@ -0,0 +1,33 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vrecpe\\.u32} %t | count 2
+; RUN: grep {vrecpe\\.f32} %t | count 2
+
+define <2 x i32> @vrecpei32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <4 x i32> @vrecpeQi32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x float> @vrecpef32(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = call <2 x float> @llvm.arm.neon.vrecpef.v2f32(<2 x float> %tmp1)
+ ret <2 x float> %tmp2
+}
+
+define <4 x float> @vrecpeQf32(<4 x float>* %A) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = call <4 x float> @llvm.arm.neon.vrecpef.v4f32(<4 x float> %tmp1)
+ ret <4 x float> %tmp2
+}
+
+declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vrecpef.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vrecpef.v4f32(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vrecps.ll b/test/CodeGen/ARM/vrecps.ll
new file mode 100644
index 000000000000..7ded4157f0d1
--- /dev/null
+++ b/test/CodeGen/ARM/vrecps.ll
@@ -0,0 +1,19 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vrecps\\.f32} %t | count 2
+
+define <2 x float> @vrecpsf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @vrecpsQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vrhadd.ll b/test/CodeGen/ARM/vrhadd.ll
new file mode 100644
index 000000000000..4f6437caeabd
--- /dev/null
+++ b/test/CodeGen/ARM/vrhadd.ll
@@ -0,0 +1,107 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vrhadd\\.s8} %t | count 2
+; RUN: grep {vrhadd\\.s16} %t | count 2
+; RUN: grep {vrhadd\\.s32} %t | count 2
+; RUN: grep {vrhadd\\.u8} %t | count 2
+; RUN: grep {vrhadd\\.u16} %t | count 2
+; RUN: grep {vrhadd\\.u32} %t | count 2
+
+define <8 x i8> @vrhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vrhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vrhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vrhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vrhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vrhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @vrhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vrhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vrhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <16 x i8> @vrhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vrhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vrhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vrshl.ll b/test/CodeGen/ARM/vrshl.ll
new file mode 100644
index 000000000000..fbbf5489c719
--- /dev/null
+++ b/test/CodeGen/ARM/vrshl.ll
@@ -0,0 +1,245 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vrshl\\.s8} %t | count 2
+; RUN: grep {vrshl\\.s16} %t | count 2
+; RUN: grep {vrshl\\.s32} %t | count 2
+; RUN: grep {vrshl\\.s64} %t | count 2
+; RUN: grep {vrshl\\.u8} %t | count 2
+; RUN: grep {vrshl\\.u16} %t | count 2
+; RUN: grep {vrshl\\.u32} %t | count 2
+; RUN: grep {vrshl\\.u64} %t | count 2
+; RUN: grep {vrshr\\.s8} %t | count 2
+; RUN: grep {vrshr\\.s16} %t | count 2
+; RUN: grep {vrshr\\.s32} %t | count 2
+; RUN: grep {vrshr\\.s64} %t | count 2
+; RUN: grep {vrshr\\.u8} %t | count 2
+; RUN: grep {vrshr\\.u16} %t | count 2
+; RUN: grep {vrshr\\.u32} %t | count 2
+; RUN: grep {vrshr\\.u64} %t | count 2
+
+define <8 x i8> @vrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @vrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
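+; Rounding right shifts by immediate are represented as calls to the same
+; vrshifts/vrshiftu intrinsics with constant negative shift amounts, for
+; which llc should select the vrshr forms counted in the RUN lines above.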
+define <8 x i8> @vrshrs8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vrshrs16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vrshrs32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vrshrs64(<1 x i64>* %A) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
+ ret <1 x i64> %tmp2
+}
+
+define <8 x i8> @vrshru8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vrshru16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vrshru32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vrshru64(<1 x i64>* %A) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
+ ret <1 x i64> %tmp2
+}
+
+define <16 x i8> @vrshrQs8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vrshrQs16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vrshrQs32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vrshrQs64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
+ ret <2 x i64> %tmp2
+}
+
+define <16 x i8> @vrshrQu8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vrshrQu16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vrshrQu32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vrshrQu64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
+ ret <2 x i64> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vrshrn.ll b/test/CodeGen/ARM/vrshrn.ll
new file mode 100644
index 000000000000..8af24fd4d622
--- /dev/null
+++ b/test/CodeGen/ARM/vrshrn.ll
@@ -0,0 +1,26 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vrshrn\\.i16} %t | count 1
+; RUN: grep {vrshrn\\.i32} %t | count 1
+; RUN: grep {vrshrn\\.i64} %t | count 1
+
+define <8 x i8> @vrshrns8(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vrshrns16(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vrshrns32(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vrsqrte.ll b/test/CodeGen/ARM/vrsqrte.ll
new file mode 100644
index 000000000000..10529f61b56f
--- /dev/null
+++ b/test/CodeGen/ARM/vrsqrte.ll
@@ -0,0 +1,33 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vrsqrte\\.u32} %t | count 2
+; RUN: grep {vrsqrte\\.f32} %t | count 2
+
+define <2 x i32> @vrsqrtei32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <4 x i32> @vrsqrteQi32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x float> @vrsqrtef32(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = call <2 x float> @llvm.arm.neon.vrsqrtef.v2f32(<2 x float> %tmp1)
+ ret <2 x float> %tmp2
+}
+
+define <4 x float> @vrsqrteQf32(<4 x float>* %A) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = call <4 x float> @llvm.arm.neon.vrsqrtef.v4f32(<4 x float> %tmp1)
+ ret <4 x float> %tmp2
+}
+
+declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vrsqrtef.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vrsqrtef.v4f32(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vrsqrts.ll b/test/CodeGen/ARM/vrsqrts.ll
new file mode 100644
index 000000000000..3fd0567842a3
--- /dev/null
+++ b/test/CodeGen/ARM/vrsqrts.ll
@@ -0,0 +1,19 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vrsqrts\\.f32} %t | count 2
+
+define <2 x float> @vrsqrtsf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @vrsqrtsQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vrsubhn.ll b/test/CodeGen/ARM/vrsubhn.ll
new file mode 100644
index 000000000000..587092397c7a
--- /dev/null
+++ b/test/CodeGen/ARM/vrsubhn.ll
@@ -0,0 +1,29 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vrsubhn\\.i16} %t | count 1
+; RUN: grep {vrsubhn\\.i32} %t | count 1
+; RUN: grep {vrsubhn\\.i64} %t | count 1
+
+define <8 x i8> @vrsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vrsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vrsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vset_lane.ll b/test/CodeGen/ARM/vset_lane.ll
new file mode 100644
index 000000000000..279c628aed6e
--- /dev/null
+++ b/test/CodeGen/ARM/vset_lane.ll
@@ -0,0 +1,40 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmov\\.8} %t | count 2
+; RUN: grep {vmov\\.16} %t | count 2
+; RUN: grep {vmov\\.32} %t | count 2
+
+define <8 x i8> @vset_lane8(<8 x i8>* %A, i8 %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = insertelement <8 x i8> %tmp1, i8 %B, i32 1
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vset_lane16(<4 x i16>* %A, i16 %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = insertelement <4 x i16> %tmp1, i16 %B, i32 1
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vset_lane32(<2 x i32>* %A, i32 %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = insertelement <2 x i32> %tmp1, i32 %B, i32 1
+ ret <2 x i32> %tmp2
+}
+
+define <16 x i8> @vsetQ_lane8(<16 x i8>* %A, i8 %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = insertelement <16 x i8> %tmp1, i8 %B, i32 1
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = insertelement <8 x i16> %tmp1, i16 %B, i32 1
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
+ ret <4 x i32> %tmp2
+}
diff --git a/test/CodeGen/ARM/vshift.ll b/test/CodeGen/ARM/vshift.ll
new file mode 100644
index 000000000000..8c5c4aad18d8
--- /dev/null
+++ b/test/CodeGen/ARM/vshift.ll
@@ -0,0 +1,337 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vshl\\.s8} %t | count 2
+; RUN: grep {vshl\\.s16} %t | count 2
+; RUN: grep {vshl\\.s32} %t | count 2
+; RUN: grep {vshl\\.s64} %t | count 2
+; RUN: grep {vshl\\.u8} %t | count 4
+; RUN: grep {vshl\\.u16} %t | count 4
+; RUN: grep {vshl\\.u32} %t | count 4
+; RUN: grep {vshl\\.u64} %t | count 4
+; RUN: grep {vshl\\.i8} %t | count 2
+; RUN: grep {vshl\\.i16} %t | count 2
+; RUN: grep {vshl\\.i32} %t | count 2
+; RUN: grep {vshl\\.i64} %t | count 2
+; RUN: grep {vshr\\.u8} %t | count 2
+; RUN: grep {vshr\\.u16} %t | count 2
+; RUN: grep {vshr\\.u32} %t | count 2
+; RUN: grep {vshr\\.u64} %t | count 2
+; RUN: grep {vshr\\.s8} %t | count 2
+; RUN: grep {vshr\\.s16} %t | count 2
+; RUN: grep {vshr\\.s32} %t | count 2
+; RUN: grep {vshr\\.s64} %t | count 2
+; RUN: grep {vneg\\.s8} %t | count 4
+; RUN: grep {vneg\\.s16} %t | count 4
+; RUN: grep {vneg\\.s32} %t | count 4
+; RUN: grep {vsub\\.i64} %t | count 4
+
+define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shl <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shl <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shl <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = shl <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vshli8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shl <8 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vshli16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shl <4 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15 >
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vshli32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = shl <2 x i32> %tmp1, < i32 31, i32 31 >
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vshli64(<1 x i64>* %A) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = shl <1 x i64> %tmp1, < i64 63 >
+ ret <1 x i64> %tmp2
+}
+
+define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shl <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shl <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shl <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = shl <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vshlQi8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = shl <16 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vshlQi16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = shl <8 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vshlQi32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = shl <4 x i32> %tmp1, < i32 31, i32 31, i32 31, i32 31 >
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vshlQi64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = shl <2 x i64> %tmp1, < i64 63, i64 63 >
+ ret <2 x i64> %tmp2
+}
+
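+; There is no NEON right shift by a register, so the variable right shifts
+; below are expected to be lowered to a negation of the shift amount (vneg,
+; or vsub.i64 for the 64-bit case) followed by vshl, which is what the
+; vneg and vsub.i64 RUN lines above count.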
+define <8 x i8> @vlshru8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = lshr <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vlshru16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = lshr <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vlshru32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = lshr <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vlshru64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = lshr <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vlshri8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = lshr <8 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vlshri16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = lshr <4 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16 >
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vlshri32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = lshr <2 x i32> %tmp1, < i32 32, i32 32 >
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vlshri64(<1 x i64>* %A) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = lshr <1 x i64> %tmp1, < i64 64 >
+ ret <1 x i64> %tmp2
+}
+
+define <16 x i8> @vlshrQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = lshr <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vlshrQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = lshr <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vlshrQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = lshr <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vlshrQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = lshr <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vlshrQi8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = lshr <16 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vlshrQi16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = lshr <8 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vlshrQi32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = lshr <4 x i32> %tmp1, < i32 32, i32 32, i32 32, i32 32 >
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vlshrQi64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = lshr <2 x i64> %tmp1, < i64 64, i64 64 >
+ ret <2 x i64> %tmp2
+}
+
+define <8 x i8> @vashrs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = ashr <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vashrs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = ashr <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vashrs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = ashr <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vashrs64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = ashr <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vashri8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = ashr <8 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vashri16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = ashr <4 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16 >
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vashri32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = ashr <2 x i32> %tmp1, < i32 32, i32 32 >
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vashri64(<1 x i64>* %A) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = ashr <1 x i64> %tmp1, < i64 64 >
+ ret <1 x i64> %tmp2
+}
+
+define <16 x i8> @vashrQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = ashr <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vashrQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = ashr <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vashrQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = ashr <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vashrQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = ashr <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vashrQi8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = ashr <16 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vashrQi16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = ashr <8 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vashrQi32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = ashr <4 x i32> %tmp1, < i32 32, i32 32, i32 32, i32 32 >
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vashrQi64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = ashr <2 x i64> %tmp1, < i64 64, i64 64 >
+ ret <2 x i64> %tmp2
+}
diff --git a/test/CodeGen/ARM/vshiftins.ll b/test/CodeGen/ARM/vshiftins.ll
new file mode 100644
index 000000000000..cb7cbb89ecdb
--- /dev/null
+++ b/test/CodeGen/ARM/vshiftins.ll
@@ -0,0 +1,131 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vsli\\.8} %t | count 2
+; RUN: grep {vsli\\.16} %t | count 2
+; RUN: grep {vsli\\.32} %t | count 2
+; RUN: grep {vsli\\.64} %t | count 2
+; RUN: grep {vsri\\.8} %t | count 2
+; RUN: grep {vsri\\.16} %t | count 2
+; RUN: grep {vsri\\.32} %t | count 2
+; RUN: grep {vsri\\.64} %t | count 2
+
+define <8 x i8> @vsli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vsli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vsli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> < i32 31, i32 31 >)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vsli64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, <1 x i64> < i64 63 >)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @vsliQ8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vsliQ16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsliQ32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsliQ64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, <2 x i64> < i64 63, i64 63 >)
+ ret <2 x i64> %tmp3
+}
+
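+; A shift-and-insert with a constant negative shift amount should select the
+; right variant (vsri); the positive amounts above select vsli.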
+define <8 x i8> @vsri8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vsri16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vsri32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vsri64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, <1 x i64> < i64 -64 >)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @vsriQ8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vsriQ16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsriQ32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsriQ64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vshl.ll b/test/CodeGen/ARM/vshl.ll
new file mode 100644
index 000000000000..993126ea578c
--- /dev/null
+++ b/test/CodeGen/ARM/vshl.ll
@@ -0,0 +1,302 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vshl\\.s8} %t | count 2
+; RUN: grep {vshl\\.s16} %t | count 2
+; RUN: grep {vshl\\.s32} %t | count 2
+; RUN: grep {vshl\\.s64} %t | count 2
+; RUN: grep {vshl\\.u8} %t | count 2
+; RUN: grep {vshl\\.u16} %t | count 2
+; RUN: grep {vshl\\.u32} %t | count 2
+; RUN: grep {vshl\\.u64} %t | count 2
+; RUN: grep {vshl\\.i8} %t | count 2
+; RUN: grep {vshl\\.i16} %t | count 2
+; RUN: grep {vshl\\.i32} %t | count 2
+; RUN: grep {vshl\\.i64} %t | count 2
+; RUN: grep {vshr\\.s8} %t | count 2
+; RUN: grep {vshr\\.s16} %t | count 2
+; RUN: grep {vshr\\.s32} %t | count 2
+; RUN: grep {vshr\\.s64} %t | count 2
+; RUN: grep {vshr\\.u8} %t | count 2
+; RUN: grep {vshr\\.u16} %t | count 2
+; RUN: grep {vshr\\.u32} %t | count 2
+; RUN: grep {vshr\\.u64} %t | count 2
+
+define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+; For left shifts by immediates, the signedness is irrelevant.
+; Test a mix of both signed and unsigned intrinsics.
+
+define <8 x i8> @vshli8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vshli16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vshli32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vshli64(<1 x i64>* %A) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
+ ret <1 x i64> %tmp2
+}
+
+define <16 x i8> @vshlQi8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vshlQi16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vshlQi32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vshlQi64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
+ ret <2 x i64> %tmp2
+}
+
+; Right shift by immediate: expressed as a shift by a constant vector of
+; negative values, for which llc should select the vshr forms counted above.
+
+define <8 x i8> @vshrs8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vshrs16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vshrs32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vshrs64(<1 x i64>* %A) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
+ ret <1 x i64> %tmp2
+}
+
+define <8 x i8> @vshru8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vshru16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vshru32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vshru64(<1 x i64>* %A) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
+ ret <1 x i64> %tmp2
+}
+
+define <16 x i8> @vshrQs8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vshrQs16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vshrQs32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vshrQs64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
+ ret <2 x i64> %tmp2
+}
+
+define <16 x i8> @vshrQu8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vshrQu16(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vshrQu32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vshrQu64(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
+ ret <2 x i64> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vshll.ll b/test/CodeGen/ARM/vshll.ll
new file mode 100644
index 000000000000..f81c09a7b9d3
--- /dev/null
+++ b/test/CodeGen/ARM/vshll.ll
@@ -0,0 +1,74 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vshll\\.s8} %t | count 1
+; RUN: grep {vshll\\.s16} %t | count 1
+; RUN: grep {vshll\\.s32} %t | count 1
+; RUN: grep {vshll\\.u8} %t | count 1
+; RUN: grep {vshll\\.u16} %t | count 1
+; RUN: grep {vshll\\.u32} %t | count 1
+; RUN: grep {vshll\\.i8} %t | count 1
+; RUN: grep {vshll\\.i16} %t | count 1
+; RUN: grep {vshll\\.i32} %t | count 1
+
+define <8 x i16> @vshlls8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vshlls16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vshiftls.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vshlls32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
+ ret <2 x i64> %tmp2
+}
+
+define <8 x i16> @vshllu8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftlu.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vshllu16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vshllu32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftlu.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
+ ret <2 x i64> %tmp2
+}
+
+; The following tests use the maximum shift count, so the signedness is
+; irrelevant. Test both signed and unsigned versions.
+define <8 x i16> @vshlli8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vshlli16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 16, i16 16, i16 16, i16 16 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vshlli32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 32, i32 32 >)
+ ret <2 x i64> %tmp2
+}
+
+declare <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vshiftls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vshiftlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vshiftlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
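
The comment in vshll.ll above observes that the maximum shift count makes signedness irrelevant: when a lane is widened and then shifted left by its full source width, every sign- or zero-extension bit is shifted out of the result, so only the original lane bits survive (now in the upper half) and the backend can select the signedness-neutral vshll.i8/i16/i32 forms. A minimal standalone C++ check of that equivalence for the 8-bit case, illustrative only and not part of the imported tests:

#include <cassert>
#include <cstdint>

int main() {
  // Widen an 8-bit lane to 16 bits, then shift left by 8 (the full source
  // width).  Sign extension and zero extension give identical results,
  // because every extension bit ends up shifted out of the low 16 bits,
  // which is why the maximum shift count selects the signedness-neutral
  // vshll.i8 form rather than vshll.s8 or vshll.u8.
  for (unsigned v = 0; v < 256; ++v) {
    std::uint8_t  lane     = static_cast<std::uint8_t>(v);
    std::uint16_t fromSext = static_cast<std::uint16_t>(
        static_cast<std::uint16_t>(static_cast<std::int8_t>(lane)) << 8);
    std::uint16_t fromZext = static_cast<std::uint16_t>(
        static_cast<std::uint16_t>(lane) << 8);
    assert(fromSext == fromZext);
  }
  return 0;
}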
diff --git a/test/CodeGen/ARM/vshrn.ll b/test/CodeGen/ARM/vshrn.ll
new file mode 100644
index 000000000000..bc640cbbca61
--- /dev/null
+++ b/test/CodeGen/ARM/vshrn.ll
@@ -0,0 +1,26 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vshrn\\.i16} %t | count 1
+; RUN: grep {vshrn\\.i32} %t | count 1
+; RUN: grep {vshrn\\.i64} %t | count 1
+
+define <8 x i8> @vshrns8(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vshrns16(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vshrns32(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vsra.ll b/test/CodeGen/ARM/vsra.ll
new file mode 100644
index 000000000000..e2829dcddae6
--- /dev/null
+++ b/test/CodeGen/ARM/vsra.ll
@@ -0,0 +1,293 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vsra\\.s8} %t | count 2
+; RUN: grep {vsra\\.s16} %t | count 2
+; RUN: grep {vsra\\.s32} %t | count 2
+; RUN: grep {vsra\\.s64} %t | count 2
+; RUN: grep {vsra\\.u8} %t | count 2
+; RUN: grep {vsra\\.u16} %t | count 2
+; RUN: grep {vsra\\.u32} %t | count 2
+; RUN: grep {vsra\\.u64} %t | count 2
+; RUN: grep {vrsra\\.s8} %t | count 2
+; RUN: grep {vrsra\\.s16} %t | count 2
+; RUN: grep {vrsra\\.s32} %t | count 2
+; RUN: grep {vrsra\\.s64} %t | count 2
+; RUN: grep {vrsra\\.u8} %t | count 2
+; RUN: grep {vrsra\\.u16} %t | count 2
+; RUN: grep {vrsra\\.u32} %t | count 2
+; RUN: grep {vrsra\\.u64} %t | count 2
+
+define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = ashr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = ashr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
+ %tmp4 = add <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = ashr <2 x i32> %tmp2, < i32 32, i32 32 >
+ %tmp4 = add <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @vsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = ashr <1 x i64> %tmp2, < i64 64 >
+ %tmp4 = add <1 x i64> %tmp1, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @vsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = ashr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp4 = add <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = ashr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = ashr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = ashr <2 x i64> %tmp2, < i64 64, i64 64 >
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @vsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = lshr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = lshr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
+ %tmp4 = add <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = lshr <2 x i32> %tmp2, < i32 32, i32 32 >
+ %tmp4 = add <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @vsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = lshr <1 x i64> %tmp2, < i64 64 >
+ %tmp4 = add <1 x i64> %tmp1, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @vsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = lshr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp4 = add <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = lshr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = lshr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = lshr <2 x i64> %tmp2, < i64 64, i64 64 >
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @vrsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vrsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
+ %tmp4 = add <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vrsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
+ %tmp4 = add <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @vrsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
+ %tmp4 = add <1 x i64> %tmp1, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <8 x i8> @vrsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vrsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
+ %tmp4 = add <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vrsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
+ %tmp4 = add <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @vrsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
+ %tmp4 = add <1 x i64> %tmp1, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @vrsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ %tmp4 = add <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vrsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vrsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vrsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <16 x i8> @vrsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ %tmp4 = add <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vrsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vrsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vrsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vsub.ll b/test/CodeGen/ARM/vsub.ll
new file mode 100644
index 000000000000..85dea41835f8
--- /dev/null
+++ b/test/CodeGen/ARM/vsub.ll
@@ -0,0 +1,76 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vsub\\.i8} %t | count 2
+; RUN: grep {vsub\\.i16} %t | count 2
+; RUN: grep {vsub\\.i32} %t | count 2
+; RUN: grep {vsub\\.i64} %t | count 2
+; RUN: grep {vsub\\.f32} %t | count 2
+
+define <8 x i8> @vsubi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = sub <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vsubi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = sub <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vsubi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = sub <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vsubi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = sub <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <2 x float> @vsubf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = sub <2 x float> %tmp1, %tmp2
+ ret <2 x float> %tmp3
+}
+
+define <16 x i8> @vsubQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = sub <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vsubQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = sub <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsubQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = sub <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsubQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = sub <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
+
+define <4 x float> @vsubQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = sub <4 x float> %tmp1, %tmp2
+ ret <4 x float> %tmp3
+}
diff --git a/test/CodeGen/ARM/vsubhn.ll b/test/CodeGen/ARM/vsubhn.ll
new file mode 100644
index 000000000000..5adc9ed97d2c
--- /dev/null
+++ b/test/CodeGen/ARM/vsubhn.ll
@@ -0,0 +1,29 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vsubhn\\.i16} %t | count 1
+; RUN: grep {vsubhn\\.i32} %t | count 1
+; RUN: grep {vsubhn\\.i64} %t | count 1
+
+define <8 x i8> @vsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vsubl.ll b/test/CodeGen/ARM/vsubl.ll
new file mode 100644
index 000000000000..d2af455843ee
--- /dev/null
+++ b/test/CodeGen/ARM/vsubl.ll
@@ -0,0 +1,57 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vsubl\\.s8} %t | count 1
+; RUN: grep {vsubl\\.s16} %t | count 1
+; RUN: grep {vsubl\\.s32} %t | count 1
+; RUN: grep {vsubl\\.u8} %t | count 1
+; RUN: grep {vsubl\\.u16} %t | count 1
+; RUN: grep {vsubl\\.u32} %t | count 1
+
+define <8 x i16> @vsubls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsubls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsubls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vsublu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsublu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsublu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vsubw.ll b/test/CodeGen/ARM/vsubw.ll
new file mode 100644
index 000000000000..30bec5532a79
--- /dev/null
+++ b/test/CodeGen/ARM/vsubw.ll
@@ -0,0 +1,57 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vsubw\\.s8} %t | count 1
+; RUN: grep {vsubw\\.s16} %t | count 1
+; RUN: grep {vsubw\\.s32} %t | count 1
+; RUN: grep {vsubw\\.u8} %t | count 1
+; RUN: grep {vsubw\\.u16} %t | count 1
+; RUN: grep {vsubw\\.u32} %t | count 1
+
+define <8 x i16> @vsubws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsubws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsubws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vsubwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsubwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsubwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vtst.ll b/test/CodeGen/ARM/vtst.ll
new file mode 100644
index 000000000000..d4f7d2aef17a
--- /dev/null
+++ b/test/CodeGen/ARM/vtst.ll
@@ -0,0 +1,52 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vtst\\.i8} %t | count 2
+; RUN: grep {vtst\\.i16} %t | count 2
+; RUN: grep {vtst\\.i32} %t | count 2
+
+define <8 x i8> @vtsti8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = and <8 x i8> %tmp1, %tmp2
+ %tmp4 = vicmp ne <8 x i8> %tmp3, zeroinitializer
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vtsti16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = and <4 x i16> %tmp1, %tmp2
+ %tmp4 = vicmp ne <4 x i16> %tmp3, zeroinitializer
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vtsti32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = and <2 x i32> %tmp1, %tmp2
+ %tmp4 = vicmp ne <2 x i32> %tmp3, zeroinitializer
+ ret <2 x i32> %tmp4
+}
+
+define <16 x i8> @vtstQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = and <16 x i8> %tmp1, %tmp2
+ %tmp4 = vicmp ne <16 x i8> %tmp3, zeroinitializer
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vtstQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = and <8 x i16> %tmp1, %tmp2
+ %tmp4 = vicmp ne <8 x i16> %tmp3, zeroinitializer
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vtstQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = and <4 x i32> %tmp1, %tmp2
+ %tmp4 = vicmp ne <4 x i32> %tmp3, zeroinitializer
+ ret <4 x i32> %tmp4
+}
diff --git a/tools/llvm-mc/AsmLexer.cpp b/tools/llvm-mc/AsmLexer.cpp
index 0828594a35b5..dbd3c06db772 100644
--- a/tools/llvm-mc/AsmLexer.cpp
+++ b/tools/llvm-mc/AsmLexer.cpp
@@ -14,6 +14,7 @@
#include "AsmLexer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Config/config.h" // for strtoull.
#include <cerrno>
#include <cstdio>
#include <cstdlib>
@@ -74,17 +75,18 @@ asmtok::TokKind AsmLexer::LexIdentifier() {
while (isalnum(*CurPtr) || *CurPtr == '_' || *CurPtr == '$' ||
*CurPtr == '.' || *CurPtr == '@')
++CurPtr;
- CurStrVal.assign(TokStart, CurPtr); // Include %
+ CurStrVal.assign(TokStart, CurPtr);
return asmtok::Identifier;
}
/// LexPercent: Register: %[a-zA-Z0-9]+
asmtok::TokKind AsmLexer::LexPercent() {
if (!isalnum(*CurPtr))
- return ReturnError(TokStart, "invalid register name");
+ return asmtok::Percent; // Single %.
+
while (isalnum(*CurPtr))
++CurPtr;
- CurStrVal.assign(TokStart, CurPtr); // Skip %
+ CurStrVal.assign(TokStart, CurPtr); // Include %
return asmtok::Register;
}
@@ -242,6 +244,10 @@ asmtok::TokKind AsmLexer::LexToken() {
case '*': return asmtok::Star;
case ',': return asmtok::Comma;
case '$': return asmtok::Dollar;
+ case '|': return asmtok::Pipe;
+ case '^': return asmtok::Caret;
+ case '&': return asmtok::Amp;
+ case '!': return asmtok::Exclaim;
case '%': return LexPercent();
case '/': return LexSlash();
case '#': return LexHash();
@@ -249,6 +255,20 @@ asmtok::TokKind AsmLexer::LexToken() {
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return LexDigit();
+ case '<':
+ if (*CurPtr == '<') {
+ ++CurPtr;
+ return asmtok::LessLess;
+ }
+ // Don't have any use for bare '<' yet.
+ return ReturnError(TokStart, "invalid character in input");
+ case '>':
+ if (*CurPtr == '>') {
+ ++CurPtr;
+ return asmtok::GreaterGreater;
+ }
+ // Don't have any use for bare '>' yet.
+ return ReturnError(TokStart, "invalid character in input");
// TODO: Quoted identifiers (objc methods etc)
// local labels: [0-9][:]
diff --git a/tools/llvm-mc/AsmLexer.h b/tools/llvm-mc/AsmLexer.h
index a6c93230c6cd..23c5f851bc07 100644
--- a/tools/llvm-mc/AsmLexer.h
+++ b/tools/llvm-mc/AsmLexer.h
@@ -42,7 +42,10 @@ namespace asmtok {
Plus, Minus, Tilde,
Slash, // '/'
LParen, RParen,
- Star, Comma, Dollar
+ Star, Comma, Dollar,
+
+ Pipe, Caret, Amp, Exclaim,
+ Percent, LessLess, GreaterGreater
};
}
diff --git a/tools/llvm-mc/AsmParser.cpp b/tools/llvm-mc/AsmParser.cpp
index 715ff3932bc6..9e8b3cf2effe 100644
--- a/tools/llvm-mc/AsmParser.cpp
+++ b/tools/llvm-mc/AsmParser.cpp
@@ -168,7 +168,9 @@ bool AsmParser::ParseX86MemOperand(X86Operand &Op) {
// memory operand consumed.
} else {
     // It must be a parenthesized expression, parse it now.
- if (ParseParenExpr(Disp)) return true;
+ if (ParseParenExpr(Disp) ||
+ ParseBinOpRHS(1, Disp))
+ return true;
// After parsing the base expression we could either have a parenthesized
// memory address or not. If not, return now. If so, eat the (.
@@ -274,9 +276,61 @@ bool AsmParser::ParsePrimaryExpr(int64_t &Res) {
/// expr ::= primaryexpr
///
bool AsmParser::ParseExpression(int64_t &Res) {
- return ParsePrimaryExpr(Res);
+ return ParsePrimaryExpr(Res) ||
+ ParseBinOpRHS(1, Res);
}
-
+
+static unsigned getBinOpPrecedence(asmtok::TokKind K) {
+ switch (K) {
+ default: return 0; // not a binop.
+ case asmtok::Plus:
+ case asmtok::Minus:
+ return 1;
+ case asmtok::Pipe:
+ case asmtok::Caret:
+ case asmtok::Amp:
+ case asmtok::Exclaim:
+ return 2;
+ case asmtok::Star:
+ case asmtok::Slash:
+ case asmtok::Percent:
+ case asmtok::LessLess:
+ case asmtok::GreaterGreater:
+ return 3;
+ }
+}
+
+
+/// ParseBinOpRHS - Parse all binary operators with precedence >= 'Precedence'.
+/// Res contains the LHS of the expression on input.
+bool AsmParser::ParseBinOpRHS(unsigned Precedence, int64_t &Res) {
+ while (1) {
+ unsigned TokPrec = getBinOpPrecedence(Lexer.getKind());
+
+ // If the next token is lower precedence than we are allowed to eat, return
+ // successfully with what we ate already.
+ if (TokPrec < Precedence)
+ return false;
+
+ //asmtok::TokKind BinOp = Lexer.getKind();
+ Lexer.Lex();
+
+ // Eat the next primary expression.
+ int64_t RHS;
+ if (ParsePrimaryExpr(RHS)) return true;
+
+ // If BinOp binds less tightly with RHS than the operator after RHS, let
+ // the pending operator take RHS as its LHS.
+ unsigned NextTokPrec = getBinOpPrecedence(Lexer.getKind());
+ if (TokPrec < NextTokPrec) {
+ if (ParseBinOpRHS(Precedence+1, RHS)) return true;
+ }
+
+ // Merge LHS/RHS: fixme use the right operator etc.
+ Res += RHS;
+ }
+}
+
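The ParseBinOpRHS routine added above is a standard precedence-climbing loop: keep consuming operators whose precedence is at least the current minimum, recurse with a higher minimum whenever the operator after the right-hand side binds more tightly, and fold the result into the left-hand side (the import still folds with a placeholder "Res += RHS", as its fixme notes). Below is a self-contained C++ sketch of the same control flow; the helper names (parsePrimary, precedenceOf) are hypothetical stand-ins rather than the real AsmLexer/AsmParser interfaces, and it folds with the actual operator purely for illustration.

#include <cctype>
#include <cstdlib>
#include <iostream>

// Simplified precedence-climbing evaluator mirroring the shape of
// AsmParser::ParseBinOpRHS: parse a primary expression, then loop over
// binary operators, recursing with a higher minimum precedence when the
// next operator binds more tightly.
namespace {

const char *Cur;   // current position in the input expression

void skipSpaces() {
  while (std::isspace(static_cast<unsigned char>(*Cur)))
    ++Cur;
}

int precedenceOf(char Op) {
  switch (Op) {
  case '+': case '-': return 1;
  case '*': case '/': return 2;
  default:            return 0;   // not a binary operator
  }
}

long parsePrimary() {
  skipSpaces();
  char *End = nullptr;
  long V = std::strtol(Cur, &End, 10);
  Cur = End;
  return V;
}

long parseBinOpRHS(int MinPrec, long LHS) {
  while (true) {
    skipSpaces();
    int TokPrec = precedenceOf(*Cur);
    if (TokPrec < MinPrec)
      return LHS;                  // operator belongs to an enclosing call (or end of input)

    char Op = *Cur++;
    long RHS = parsePrimary();

    skipSpaces();
    if (TokPrec < precedenceOf(*Cur))         // next operator binds tighter:
      RHS = parseBinOpRHS(TokPrec + 1, RHS);  // let it take RHS as its LHS first

    switch (Op) {                  // fold with the real operator (the import still stubs this out)
    case '+': LHS += RHS; break;
    case '-': LHS -= RHS; break;
    case '*': LHS *= RHS; break;
    case '/': LHS /= RHS; break;
    }
  }
}

} // end anonymous namespace

int main() {
  Cur = "1 + 2 * 3 - 4";
  std::cout << parseBinOpRHS(1, parsePrimary()) << "\n";  // prints 3
  return 0;
}

Running the sketch prints 3 for "1 + 2 * 3 - 4", showing that the '*' is folded before the surrounding '+' and '-'.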
diff --git a/tools/llvm-mc/AsmParser.h b/tools/llvm-mc/AsmParser.h
index 82eb433b61ef..1dadb40ce2b8 100644
--- a/tools/llvm-mc/AsmParser.h
+++ b/tools/llvm-mc/AsmParser.h
@@ -40,6 +40,7 @@ private:
bool ParseX86MemOperand(X86Operand &Op);
bool ParseExpression(int64_t &Res);
bool ParsePrimaryExpr(int64_t &Res);
+ bool ParseBinOpRHS(unsigned Precedence, int64_t &Res);
bool ParseParenExpr(int64_t &Res);
};
diff --git a/tools/lto/LTOCodeGenerator.cpp b/tools/lto/LTOCodeGenerator.cpp
index 11e0e5551741..8db573e61fbf 100644
--- a/tools/lto/LTOCodeGenerator.cpp
+++ b/tools/lto/LTOCodeGenerator.cpp
@@ -290,6 +290,11 @@ bool LTOCodeGenerator::assemble(const std::string& asmPath,
args.push_back("-arch");
args.push_back("armv6");
}
+ else if ((strncmp(targetTriple.c_str(), "armv7-apple-", 12) == 0) ||
+ (strncmp(targetTriple.c_str(), "thumbv7-apple-", 14) == 0)) {
+ args.push_back("-arch");
+ args.push_back("armv7");
+ }
// add -static to assembler command line when code model requires
if ( (_assemblerPath != NULL) && (_codeModel == LTO_CODEGEN_PIC_MODEL_STATIC) )
args.push_back("-static");