Diffstat (limited to 'lib')
566 files changed, 32107 insertions, 18511 deletions
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp index 503fbbdab8d67..1f2528fa560f2 100644 --- a/lib/Analysis/AliasAnalysis.cpp +++ b/lib/Analysis/AliasAnalysis.cpp @@ -65,10 +65,127 @@ void AliasAnalysis::copyValue(Value *From, Value *To) { } AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(CallSite CS1, CallSite CS2) { - // FIXME: we can do better. +AliasAnalysis::getModRefInfo(ImmutableCallSite CS, + const Value *P, unsigned Size) { + // Don't assert AA because BasicAA calls us in order to make use of the + // logic here. + + ModRefBehavior MRB = getModRefBehavior(CS); + if (MRB == DoesNotAccessMemory) + return NoModRef; + + ModRefResult Mask = ModRef; + if (MRB == OnlyReadsMemory) + Mask = Ref; + else if (MRB == AliasAnalysis::AccessesArguments) { + bool doesAlias = false; + for (ImmutableCallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); + AI != AE; ++AI) + if (!isNoAlias(*AI, ~0U, P, Size)) { + doesAlias = true; + break; + } + + if (!doesAlias) + return NoModRef; + } + + // If P points to a constant memory location, the call definitely could not + // modify the memory location. + if ((Mask & Mod) && pointsToConstantMemory(P)) + Mask = ModRefResult(Mask & ~Mod); + + // If this is BasicAA, don't forward. + if (!AA) return Mask; + + // Otherwise, fall back to the next AA in the chain. But we can merge + // in any mask we've managed to compute. + return ModRefResult(AA->getModRefInfo(CS, P, Size) & Mask); +} + +AliasAnalysis::ModRefResult +AliasAnalysis::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) { + // Don't assert AA because BasicAA calls us in order to make use of the + // logic here. + + // If CS1 or CS2 are readnone, they don't interact. + ModRefBehavior CS1B = getModRefBehavior(CS1); + if (CS1B == DoesNotAccessMemory) return NoModRef; + + ModRefBehavior CS2B = getModRefBehavior(CS2); + if (CS2B == DoesNotAccessMemory) return NoModRef; + + // If they both only read from memory, there is no dependence. + if (CS1B == OnlyReadsMemory && CS2B == OnlyReadsMemory) + return NoModRef; + + AliasAnalysis::ModRefResult Mask = ModRef; + + // If CS1 only reads memory, the only dependence on CS2 can be + // from CS1 reading memory written by CS2. + if (CS1B == OnlyReadsMemory) + Mask = ModRefResult(Mask & Ref); + + // If CS2 only accesses memory through arguments, accumulate the mod/ref + // information from CS1's references to the memory referenced by + // CS2's arguments. + if (CS2B == AccessesArguments) { + AliasAnalysis::ModRefResult R = NoModRef; + for (ImmutableCallSite::arg_iterator + I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) { + R = ModRefResult((R | getModRefInfo(CS1, *I, UnknownSize)) & Mask); + if (R == Mask) + break; + } + return R; + } + + // If CS1 only accesses memory through arguments, check if CS2 references + // any of the memory referenced by CS1's arguments. If not, return NoModRef. + if (CS1B == AccessesArguments) { + AliasAnalysis::ModRefResult R = NoModRef; + for (ImmutableCallSite::arg_iterator + I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) + if (getModRefInfo(CS2, *I, UnknownSize) != NoModRef) { + R = Mask; + break; + } + if (R == NoModRef) + return R; + } + + // If this is BasicAA, don't forward. + if (!AA) return Mask; + + // Otherwise, fall back to the next AA in the chain. But we can merge + // in any mask we've managed to compute.
+ return ModRefResult(AA->getModRefInfo(CS1, CS2) & Mask); +} + +AliasAnalysis::ModRefBehavior +AliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { + // Don't assert AA because BasicAA calls us in order to make use of the + // logic here. + + ModRefBehavior Min = UnknownModRefBehavior; + + // Call back into the alias analysis with the other form of getModRefBehavior + // to see if it can give a better response. + if (const Function *F = CS.getCalledFunction()) + Min = getModRefBehavior(F); + + // If this is BasicAA, don't forward. + if (!AA) return Min; + + // Otherwise, fall back to the next AA in the chain. But we can merge + // in any result we've managed to compute. + return std::min(AA->getModRefBehavior(CS), Min); +} + +AliasAnalysis::ModRefBehavior +AliasAnalysis::getModRefBehavior(const Function *F) { assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - return AA->getModRefInfo(CS1, CS2); + return AA->getModRefBehavior(F); } @@ -77,87 +194,63 @@ AliasAnalysis::getModRefInfo(CallSite CS1, CallSite CS2) { //===----------------------------------------------------------------------===// AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(LoadInst *L, Value *P, unsigned Size) { - return alias(L->getOperand(0), getTypeStoreSize(L->getType()), - P, Size) ? Ref : NoModRef; +AliasAnalysis::getModRefInfo(const LoadInst *L, const Value *P, unsigned Size) { + // Be conservative in the face of volatile. + if (L->isVolatile()) + return ModRef; + + // If the load address doesn't alias the given address, it doesn't read + // or write the specified memory. + if (!alias(L->getOperand(0), getTypeStoreSize(L->getType()), P, Size)) + return NoModRef; + + // Otherwise, a load just reads. + return Ref; } AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(StoreInst *S, Value *P, unsigned Size) { - // If the stored address cannot alias the pointer in question, then the - // pointer cannot be modified by the store. +AliasAnalysis::getModRefInfo(const StoreInst *S, const Value *P, unsigned Size) { + // Be conservative in the face of volatile. + if (S->isVolatile()) + return ModRef; + + // If the store address cannot alias the pointer in question, then the + // specified memory cannot be modified by the store. if (!alias(S->getOperand(1), getTypeStoreSize(S->getOperand(0)->getType()), P, Size)) return NoModRef; // If the pointer is a pointer to constant memory, then it could not have been // modified by this store. - return pointsToConstantMemory(P) ? NoModRef : Mod; -} - -AliasAnalysis::ModRefBehavior -AliasAnalysis::getModRefBehavior(CallSite CS, - std::vector<PointerAccessInfo> *Info) { - if (CS.doesNotAccessMemory()) - // Can't do better than this. - return DoesNotAccessMemory; - ModRefBehavior MRB = getModRefBehavior(CS.getCalledFunction(), Info); - if (MRB != DoesNotAccessMemory && CS.onlyReadsMemory()) - return OnlyReadsMemory; - return MRB; -} - -AliasAnalysis::ModRefBehavior -AliasAnalysis::getModRefBehavior(Function *F, - std::vector<PointerAccessInfo> *Info) { - if (F) { - if (F->doesNotAccessMemory()) - // Can't do better than this. 
- return DoesNotAccessMemory; - if (F->onlyReadsMemory()) - return OnlyReadsMemory; - if (unsigned id = F->getIntrinsicID()) - return getModRefBehavior(id); - } - return UnknownModRefBehavior; -} + if (pointsToConstantMemory(P)) + return NoModRef; -AliasAnalysis::ModRefBehavior AliasAnalysis::getModRefBehavior(unsigned iid) { -#define GET_INTRINSIC_MODREF_BEHAVIOR -#include "llvm/Intrinsics.gen" -#undef GET_INTRINSIC_MODREF_BEHAVIOR + // Otherwise, a store just writes. + return Mod; } AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) { - ModRefBehavior MRB = getModRefBehavior(CS); - if (MRB == DoesNotAccessMemory) +AliasAnalysis::getModRefInfo(const VAArgInst *V, const Value *P, unsigned Size) { + // If the va_arg address cannot alias the pointer in question, then the + // specified memory cannot be accessed by the va_arg. + if (!alias(V->getOperand(0), UnknownSize, P, Size)) return NoModRef; - - ModRefResult Mask = ModRef; - if (MRB == OnlyReadsMemory) - Mask = Ref; - else if (MRB == AliasAnalysis::AccessesArguments) { - bool doesAlias = false; - for (CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); - AI != AE; ++AI) - if (!isNoAlias(*AI, ~0U, P, Size)) { - doesAlias = true; - break; - } - if (!doesAlias) - return NoModRef; - } + // If the pointer is a pointer to constant memory, then it could not have been + // modified by this va_arg. + if (pointsToConstantMemory(P)) + return NoModRef; - if (!AA) return Mask; + // Otherwise, a va_arg reads and writes. + return ModRef; +} - // If P points to a constant memory location, the call definitely could not - // modify the memory location. - if ((Mask & Mod) && AA->pointsToConstantMemory(P)) - Mask = ModRefResult(Mask & ~Mod); - return ModRefResult(Mask & AA->getModRefInfo(CS, P, Size)); +AliasAnalysis::ModRefBehavior +AliasAnalysis::getIntrinsicModRefBehavior(unsigned iid) { +#define GET_INTRINSIC_MODREF_BEHAVIOR +#include "llvm/Intrinsics.gen" +#undef GET_INTRINSIC_MODREF_BEHAVIOR } // AliasAnalysis destructor: DO NOT move this to the header file for @@ -206,12 +299,12 @@ bool AliasAnalysis::canInstructionRangeModify(const Instruction &I1, const Value *Ptr, unsigned Size) { assert(I1.getParent() == I2.getParent() && "Instructions not in same basic block!"); - BasicBlock::iterator I = const_cast<Instruction*>(&I1); - BasicBlock::iterator E = const_cast<Instruction*>(&I2); + BasicBlock::const_iterator I = &I1; + BasicBlock::const_iterator E = &I2; ++E; // Convert from inclusive to exclusive range. for (; I != E; ++I) // Check every instruction in range - if (getModRefInfo(I, const_cast<Value*>(Ptr), Size) & Mod) + if (getModRefInfo(I, Ptr, Size) & Mod) return true; return false; } @@ -220,7 +313,7 @@ bool AliasAnalysis::canInstructionRangeModify(const Instruction &I1, /// function. 
bool llvm::isNoAliasCall(const Value *V) { if (isa<CallInst>(V) || isa<InvokeInst>(V)) - return CallSite(const_cast<Instruction*>(cast<Instruction>(V))) + return ImmutableCallSite(cast<Instruction>(V)) .paramHasAttr(0, Attribute::NoAlias); return false; } diff --git a/lib/Analysis/AliasAnalysisCounter.cpp b/lib/Analysis/AliasAnalysisCounter.cpp index 1053955ea2339..b17804186a63a 100644 --- a/lib/Analysis/AliasAnalysisCounter.cpp +++ b/lib/Analysis/AliasAnalysisCounter.cpp @@ -34,7 +34,7 @@ namespace { Module *M; public: static char ID; // Class identification, replacement for typeinfo - AliasAnalysisCounter() : ModulePass(&ID) { + AliasAnalysisCounter() : ModulePass(ID) { No = May = Must = 0; NoMR = JustRef = JustMod = MR = 0; } @@ -87,8 +87,8 @@ namespace { /// an analysis interface through multiple inheritance. If needed, it /// should override this to adjust the this pointer as needed for the /// specified pass info. - virtual void *getAdjustedAnalysisPointer(const PassInfo *PI) { - if (PI->isPassID(&AliasAnalysis::ID)) + virtual void *getAdjustedAnalysisPointer(AnalysisID PI) { + if (PI == &AliasAnalysis::ID) return (AliasAnalysis*)this; return this; } @@ -103,17 +103,18 @@ namespace { AliasResult alias(const Value *V1, unsigned V1Size, const Value *V2, unsigned V2Size); - ModRefResult getModRefInfo(CallSite CS, Value *P, unsigned Size); - ModRefResult getModRefInfo(CallSite CS1, CallSite CS2) { + ModRefResult getModRefInfo(ImmutableCallSite CS, + const Value *P, unsigned Size); + ModRefResult getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { return AliasAnalysis::getModRefInfo(CS1,CS2); } }; } char AliasAnalysisCounter::ID = 0; -static RegisterPass<AliasAnalysisCounter> -X("count-aa", "Count Alias Analysis Query Responses", false, true); -static RegisterAnalysisGroup<AliasAnalysis> Y(X); +INITIALIZE_AG_PASS(AliasAnalysisCounter, AliasAnalysis, "count-aa", + "Count Alias Analysis Query Responses", false, true, false); ModulePass *llvm::createAliasAnalysisCounterPass() { return new AliasAnalysisCounter(); @@ -146,7 +147,8 @@ AliasAnalysisCounter::alias(const Value *V1, unsigned V1Size, } AliasAnalysis::ModRefResult -AliasAnalysisCounter::getModRefInfo(CallSite CS, Value *P, unsigned Size) { +AliasAnalysisCounter::getModRefInfo(ImmutableCallSite CS, + const Value *P, unsigned Size) { ModRefResult R = getAnalysis<AliasAnalysis>().getModRefInfo(CS, P, Size); const char *MRString; diff --git a/lib/Analysis/AliasAnalysisEvaluator.cpp b/lib/Analysis/AliasAnalysisEvaluator.cpp index 37ee9fc22c9b9..ce363cbc7bbd6 100644 --- a/lib/Analysis/AliasAnalysisEvaluator.cpp +++ b/lib/Analysis/AliasAnalysisEvaluator.cpp @@ -50,7 +50,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - AAEval() : FunctionPass(&ID) {} + AAEval() : FunctionPass(ID) {} virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AliasAnalysis>(); @@ -74,8 +74,8 @@ namespace { } char AAEval::ID = 0; -static RegisterPass<AAEval> -X("aa-eval", "Exhaustive Alias Analysis Precision Evaluator", false, true); +INITIALIZE_PASS(AAEval, "aa-eval", + "Exhaustive Alias Analysis Precision Evaluator", false, true); FunctionPass *llvm::createAAEvalPass() { return new AAEval(); } @@ -107,6 +107,15 @@ PrintModRefResults(const char *Msg, bool P, Instruction *I, Value *Ptr, } } +static inline void +PrintModRefResults(const char *Msg, bool P, CallSite CSA, CallSite CSB, + Module *M) { + if (P) { + errs() << " " << Msg << ": " << *CSA.getInstruction() + << " <-> " << 
*CSB.getInstruction() << '\n'; + } +} + static inline bool isInterestingPointer(Value *V) { return V->getType()->isPointerTy() && !isa<ConstantPointerNull>(V); @@ -126,8 +135,7 @@ bool AAEval::runOnFunction(Function &F) { if (I->getType()->isPointerTy()) // Add all pointer instructions. Pointers.insert(&*I); Instruction &Inst = *I; - CallSite CS = CallSite::get(&Inst); - if (CS) { + if (CallSite CS = cast<Value>(&Inst)) { Value *Callee = CS.getCalledValue(); // Skip actual functions for direct function calls. if (!isa<Function>(Callee) && isInterestingPointer(Callee)) @@ -137,6 +145,7 @@ bool AAEval::runOnFunction(Function &F) { AI != AE; ++AI) if (isInterestingPointer(*AI)) Pointers.insert(*AI); + CallSites.insert(CS); } else { // Consider all operands. for (Instruction::op_iterator OI = Inst.op_begin(), OE = Inst.op_end(); @@ -144,8 +153,6 @@ bool AAEval::runOnFunction(Function &F) { if (isInterestingPointer(*OI)) Pointers.insert(*OI); } - - if (CS.getInstruction()) CallSites.insert(CS); } if (PrintNoAlias || PrintMayAlias || PrintMustAlias || @@ -197,13 +204,13 @@ bool AAEval::runOnFunction(Function &F) { PrintModRefResults("NoModRef", PrintNoModRef, I, *V, F.getParent()); ++NoModRef; break; case AliasAnalysis::Mod: - PrintModRefResults(" Mod", PrintMod, I, *V, F.getParent()); + PrintModRefResults("Just Mod", PrintMod, I, *V, F.getParent()); ++Mod; break; case AliasAnalysis::Ref: - PrintModRefResults(" Ref", PrintRef, I, *V, F.getParent()); + PrintModRefResults("Just Ref", PrintRef, I, *V, F.getParent()); ++Ref; break; case AliasAnalysis::ModRef: - PrintModRefResults(" ModRef", PrintModRef, I, *V, F.getParent()); + PrintModRefResults("Both ModRef", PrintModRef, I, *V, F.getParent()); ++ModRef; break; default: errs() << "Unknown alias query result!\n"; @@ -211,6 +218,29 @@ bool AAEval::runOnFunction(Function &F) { } } + // Mod/ref alias analysis: compare all pairs of calls + for (SetVector<CallSite>::iterator C = CallSites.begin(), + Ce = CallSites.end(); C != Ce; ++C) { + for (SetVector<CallSite>::iterator D = CallSites.begin(); D != Ce; ++D) { + if (D == C) + continue; + switch (AA.getModRefInfo(*C, *D)) { + case AliasAnalysis::NoModRef: + PrintModRefResults("NoModRef", PrintNoModRef, *C, *D, F.getParent()); + ++NoModRef; break; + case AliasAnalysis::Mod: + PrintModRefResults("Just Mod", PrintMod, *C, *D, F.getParent()); + ++Mod; break; + case AliasAnalysis::Ref: + PrintModRefResults("Just Ref", PrintRef, *C, *D, F.getParent()); + ++Ref; break; + case AliasAnalysis::ModRef: + PrintModRefResults("Both ModRef", PrintModRef, *C, *D, F.getParent()); + ++ModRef; break; + } + } + } + return false; } diff --git a/lib/Analysis/AliasDebugger.cpp b/lib/Analysis/AliasDebugger.cpp index bc2d9c55d1837..b9fe64608c01c 100644 --- a/lib/Analysis/AliasDebugger.cpp +++ b/lib/Analysis/AliasDebugger.cpp @@ -39,7 +39,7 @@ namespace { public: static char ID; // Class identification, replacement for typeinfo - AliasDebugger() : ModulePass(&ID) {} + AliasDebugger() : ModulePass(ID) {} bool runOnModule(Module &M) { InitializeAliasAnalysis(this); // set up super class @@ -83,8 +83,8 @@ namespace { /// an analysis interface through multiple inheritance. If needed, it /// should override this to adjust the this pointer as needed for the /// specified pass info. 
- virtual void *getAdjustedAnalysisPointer(const PassInfo *PI) { - if (PI->isPassID(&AliasAnalysis::ID)) + virtual void *getAdjustedAnalysisPointer(AnalysisID PI) { + if (PI == &AliasAnalysis::ID) return (AliasAnalysis*)this; return this; } @@ -99,12 +99,14 @@ namespace { return AliasAnalysis::alias(V1, V1Size, V2, V2Size); } - ModRefResult getModRefInfo(CallSite CS, Value *P, unsigned Size) { + ModRefResult getModRefInfo(ImmutableCallSite CS, + const Value *P, unsigned Size) { assert(Vals.find(P) != Vals.end() && "Never seen value in AA before"); return AliasAnalysis::getModRefInfo(CS, P, Size); } - ModRefResult getModRefInfo(CallSite CS1, CallSite CS2) { + ModRefResult getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { return AliasAnalysis::getModRefInfo(CS1,CS2); } @@ -126,9 +128,8 @@ namespace { } char AliasDebugger::ID = 0; -static RegisterPass<AliasDebugger> -X("debug-aa", "AA use debugger", false, true); -static RegisterAnalysisGroup<AliasAnalysis> Y(X); +INITIALIZE_AG_PASS(AliasDebugger, AliasAnalysis, "debug-aa", + "AA use debugger", false, true, false); Pass *llvm::createAliasDebugger() { return new AliasDebugger(); } diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp index 02aff50d8a13a..e74543bb508aa 100644 --- a/lib/Analysis/AliasSetTracker.cpp +++ b/lib/Analysis/AliasSetTracker.cpp @@ -22,7 +22,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/InstIterator.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -35,6 +34,7 @@ void AliasSet::mergeSetIn(AliasSet &AS, AliasSetTracker &AST) { // Update the alias and access types of this set... AccessTy |= AS.AccessTy; AliasTy |= AS.AliasTy; + Volatile |= AS.Volatile; if (AliasTy == MustAlias) { // Check that these two merged sets really are must aliases. Since both @@ -111,11 +111,11 @@ void AliasSet::addPointer(AliasSetTracker &AST, PointerRec &Entry, *PtrListEnd = &Entry; PtrListEnd = Entry.setPrevInList(PtrListEnd); assert(*PtrListEnd == 0 && "End of list is not null?"); - addRef(); // Entry points to alias set... + addRef(); // Entry points to alias set. } void AliasSet::addCallSite(CallSite CS, AliasAnalysis &AA) { - CallSites.push_back(CS); + CallSites.push_back(CS.getInstruction()); AliasAnalysis::ModRefBehavior Behavior = AA.getModRefBehavior(CS); if (Behavior == AliasAnalysis::DoesNotAccessMemory) @@ -140,7 +140,7 @@ bool AliasSet::aliasesPointer(const Value *Ptr, unsigned Size, assert(CallSites.empty() && "Illegal must alias set!"); // If this is a set of MustAliases, only check to see if the pointer aliases - // SOME value in the set... + // SOME value in the set. PointerRec *SomePtr = getSomePointer(); assert(SomePtr && "Empty must-alias set??"); return AA.alias(SomePtr->getValue(), SomePtr->getSize(), Ptr, Size); @@ -155,8 +155,7 @@ bool AliasSet::aliasesPointer(const Value *Ptr, unsigned Size, // Check the call sites list and invoke list... 
if (!CallSites.empty()) { for (unsigned i = 0, e = CallSites.size(); i != e; ++i) - if (AA.getModRefInfo(CallSites[i], const_cast<Value*>(Ptr), Size) - != AliasAnalysis::NoModRef) + if (AA.getModRefInfo(CallSites[i], Ptr, Size) != AliasAnalysis::NoModRef) return true; } @@ -167,10 +166,11 @@ bool AliasSet::aliasesCallSite(CallSite CS, AliasAnalysis &AA) const { if (AA.doesNotAccessMemory(CS)) return false; - for (unsigned i = 0, e = CallSites.size(); i != e; ++i) - if (AA.getModRefInfo(CallSites[i], CS) != AliasAnalysis::NoModRef || - AA.getModRefInfo(CS, CallSites[i]) != AliasAnalysis::NoModRef) + for (unsigned i = 0, e = CallSites.size(); i != e; ++i) { + if (AA.getModRefInfo(getCallSite(i), CS) != AliasAnalysis::NoModRef || + AA.getModRefInfo(CS, getCallSite(i)) != AliasAnalysis::NoModRef) return true; + } for (iterator I = begin(), E = end(); I != E; ++I) if (AA.getModRefInfo(CS, I.getPointer(), I.getSize()) != @@ -200,14 +200,15 @@ void AliasSetTracker::clear() { AliasSet *AliasSetTracker::findAliasSetForPointer(const Value *Ptr, unsigned Size) { AliasSet *FoundSet = 0; - for (iterator I = begin(), E = end(); I != E; ++I) - if (!I->Forward && I->aliasesPointer(Ptr, Size, AA)) { - if (FoundSet == 0) { // If this is the first alias set ptr can go into. - FoundSet = I; // Remember it. - } else { // Otherwise, we must merge the sets. - FoundSet->mergeSetIn(*I, *this); // Merge in contents. - } + for (iterator I = begin(), E = end(); I != E; ++I) { + if (I->Forward || !I->aliasesPointer(Ptr, Size, AA)) continue; + + if (FoundSet == 0) { // If this is the first alias set ptr can go into. + FoundSet = I; // Remember it. + } else { // Otherwise, we must merge the sets. + FoundSet->mergeSetIn(*I, *this); // Merge in contents. } + } return FoundSet; } @@ -226,15 +227,15 @@ bool AliasSetTracker::containsPointer(Value *Ptr, unsigned Size) const { AliasSet *AliasSetTracker::findAliasSetForCallSite(CallSite CS) { AliasSet *FoundSet = 0; - for (iterator I = begin(), E = end(); I != E; ++I) - if (!I->Forward && I->aliasesCallSite(CS, AA)) { - if (FoundSet == 0) { // If this is the first alias set ptr can go into. - FoundSet = I; // Remember it. - } else if (!I->Forward) { // Otherwise, we must merge the sets. - FoundSet->mergeSetIn(*I, *this); // Merge in contents. - } - } - + for (iterator I = begin(), E = end(); I != E; ++I) { + if (I->Forward || !I->aliasesCallSite(CS, AA)) + continue; + + if (FoundSet == 0) // If this is the first alias set ptr can go into. + FoundSet = I; // Remember it. + else if (!I->Forward) // Otherwise, we must merge the sets. + FoundSet->mergeSetIn(*I, *this); // Merge in contents. + } return FoundSet; } @@ -247,22 +248,24 @@ AliasSet &AliasSetTracker::getAliasSetForPointer(Value *Pointer, unsigned Size, bool *New) { AliasSet::PointerRec &Entry = getEntryFor(Pointer); - // Check to see if the pointer is already known... + // Check to see if the pointer is already known. if (Entry.hasAliasSet()) { Entry.updateSize(Size); // Return the set! return *Entry.getAliasSet(*this)->getForwardedTarget(*this); - } else if (AliasSet *AS = findAliasSetForPointer(Pointer, Size)) { - // Add it to the alias set it aliases... + } + + if (AliasSet *AS = findAliasSetForPointer(Pointer, Size)) { + // Add it to the alias set it aliases. AS->addPointer(*this, Entry, Size); return *AS; - } else { - if (New) *New = true; - // Otherwise create a new alias set to hold the loaded pointer... 
- AliasSets.push_back(new AliasSet()); - AliasSets.back().addPointer(*this, Entry, Size); - return AliasSets.back(); } + + if (New) *New = true; + // Otherwise create a new alias set to hold the loaded pointer. + AliasSets.push_back(new AliasSet()); + AliasSets.back().addPointer(*this, Entry, Size); + return AliasSets.back(); } bool AliasSetTracker::add(Value *Ptr, unsigned Size) { @@ -305,28 +308,27 @@ bool AliasSetTracker::add(CallSite CS) { return true; // doesn't alias anything AliasSet *AS = findAliasSetForCallSite(CS); - if (!AS) { - AliasSets.push_back(new AliasSet()); - AS = &AliasSets.back(); - AS->addCallSite(CS, AA); - return true; - } else { + if (AS) { AS->addCallSite(CS, AA); return false; } + AliasSets.push_back(new AliasSet()); + AS = &AliasSets.back(); + AS->addCallSite(CS, AA); + return true; } bool AliasSetTracker::add(Instruction *I) { - // Dispatch to one of the other add methods... + // Dispatch to one of the other add methods. if (LoadInst *LI = dyn_cast<LoadInst>(I)) return add(LI); - else if (StoreInst *SI = dyn_cast<StoreInst>(I)) + if (StoreInst *SI = dyn_cast<StoreInst>(I)) return add(SI); - else if (CallInst *CI = dyn_cast<CallInst>(I)) + if (CallInst *CI = dyn_cast<CallInst>(I)) return add(CI); - else if (InvokeInst *II = dyn_cast<InvokeInst>(I)) + if (InvokeInst *II = dyn_cast<InvokeInst>(I)) return add(II); - else if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I)) + if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I)) return add(VAAI); return true; } @@ -343,23 +345,23 @@ void AliasSetTracker::add(const AliasSetTracker &AST) { // Loop over all of the alias sets in AST, adding the pointers contained // therein into the current alias sets. This can cause alias sets to be // merged together in the current AST. - for (const_iterator I = AST.begin(), E = AST.end(); I != E; ++I) - if (!I->Forward) { // Ignore forwarding alias sets - AliasSet &AS = const_cast<AliasSet&>(*I); - - // If there are any call sites in the alias set, add them to this AST. - for (unsigned i = 0, e = AS.CallSites.size(); i != e; ++i) - add(AS.CallSites[i]); - - // Loop over all of the pointers in this alias set... - AliasSet::iterator I = AS.begin(), E = AS.end(); - bool X; - for (; I != E; ++I) { - AliasSet &NewAS = addPointer(I.getPointer(), I.getSize(), - (AliasSet::AccessType)AS.AccessTy, X); - if (AS.isVolatile()) NewAS.setVolatile(); - } + for (const_iterator I = AST.begin(), E = AST.end(); I != E; ++I) { + if (I->Forward) continue; // Ignore forwarding alias sets + + AliasSet &AS = const_cast<AliasSet&>(*I); + + // If there are any call sites in the alias set, add them to this AST. + for (unsigned i = 0, e = AS.CallSites.size(); i != e; ++i) + add(AS.CallSites[i]); + + // Loop over all of the pointers in this alias set. + bool X; + for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) { + AliasSet &NewAS = addPointer(ASI.getPointer(), ASI.getSize(), + (AliasSet::AccessType)AS.AccessTy, X); + if (AS.isVolatile()) NewAS.setVolatile(); } + } } /// remove - Remove the specified (potentially non-empty) alias set from the @@ -435,11 +437,11 @@ bool AliasSetTracker::remove(Instruction *I) { // Dispatch to one of the other remove methods... 
if (LoadInst *LI = dyn_cast<LoadInst>(I)) return remove(LI); - else if (StoreInst *SI = dyn_cast<StoreInst>(I)) + if (StoreInst *SI = dyn_cast<StoreInst>(I)) return remove(SI); - else if (CallInst *CI = dyn_cast<CallInst>(I)) + if (CallInst *CI = dyn_cast<CallInst>(I)) return remove(CI); - else if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I)) + if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I)) return remove(VAAI); return true; } @@ -455,12 +457,17 @@ void AliasSetTracker::deleteValue(Value *PtrVal) { AA.deleteValue(PtrVal); // If this is a call instruction, remove the callsite from the appropriate - // AliasSet. - CallSite CS = CallSite::get(PtrVal); - if (CS.getInstruction()) - if (!AA.doesNotAccessMemory(CS)) - if (AliasSet *AS = findAliasSetForCallSite(CS)) - AS->removeCallSite(CS); + // AliasSet (if present). + if (CallSite CS = PtrVal) { + if (!AA.doesNotAccessMemory(CS)) { + // Scan all the alias sets to see if this call site is contained. + for (iterator I = begin(), E = end(); I != E; ++I) { + if (I->Forward) continue; + + I->removeCallSite(CS); + } + } + } // First, look up the PointerRec for this pointer. PointerMapType::iterator I = PointerMap.find(PtrVal); @@ -510,7 +517,7 @@ void AliasSetTracker::copyValue(Value *From, Value *To) { //===----------------------------------------------------------------------===// void AliasSet::print(raw_ostream &OS) const { - OS << " AliasSet[" << format("0x%p", (void*)this) << "," << RefCount << "] "; + OS << " AliasSet[" << (void*)this << ", " << RefCount << "] "; OS << (AliasTy == MustAlias ? "must" : "may") << " alias, "; switch (AccessTy) { case NoModRef: OS << "No access "; break; @@ -536,7 +543,7 @@ void AliasSet::print(raw_ostream &OS) const { OS << "\n " << CallSites.size() << " Call Sites: "; for (unsigned i = 0, e = CallSites.size(); i != e; ++i) { if (i) OS << ", "; - WriteAsOperand(OS, CallSites[i].getCalledValue()); + WriteAsOperand(OS, CallSites[i]); } } OS << "\n"; @@ -580,7 +587,7 @@ namespace { AliasSetTracker *Tracker; public: static char ID; // Pass identification, replacement for typeid - AliasSetPrinter() : FunctionPass(&ID) {} + AliasSetPrinter() : FunctionPass(ID) {} virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); @@ -600,5 +607,5 @@ namespace { } char AliasSetPrinter::ID = 0; -static RegisterPass<AliasSetPrinter> -X("print-alias-sets", "Alias Set Printer", false, true); +INITIALIZE_PASS(AliasSetPrinter, "print-alias-sets", + "Alias Set Printer", false, true); diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index 4f53a6d62559b..113c72b94dac5 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -18,6 +18,7 @@ #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" +#include "llvm/GlobalAlias.h" #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" @@ -30,6 +31,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" #include <algorithm> using namespace llvm; @@ -137,8 +139,8 @@ namespace { /// struct NoAA : public ImmutablePass, public AliasAnalysis { static char ID; // Class identification, replacement for typeinfo - NoAA() : ImmutablePass(&ID) {} - explicit NoAA(void *PID) : ImmutablePass(PID) { } + NoAA() : ImmutablePass(ID) {} + explicit NoAA(char &PID) : ImmutablePass(PID) { } virtual void getAnalysisUsage(AnalysisUsage &AU) const { } @@ 
-152,16 +154,20 @@ namespace { return MayAlias; } - virtual void getArgumentAccesses(Function *F, CallSite CS, - std::vector<PointerAccessInfo> &Info) { - llvm_unreachable("This method may not be called on this function!"); + virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS) { + return UnknownModRefBehavior; + } + virtual ModRefBehavior getModRefBehavior(const Function *F) { + return UnknownModRefBehavior; } virtual bool pointsToConstantMemory(const Value *P) { return false; } - virtual ModRefResult getModRefInfo(CallSite CS, Value *P, unsigned Size) { + virtual ModRefResult getModRefInfo(ImmutableCallSite CS, + const Value *P, unsigned Size) { return ModRef; } - virtual ModRefResult getModRefInfo(CallSite CS1, CallSite CS2) { + virtual ModRefResult getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { return ModRef; } @@ -169,11 +175,11 @@ namespace { virtual void copyValue(Value *From, Value *To) {} /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it should - /// override this to adjust the this pointer as needed for the specified pass - /// info. - virtual void *getAdjustedAnalysisPointer(const PassInfo *PI) { - if (PI->isPassID(&AliasAnalysis::ID)) + /// an analysis interface through multiple inheritance. If needed, it + /// should override this to adjust the this pointer as needed for the + /// specified pass info. + virtual void *getAdjustedAnalysisPointer(const void *ID) { + if (ID == &AliasAnalysis::ID) return (AliasAnalysis*)this; return this; } @@ -182,15 +188,279 @@ namespace { // Register this pass... char NoAA::ID = 0; -static RegisterPass<NoAA> -U("no-aa", "No Alias Analysis (always returns 'may' alias)", true, true); - -// Declare that we implement the AliasAnalysis interface -static RegisterAnalysisGroup<AliasAnalysis> V(U); +INITIALIZE_AG_PASS(NoAA, AliasAnalysis, "no-aa", + "No Alias Analysis (always returns 'may' alias)", + true, true, false); ImmutablePass *llvm::createNoAAPass() { return new NoAA(); } //===----------------------------------------------------------------------===// +// GetElementPtr Instruction Decomposition and Analysis +//===----------------------------------------------------------------------===// + +namespace { + enum ExtensionKind { + EK_NotExtended, + EK_SignExt, + EK_ZeroExt + }; + + struct VariableGEPIndex { + const Value *V; + ExtensionKind Extension; + int64_t Scale; + }; +} + + +/// GetLinearExpression - Analyze the specified value as a linear expression: +/// "A*V + B", where A and B are constant integers. Return the scale and offset +/// values as APInts and return V as a Value*, and return whether we looked +/// through any sign or zero extends. The incoming Value is known to have +/// IntegerType and it may already be sign or zero extended. +/// +/// Note that this looks through extends, so the high bits may not be +/// represented in the result. +static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset, + ExtensionKind &Extension, + const TargetData &TD, unsigned Depth) { + assert(V->getType()->isIntegerTy() && "Not an integer value"); + + // Limit our recursion depth. + if (Depth == 6) { + Scale = 1; + Offset = 0; + return V; + } + + if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) { + if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) { + switch (BOp->getOpcode()) { + default: break; + case Instruction::Or: + // X|C == X+C if all the bits in C are unset in X. 
Otherwise we can't + // analyze it. + if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), &TD)) + break; + // FALL THROUGH. + case Instruction::Add: + V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension, + TD, Depth+1); + Offset += RHSC->getValue(); + return V; + case Instruction::Mul: + V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension, + TD, Depth+1); + Offset *= RHSC->getValue(); + Scale *= RHSC->getValue(); + return V; + case Instruction::Shl: + V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension, + TD, Depth+1); + Offset <<= RHSC->getValue().getLimitedValue(); + Scale <<= RHSC->getValue().getLimitedValue(); + return V; + } + } + } + + // Since GEP indices are sign extended anyway, we don't care about the high + // bits of a sign or zero extended value - just scales and offsets. The + // extensions have to be consistent though. + if ((isa<SExtInst>(V) && Extension != EK_ZeroExt) || + (isa<ZExtInst>(V) && Extension != EK_SignExt)) { + Value *CastOp = cast<CastInst>(V)->getOperand(0); + unsigned OldWidth = Scale.getBitWidth(); + unsigned SmallWidth = CastOp->getType()->getPrimitiveSizeInBits(); + Scale.trunc(SmallWidth); + Offset.trunc(SmallWidth); + Extension = isa<SExtInst>(V) ? EK_SignExt : EK_ZeroExt; + + Value *Result = GetLinearExpression(CastOp, Scale, Offset, Extension, + TD, Depth+1); + Scale.zext(OldWidth); + Offset.zext(OldWidth); + + return Result; + } + + Scale = 1; + Offset = 0; + return V; +} + +/// DecomposeGEPExpression - If V is a symbolic pointer expression, decompose it +/// into a base pointer with a constant offset and a number of scaled symbolic +/// offsets. +/// +/// The scaled symbolic offsets (represented by pairs of a Value* and a scale in +/// the VarIndices vector) are Value*'s that are known to be scaled by the +/// specified amount, but which may have other unrepresented high bits. As such, +/// the gep cannot necessarily be reconstructed from its decomposed form. +/// +/// When TargetData is around, this function is capable of analyzing everything +/// that Value::getUnderlyingObject() can look through. When not, it just looks +/// through pointer casts. +/// +static const Value * +DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, + SmallVectorImpl<VariableGEPIndex> &VarIndices, + const TargetData *TD) { + // Limit recursion depth to limit compile time in crazy cases. + unsigned MaxLookup = 6; + + BaseOffs = 0; + do { + // See if this is a bitcast or GEP. + const Operator *Op = dyn_cast<Operator>(V); + if (Op == 0) { + // The only non-operator case we can handle are GlobalAliases. + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { + if (!GA->mayBeOverridden()) { + V = GA->getAliasee(); + continue; + } + } + return V; + } + + if (Op->getOpcode() == Instruction::BitCast) { + V = Op->getOperand(0); + continue; + } + + const GEPOperator *GEPOp = dyn_cast<GEPOperator>(Op); + if (GEPOp == 0) + return V; + + // Don't attempt to analyze GEPs over unsized objects. + if (!cast<PointerType>(GEPOp->getOperand(0)->getType()) + ->getElementType()->isSized()) + return V; + + // If we are lacking TargetData information, we can't compute the offsets of + // elements computed by GEPs. However, we can handle bitcast equivalent + // GEPs. + if (TD == 0) { + if (!GEPOp->hasAllZeroIndices()) + return V; + V = GEPOp->getOperand(0); + continue; + } + + // Walk the indices of the GEP, accumulating them into BaseOff/VarIndices.
+ gep_type_iterator GTI = gep_type_begin(GEPOp); + for (User::const_op_iterator I = GEPOp->op_begin()+1, + E = GEPOp->op_end(); I != E; ++I) { + Value *Index = *I; + // Compute the (potentially symbolic) offset in bytes for this index. + if (const StructType *STy = dyn_cast<StructType>(*GTI++)) { + // For a struct, add the member offset. + unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue(); + if (FieldNo == 0) continue; + + BaseOffs += TD->getStructLayout(STy)->getElementOffset(FieldNo); + continue; + } + + // For an array/pointer, add the element offset, explicitly scaled. + if (ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) { + if (CIdx->isZero()) continue; + BaseOffs += TD->getTypeAllocSize(*GTI)*CIdx->getSExtValue(); + continue; + } + + uint64_t Scale = TD->getTypeAllocSize(*GTI); + ExtensionKind Extension = EK_NotExtended; + + // If the integer type is smaller than the pointer size, it is implicitly + // sign extended to pointer size. + unsigned Width = cast<IntegerType>(Index->getType())->getBitWidth(); + if (TD->getPointerSizeInBits() > Width) + Extension = EK_SignExt; + + // Use GetLinearExpression to decompose the index into a C1*V+C2 form. + APInt IndexScale(Width, 0), IndexOffset(Width, 0); + Index = GetLinearExpression(Index, IndexScale, IndexOffset, Extension, + *TD, 0); + + // The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale. + // This gives us an aggregate computation of (C1*Scale)*V + C2*Scale. + BaseOffs += IndexOffset.getZExtValue()*Scale; + Scale *= IndexScale.getZExtValue(); + + + // If we already had an occurrence of this index variable, merge this + // scale into it. For example, we want to handle: + // A[x][x] -> x*16 + x*4 -> x*20 + // This also ensures that 'x' only appears in the index list once. + for (unsigned i = 0, e = VarIndices.size(); i != e; ++i) { + if (VarIndices[i].V == Index && + VarIndices[i].Extension == Extension) { + Scale += VarIndices[i].Scale; + VarIndices.erase(VarIndices.begin()+i); + break; + } + } + + // Make sure that we have a scale that makes sense for this target's + // pointer size. + if (unsigned ShiftBits = 64-TD->getPointerSizeInBits()) { + Scale <<= ShiftBits; + Scale >>= ShiftBits; + } + + if (Scale) { + VariableGEPIndex Entry = {Index, Extension, Scale}; + VarIndices.push_back(Entry); + } + } + + // Analyze the base pointer next. + V = GEPOp->getOperand(0); + } while (--MaxLookup); + + // If the chain of expressions is too deep, just return early. + return V; +} + +/// GetIndexDifference - Dest and Src are the variable indices from two +/// decomposed GetElementPtr instructions GEP1 and GEP2 which have common base +/// pointers. Subtract the GEP2 indices from GEP1 to find the symbolic +/// difference between the two pointers. +static void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest, + const SmallVectorImpl<VariableGEPIndex> &Src) { + if (Src.empty()) return; + + for (unsigned i = 0, e = Src.size(); i != e; ++i) { + const Value *V = Src[i].V; + ExtensionKind Extension = Src[i].Extension; + int64_t Scale = Src[i].Scale; + + // Find V in Dest. This is N^2, but pointer indices almost never have more + // than a few variable indexes. + for (unsigned j = 0, e = Dest.size(); j != e; ++j) { + if (Dest[j].V != V || Dest[j].Extension != Extension) continue; + + // If we found it, subtract off Scale V's from the entry in Dest. If it + // goes to zero, remove the entry.
+ if (Dest[j].Scale != Scale) + Dest[j].Scale -= Scale; + else + Dest.erase(Dest.begin()+j); + Scale = 0; + break; + } + + // If we didn't consume this entry, add it to the end of the Dest list. + if (Scale) { + VariableGEPIndex Entry = { V, Extension, -Scale }; + Dest.push_back(Entry); + } + } +} + +//===----------------------------------------------------------------------===// // BasicAliasAnalysis Pass //===----------------------------------------------------------------------===// @@ -220,10 +490,10 @@ namespace { /// derives from the NoAA class. struct BasicAliasAnalysis : public NoAA { static char ID; // Class identification, replacement for typeinfo - BasicAliasAnalysis() : NoAA(&ID) {} + BasicAliasAnalysis() : NoAA(ID) {} - AliasResult alias(const Value *V1, unsigned V1Size, - const Value *V2, unsigned V2Size) { + virtual AliasResult alias(const Value *V1, unsigned V1Size, + const Value *V2, unsigned V2Size) { assert(Visited.empty() && "Visited must be cleared after use!"); assert(notDifferentParent(V1, V2) && "BasicAliasAnalysis doesn't support interprocedural queries."); @@ -232,19 +502,33 @@ namespace { return Alias; } - ModRefResult getModRefInfo(CallSite CS, Value *P, unsigned Size); - ModRefResult getModRefInfo(CallSite CS1, CallSite CS2); + virtual ModRefResult getModRefInfo(ImmutableCallSite CS, + const Value *P, unsigned Size); + + virtual ModRefResult getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { + // The AliasAnalysis base class has some smarts, lets use them. + return AliasAnalysis::getModRefInfo(CS1, CS2); + } /// pointsToConstantMemory - Chase pointers until we find a (constant /// global) or not. - bool pointsToConstantMemory(const Value *P); + virtual bool pointsToConstantMemory(const Value *P); + + /// getModRefBehavior - Return the behavior when calling the given + /// call site. + virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS); + + /// getModRefBehavior - Return the behavior when calling the given function. + /// For use when the call site is not known. + virtual ModRefBehavior getModRefBehavior(const Function *F); /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it should - /// override this to adjust the this pointer as needed for the specified pass - /// info. - virtual void *getAdjustedAnalysisPointer(const PassInfo *PI) { - if (PI->isPassID(&AliasAnalysis::ID)) + /// an analysis interface through multiple inheritance. If needed, it + /// should override this to adjust the this pointer as needed for the + /// specified pass info. + virtual void *getAdjustedAnalysisPointer(const void *ID) { + if (ID == &AliasAnalysis::ID) return (AliasAnalysis*)this; return this; } @@ -275,11 +559,9 @@ namespace { // Register this pass... char BasicAliasAnalysis::ID = 0; -static RegisterPass<BasicAliasAnalysis> -X("basicaa", "Basic Alias Analysis (default AA impl)", false, true); - -// Declare that we implement the AliasAnalysis interface -static RegisterAnalysisGroup<AliasAnalysis, true> Y(X); +INITIALIZE_AG_PASS(BasicAliasAnalysis, AliasAnalysis, "basicaa", + "Basic Alias Analysis (default AA impl)", + false, true, true); ImmutablePass *llvm::createBasicAliasAnalysisPass() { return new BasicAliasAnalysis(); @@ -295,16 +577,50 @@ bool BasicAliasAnalysis::pointsToConstantMemory(const Value *P) { // global to be marked constant in some modules and non-constant in others. // GV may even be a declaration, not a definition. 
return GV->isConstant(); - return false; + + return NoAA::pointsToConstantMemory(P); } +/// getModRefBehavior - Return the behavior when calling the given call site. +AliasAnalysis::ModRefBehavior +BasicAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { + if (CS.doesNotAccessMemory()) + // Can't do better than this. + return DoesNotAccessMemory; + + ModRefBehavior Min = UnknownModRefBehavior; + + // If the callsite knows it only reads memory, don't return worse + // than that. + if (CS.onlyReadsMemory()) + Min = OnlyReadsMemory; + + // The AliasAnalysis base class has some smarts, lets use them. + return std::min(AliasAnalysis::getModRefBehavior(CS), Min); +} + +/// getModRefBehavior - Return the behavior when calling the given function. +/// For use when the call site is not known. +AliasAnalysis::ModRefBehavior +BasicAliasAnalysis::getModRefBehavior(const Function *F) { + if (F->doesNotAccessMemory()) + // Can't do better than this. + return DoesNotAccessMemory; + if (F->onlyReadsMemory()) + return OnlyReadsMemory; + if (unsigned id = F->getIntrinsicID()) + return getIntrinsicModRefBehavior(id); + + return NoAA::getModRefBehavior(F); +} /// getModRefInfo - Check to see if the specified callsite can clobber the /// specified memory object. Since we only look at local properties of this /// function, we really can't say much about this query. We do, however, use /// simple "address taken" analysis on local objects. AliasAnalysis::ModRefResult -BasicAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) { +BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS, + const Value *P, unsigned Size) { assert(notDifferentParent(CS.getInstruction(), P) && "AliasAnalysis query involving multiple functions!"); @@ -316,7 +632,7 @@ BasicAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) { // the current function not to the current function, and a tail callee // may reference them. if (isa<AllocaInst>(Object)) - if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) + if (const CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) if (CI->isTailCall()) return NoModRef; @@ -327,7 +643,7 @@ BasicAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) { isNonEscapingLocalObject(Object)) { bool PassedAsArg = false; unsigned ArgNo = 0; - for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); + for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); CI != CE; ++CI, ++ArgNo) { // Only look at the no-capture pointer arguments. if (!(*CI)->getType()->isPointerTy() || @@ -338,7 +654,7 @@ BasicAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) { // is impossible to alias the pointer we're checking. If not, we have to // assume that the call could touch the pointer, even though it doesn't // escape. - if (!isNoAlias(cast<Value>(CI), ~0U, P, ~0U)) { + if (!isNoAlias(cast<Value>(CI), UnknownSize, P, UnknownSize)) { PassedAsArg = true; break; } @@ -349,127 +665,76 @@ BasicAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) { } // Finally, handle specific knowledge of intrinsics. 
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction()); - if (II == 0) - return AliasAnalysis::getModRefInfo(CS, P, Size); - - switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::memcpy: - case Intrinsic::memmove: { - unsigned Len = ~0U; - if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2))) - Len = LenCI->getZExtValue(); - Value *Dest = II->getArgOperand(0); - Value *Src = II->getArgOperand(1); - if (isNoAlias(Dest, Len, P, Size)) { - if (isNoAlias(Src, Len, P, Size)) - return NoModRef; - return Ref; - } - break; - } - case Intrinsic::memset: - // Since memset is 'accesses arguments' only, the AliasAnalysis base class - // will handle it for the variable length case. - if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2))) { - unsigned Len = LenCI->getZExtValue(); + const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction()); + if (II != 0) + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::memcpy: + case Intrinsic::memmove: { + unsigned Len = UnknownSize; + if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2))) + Len = LenCI->getZExtValue(); Value *Dest = II->getArgOperand(0); - if (isNoAlias(Dest, Len, P, Size)) + Value *Src = II->getArgOperand(1); + if (isNoAlias(Dest, Len, P, Size)) { + if (isNoAlias(Src, Len, P, Size)) + return NoModRef; + return Ref; + } + break; + } + case Intrinsic::memset: + // Since memset is 'accesses arguments' only, the AliasAnalysis base class + // will handle it for the variable length case. + if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2))) { + unsigned Len = LenCI->getZExtValue(); + Value *Dest = II->getArgOperand(0); + if (isNoAlias(Dest, Len, P, Size)) + return NoModRef; + } + break; + case Intrinsic::atomic_cmp_swap: + case Intrinsic::atomic_swap: + case Intrinsic::atomic_load_add: + case Intrinsic::atomic_load_sub: + case Intrinsic::atomic_load_and: + case Intrinsic::atomic_load_nand: + case Intrinsic::atomic_load_or: + case Intrinsic::atomic_load_xor: + case Intrinsic::atomic_load_max: + case Intrinsic::atomic_load_min: + case Intrinsic::atomic_load_umax: + case Intrinsic::atomic_load_umin: + if (TD) { + Value *Op1 = II->getArgOperand(0); + unsigned Op1Size = TD->getTypeStoreSize(Op1->getType()); + if (isNoAlias(Op1, Op1Size, P, Size)) + return NoModRef; + } + break; + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::invariant_start: { + unsigned PtrSize = + cast<ConstantInt>(II->getArgOperand(0))->getZExtValue(); + if (isNoAlias(II->getArgOperand(1), PtrSize, P, Size)) return NoModRef; + break; } - break; - case Intrinsic::atomic_cmp_swap: - case Intrinsic::atomic_swap: - case Intrinsic::atomic_load_add: - case Intrinsic::atomic_load_sub: - case Intrinsic::atomic_load_and: - case Intrinsic::atomic_load_nand: - case Intrinsic::atomic_load_or: - case Intrinsic::atomic_load_xor: - case Intrinsic::atomic_load_max: - case Intrinsic::atomic_load_min: - case Intrinsic::atomic_load_umax: - case Intrinsic::atomic_load_umin: - if (TD) { - Value *Op1 = II->getArgOperand(0); - unsigned Op1Size = TD->getTypeStoreSize(Op1->getType()); - if (isNoAlias(Op1, Op1Size, P, Size)) + case Intrinsic::invariant_end: { + unsigned PtrSize = + cast<ConstantInt>(II->getArgOperand(1))->getZExtValue(); + if (isNoAlias(II->getArgOperand(2), PtrSize, P, Size)) return NoModRef; + break; + } } - break; - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - case Intrinsic::invariant_start: { - unsigned PtrSize = 
cast<ConstantInt>(II->getArgOperand(0))->getZExtValue(); - if (isNoAlias(II->getArgOperand(1), PtrSize, P, Size)) - return NoModRef; - break; - } - case Intrinsic::invariant_end: { - unsigned PtrSize = cast<ConstantInt>(II->getArgOperand(1))->getZExtValue(); - if (isNoAlias(II->getArgOperand(2), PtrSize, P, Size)) - return NoModRef; - break; - } - } // The AliasAnalysis base class has some smarts, lets use them. return AliasAnalysis::getModRefInfo(CS, P, Size); } -AliasAnalysis::ModRefResult -BasicAliasAnalysis::getModRefInfo(CallSite CS1, CallSite CS2) { - // If CS1 or CS2 are readnone, they don't interact. - ModRefBehavior CS1B = AliasAnalysis::getModRefBehavior(CS1); - if (CS1B == DoesNotAccessMemory) return NoModRef; - - ModRefBehavior CS2B = AliasAnalysis::getModRefBehavior(CS2); - if (CS2B == DoesNotAccessMemory) return NoModRef; - - // If they both only read from memory, just return ref. - if (CS1B == OnlyReadsMemory && CS2B == OnlyReadsMemory) - return Ref; - - // Otherwise, fall back to NoAA (mod+ref). - return NoAA::getModRefInfo(CS1, CS2); -} - -/// GetIndiceDifference - Dest and Src are the variable indices from two -/// decomposed GetElementPtr instructions GEP1 and GEP2 which have common base -/// pointers. Subtract the GEP2 indices from GEP1 to find the symbolic -/// difference between the two pointers. -static void GetIndiceDifference( - SmallVectorImpl<std::pair<const Value*, int64_t> > &Dest, - const SmallVectorImpl<std::pair<const Value*, int64_t> > &Src) { - if (Src.empty()) return; - - for (unsigned i = 0, e = Src.size(); i != e; ++i) { - const Value *V = Src[i].first; - int64_t Scale = Src[i].second; - - // Find V in Dest. This is N^2, but pointer indices almost never have more - // than a few variable indexes. - for (unsigned j = 0, e = Dest.size(); j != e; ++j) { - if (Dest[j].first != V) continue; - - // If we found it, subtract off Scale V's from the entry in Dest. If it - // goes to zero, remove the entry. - if (Dest[j].second != Scale) - Dest[j].second -= Scale; - else - Dest.erase(Dest.begin()+j); - Scale = 0; - break; - } - - // If we didn't consume this entry, add it to the end of the Dest list. - if (Scale) - Dest.push_back(std::make_pair(V, -Scale)); - } -} - /// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP instruction /// against another pointer. We know that V1 is a GEP, but we don't know /// anything about V2. UnderlyingV1 is GEP1->getUnderlyingObject(), @@ -488,13 +753,14 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, unsigned V1Size, return MayAlias; int64_t GEP1BaseOffset; - SmallVector<std::pair<const Value*, int64_t>, 4> GEP1VariableIndices; + SmallVector<VariableGEPIndex, 4> GEP1VariableIndices; // If we have two gep instructions with must-alias'ing base pointers, figure // out if the indexes to the GEP tell us anything about the derived pointer. if (const GEPOperator *GEP2 = dyn_cast<GEPOperator>(V2)) { // Do the base pointers alias? - AliasResult BaseAlias = aliasCheck(UnderlyingV1, ~0U, UnderlyingV2, ~0U); + AliasResult BaseAlias = aliasCheck(UnderlyingV1, UnknownSize, + UnderlyingV2, UnknownSize); // If we get a No or May, then return it immediately, no amount of analysis // will improve this situation. 
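The GetLinearExpression / DecomposeGEPExpression machinery added above reduces every GEP index to the linear form A*V + B before any disambiguation is attempted. Below is a minimal standalone sketch of that decomposition, with plain int64_t arithmetic standing in for APInt, a toy expression tree standing in for llvm::Value, and the sign/zero-extension handling omitted; all three are simplifying assumptions for illustration, not the LLVM API.

#include <cstdint>
#include <cstdio>

struct Expr {
  enum Kind { Leaf, Add, Mul, Shl } K;
  const Expr *LHS;  // sub-expression; null for leaves
  int64_t RHS;      // constant operand of Add/Mul/Shl
};

// Returns the leaf V such that E computes Scale*V + Offset, mirroring the
// depth-limited recursion in GetLinearExpression.
static const Expr *getLinear(const Expr *E, int64_t &Scale, int64_t &Offset,
                             unsigned Depth = 0) {
  if (Depth == 6 || E->K == Expr::Leaf) {
    Scale = 1;
    Offset = 0;
    return E;
  }
  const Expr *V = getLinear(E->LHS, Scale, Offset, Depth + 1);
  switch (E->K) {
  case Expr::Add: Offset += E->RHS; break;                    // (S*V+O)+C
  case Expr::Mul: Offset *= E->RHS; Scale *= E->RHS; break;   // (S*V+O)*C
  case Expr::Shl: Offset <<= E->RHS; Scale <<= E->RHS; break; // (S*V+O)<<C
  default: break;
  }
  return V;
}

int main() {
  // (x << 2) + 12 decomposes to 4*x + 12, exactly the C1*V+C2 form the GEP
  // walker feeds into its scaled-index bookkeeping.
  Expr X  = {Expr::Leaf, nullptr, 0};
  Expr Sh = {Expr::Shl, &X, 2};
  Expr A  = {Expr::Add, &Sh, 12};
  int64_t Scale, Offset;
  getLinear(&A, Scale, Offset);
  std::printf("scale=%lld offset=%lld\n", (long long)Scale, (long long)Offset);
}

Past the depth cutoff the value is simply treated as opaque (Scale=1, Offset=0), which costs precision but never soundness, and that is the same trade the real routine makes.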
@@ -507,7 +773,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, unsigned V1Size, DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD); int64_t GEP2BaseOffset; - SmallVector<std::pair<const Value*, int64_t>, 4> GEP2VariableIndices; + SmallVector<VariableGEPIndex, 4> GEP2VariableIndices; const Value *GEP2BasePtr = DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, TD); @@ -523,7 +789,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, unsigned V1Size, // Subtract the GEP2 pointer from the GEP1 pointer to find out their // symbolic difference. GEP1BaseOffset -= GEP2BaseOffset; - GetIndiceDifference(GEP1VariableIndices, GEP2VariableIndices); + GetIndexDifference(GEP1VariableIndices, GEP2VariableIndices); } else { // Check to see if these two pointers are related by the getelementptr @@ -531,10 +797,10 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, unsigned V1Size, // pointer, we know they cannot alias. // If both accesses are unknown size, we can't do anything useful here. - if (V1Size == ~0U && V2Size == ~0U) + if (V1Size == UnknownSize && V2Size == UnknownSize) return MayAlias; - AliasResult R = aliasCheck(UnderlyingV1, ~0U, V2, V2Size); + AliasResult R = aliasCheck(UnderlyingV1, UnknownSize, V2, V2Size); if (R != MustAlias) // If V2 may alias GEP base pointer, conservatively returns MayAlias. // If V2 is known not to alias GEP base pointer, then the two values @@ -578,8 +844,8 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, unsigned V1Size, // provides an offset of 4 bytes (assuming a <= 4 byte access). for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e && GEP1BaseOffset;++i) - if (int64_t RemovedOffset = GEP1BaseOffset/GEP1VariableIndices[i].second) - GEP1BaseOffset -= RemovedOffset*GEP1VariableIndices[i].second; + if (int64_t RemovedOffset = GEP1BaseOffset/GEP1VariableIndices[i].Scale) + GEP1BaseOffset -= RemovedOffset*GEP1VariableIndices[i].Scale; // If our known offset is bigger than the access size, we know we don't have // an alias. @@ -782,8 +1048,8 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, unsigned V1Size, // If the size of one access is larger than the entire object on the other // side, then we know such behavior is undefined and can assume no alias. if (TD) - if ((V1Size != ~0U && isObjectSmallerThan(O2, V1Size, *TD)) || - (V2Size != ~0U && isObjectSmallerThan(O1, V2Size, *TD))) + if ((V1Size != UnknownSize && isObjectSmallerThan(O2, V1Size, *TD)) || + (V2Size != UnknownSize && isObjectSmallerThan(O1, V2Size, *TD))) return NoAlias; // FIXME: This isn't aggressively handling alias(GEP, PHI) for example: if the @@ -810,7 +1076,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, unsigned V1Size, if (const SelectInst *S1 = dyn_cast<SelectInst>(V1)) return aliasSelect(S1, V1Size, V2, V2Size); - return MayAlias; + return NoAA::alias(V1, V1Size, V2, V2Size); } // Make sure that anything that uses AliasAnalysis pulls in this file. 
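A pattern that recurs through the AliasAnalysis and BasicAliasAnalysis changes above is chain merging: each layer computes whatever mod/ref mask it can prove locally, then intersects it with the answer from the next analysis in the chain, as in the new "return ModRefResult(AA->getModRefInfo(CS, P, Size) & Mask);". The following is a minimal sketch of that rule, with a hypothetical two-link chain standing in for the real pass machinery.

#include <cstdio>

// Same bit encoding as AliasAnalysis::ModRefResult: Ref and Mod are
// independent bits, ModRef is both.
enum ModRefResult { NoModRef = 0, Ref = 1, Mod = 2, ModRef = Ref | Mod };

struct Analysis {
  Analysis *Next;          // next analysis in the chain (null at the end)
  ModRefResult (*Local)(); // what this layer can prove on its own

  ModRefResult getModRefInfo() {
    ModRefResult Mask = Local();
    if (Mask == NoModRef) return NoModRef; // proved: no interaction at all
    if (!Next) return Mask;                // end of chain: best we know
    // A bit survives only if every layer in the chain still allows it.
    return ModRefResult(Next->getModRefInfo() & Mask);
  }
};

static ModRefResult onlyReads() { return Ref; }  // e.g. a readonly call
static ModRefResult unknown() { return ModRef; } // conservative fallback

int main() {
  Analysis Fallback = {nullptr, unknown};
  Analysis Front = {&Fallback, onlyReads};
  // Front proves "reads only"; Fallback knows nothing. The chain reports Ref.
  std::printf("%d\n", Front.getModRefInfo()); // prints 1 (Ref)
}

Because the merge is a bitwise AND, any layer that rules a bit out removes it for good; the "if (!AA) return Mask" checks in the real code are the base case of this recursion.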
diff --git a/lib/Analysis/CFGPrinter.cpp b/lib/Analysis/CFGPrinter.cpp index e06704bd897ca..617a362062fc2 100644 --- a/lib/Analysis/CFGPrinter.cpp +++ b/lib/Analysis/CFGPrinter.cpp @@ -25,7 +25,7 @@ using namespace llvm; namespace { struct CFGViewer : public FunctionPass { static char ID; // Pass identifcation, replacement for typeid - CFGViewer() : FunctionPass(&ID) {} + CFGViewer() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F) { F.viewCFG(); @@ -41,13 +41,12 @@ namespace { } char CFGViewer::ID = 0; -static RegisterPass<CFGViewer> -V0("view-cfg", "View CFG of function", false, true); +INITIALIZE_PASS(CFGViewer, "view-cfg", "View CFG of function", false, true); namespace { struct CFGOnlyViewer : public FunctionPass { static char ID; // Pass identifcation, replacement for typeid - CFGOnlyViewer() : FunctionPass(&ID) {} + CFGOnlyViewer() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F) { F.viewCFGOnly(); @@ -63,15 +62,14 @@ namespace { } char CFGOnlyViewer::ID = 0; -static RegisterPass<CFGOnlyViewer> -V1("view-cfg-only", - "View CFG of function (with no function bodies)", false, true); +INITIALIZE_PASS(CFGOnlyViewer, "view-cfg-only", + "View CFG of function (with no function bodies)", false, true); namespace { struct CFGPrinter : public FunctionPass { static char ID; // Pass identification, replacement for typeid - CFGPrinter() : FunctionPass(&ID) {} - explicit CFGPrinter(void *pid) : FunctionPass(pid) {} + CFGPrinter() : FunctionPass(ID) {} + explicit CFGPrinter(char &pid) : FunctionPass(pid) {} virtual bool runOnFunction(Function &F) { std::string Filename = "cfg." + F.getNameStr() + ".dot"; @@ -97,14 +95,14 @@ namespace { } char CFGPrinter::ID = 0; -static RegisterPass<CFGPrinter> -P1("dot-cfg", "Print CFG of function to 'dot' file", false, true); +INITIALIZE_PASS(CFGPrinter, "dot-cfg", "Print CFG of function to 'dot' file", + false, true); namespace { struct CFGOnlyPrinter : public FunctionPass { static char ID; // Pass identification, replacement for typeid - CFGOnlyPrinter() : FunctionPass(&ID) {} - explicit CFGOnlyPrinter(void *pid) : FunctionPass(pid) {} + CFGOnlyPrinter() : FunctionPass(ID) {} + explicit CFGOnlyPrinter(char &pid) : FunctionPass(pid) {} virtual bool runOnFunction(Function &F) { std::string Filename = "cfg." + F.getNameStr() + ".dot"; errs() << "Writing '" << Filename << "'..."; @@ -128,9 +126,9 @@ namespace { } char CFGOnlyPrinter::ID = 0; -static RegisterPass<CFGOnlyPrinter> -P2("dot-cfg-only", - "Print CFG of function to 'dot' file (with no function bodies)", false, true); +INITIALIZE_PASS(CFGOnlyPrinter, "dot-cfg-only", + "Print CFG of function to 'dot' file (with no function bodies)", + false, true); /// viewCFG - This function is meant for use from the debugger. 
You can just /// say 'call F->viewCFG()' and a ghostview window should pop up from the diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index d9b670dea58d2..6a2ab681d1acf 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -38,12 +38,15 @@ add_llvm_library(LLVMAnalysis ProfileInfoLoader.cpp ProfileInfoLoaderPass.cpp ProfileVerifierPass.cpp + RegionInfo.cpp + RegionPrinter.cpp ScalarEvolution.cpp ScalarEvolutionAliasAnalysis.cpp ScalarEvolutionExpander.cpp ScalarEvolutionNormalization.cpp SparsePropagation.cpp Trace.cpp + TypeBasedAliasAnalysis.cpp ValueTracking.cpp ) diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp index 047825884ef35..90eae20858fb9 100644 --- a/lib/Analysis/CaptureTracking.cpp +++ b/lib/Analysis/CaptureTracking.cpp @@ -69,7 +69,7 @@ bool llvm::PointerMayBeCaptured(const Value *V, switch (I->getOpcode()) { case Instruction::Call: case Instruction::Invoke: { - CallSite CS = CallSite::get(I); + CallSite CS(I); // Not captured if the callee is readonly, doesn't return a copy through // its return value and doesn't unwind (a readonly function can leak bits // by throwing an exception or not depending on the input value). diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 13d8f4de48248..0bf7967e83b13 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -778,9 +778,9 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, const Type *DestTy, case Instruction::ICmp: case Instruction::FCmp: assert(0 && "Invalid for compares"); case Instruction::Call: - if (Function *F = dyn_cast<Function>(Ops[CallInst::ArgOffset ? 0:NumOps-1])) + if (Function *F = dyn_cast<Function>(Ops[NumOps - 1])) if (canConstantFoldCallTo(F)) - return ConstantFoldCall(F, Ops+CallInst::ArgOffset, NumOps-1); + return ConstantFoldCall(F, Ops, NumOps - 1); return 0; case Instruction::PtrToInt: // If the input is a inttoptr, eliminate the pair. This requires knowing diff --git a/lib/Analysis/DbgInfoPrinter.cpp b/lib/Analysis/DbgInfoPrinter.cpp index 3532b052dc558..0567750606104 100644 --- a/lib/Analysis/DbgInfoPrinter.cpp +++ b/lib/Analysis/DbgInfoPrinter.cpp @@ -40,7 +40,7 @@ namespace { void printVariableDeclaration(const Value *V); public: static char ID; // Pass identification - PrintDbgInfo() : FunctionPass(&ID), Out(outs()) {} + PrintDbgInfo() : FunctionPass(ID), Out(errs()) {} virtual bool runOnFunction(Function &F); virtual void getAnalysisUsage(AnalysisUsage &AU) const { @@ -48,8 +48,8 @@ namespace { } }; char PrintDbgInfo::ID = 0; - static RegisterPass<PrintDbgInfo> X("print-dbginfo", - "Print debug info in human readable form"); + INITIALIZE_PASS(PrintDbgInfo, "print-dbginfo", + "Print debug info in human readable form", false, false); } FunctionPass *llvm::createDbgInfoPrinterPass() { return new PrintDbgInfo(); } diff --git a/lib/Analysis/DebugInfo.cpp b/lib/Analysis/DebugInfo.cpp index c8d0d22ec2e10..5ca89c658df6f 100644 --- a/lib/Analysis/DebugInfo.cpp +++ b/lib/Analysis/DebugInfo.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/DebugInfo.h" -#include "llvm/Target/TargetMachine.h" // FIXME: LAYERING VIOLATION! 
#include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Intrinsics.h" @@ -22,6 +21,8 @@ #include "llvm/Module.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/raw_ostream.h" @@ -32,7 +33,22 @@ using namespace llvm::dwarf; // DIDescriptor //===----------------------------------------------------------------------===// -StringRef +DIDescriptor::DIDescriptor(const DIFile F) : DbgNode(F.DbgNode) { +} + +DIDescriptor::DIDescriptor(const DISubprogram F) : DbgNode(F.DbgNode) { +} + +DIDescriptor::DIDescriptor(const DILexicalBlock F) : DbgNode(F.DbgNode) { +} + +DIDescriptor::DIDescriptor(const DIVariable F) : DbgNode(F.DbgNode) { +} + +DIDescriptor::DIDescriptor(const DIType F) : DbgNode(F.DbgNode) { +} + +StringRef DIDescriptor::getStringField(unsigned Elt) const { if (DbgNode == 0) return StringRef(); @@ -60,7 +76,8 @@ DIDescriptor DIDescriptor::getDescriptorField(unsigned Elt) const { return DIDescriptor(); if (Elt < DbgNode->getNumOperands()) - return DIDescriptor(dyn_cast_or_null<const MDNode>(DbgNode->getOperand(Elt))); + return + DIDescriptor(dyn_cast_or_null<const MDNode>(DbgNode->getOperand(Elt))); return DIDescriptor(); } @@ -73,6 +90,15 @@ GlobalVariable *DIDescriptor::getGlobalVariableField(unsigned Elt) const { return 0; } +Constant *DIDescriptor::getConstantField(unsigned Elt) const { + if (DbgNode == 0) + return 0; + + if (Elt < DbgNode->getNumOperands()) + return dyn_cast_or_null<Constant>(DbgNode->getOperand(Elt)); + return 0; +} + Function *DIDescriptor::getFunctionField(unsigned Elt) const { if (DbgNode == 0) return 0; @@ -109,6 +135,7 @@ bool DIDescriptor::isDerivedType() const { case dwarf::DW_TAG_restrict_type: case dwarf::DW_TAG_member: case dwarf::DW_TAG_inheritance: + case dwarf::DW_TAG_friend: return true; default: // CompositeTypes are currently modelled as DerivedTypes. @@ -161,7 +188,8 @@ bool DIDescriptor::isSubprogram() const { /// isGlobalVariable - Return true if the specified tag is legal for /// DIGlobalVariable. bool DIDescriptor::isGlobalVariable() const { - return DbgNode && getTag() == dwarf::DW_TAG_variable; + return DbgNode && (getTag() == dwarf::DW_TAG_variable || + getTag() == dwarf::DW_TAG_constant); } /// isGlobal - Return true if the specified tag is legal for DIGlobal. @@ -233,9 +261,8 @@ unsigned DIArray::getNumElements() const { } /// replaceAllUsesWith - Replace all uses of debug info referenced by -/// this descriptor. After this completes, the current debug info value -/// is erased. -void DIDerivedType::replaceAllUsesWith(DIDescriptor &D) { +/// this descriptor. +void DIType::replaceAllUsesWith(DIDescriptor &D) { if (!DbgNode) return; @@ -249,7 +276,7 @@ void DIDerivedType::replaceAllUsesWith(DIDescriptor &D) { const MDNode *DN = D; const Value *V = cast_or_null<Value>(DN); Node->replaceAllUsesWith(const_cast<Value*>(V)); - Node->destroy(); + MDNode::deleteTemporary(Node); } } @@ -277,6 +304,16 @@ bool DIType::Verify() const { return true; } +/// Verify - Verify that a basic type descriptor is well formed. +bool DIBasicType::Verify() const { + return isBasicType(); +} + +/// Verify - Verify that a derived type descriptor is well formed. +bool DIDerivedType::Verify() const { + return isDerivedType(); +} + /// Verify - Verify that a composite type descriptor is well formed. 
bool DICompositeType::Verify() const { if (!DbgNode) @@ -327,7 +364,7 @@ bool DIGlobalVariable::Verify() const { if (!Ty.Verify()) return false; - if (!getGlobal()) + if (!getGlobal() && !getConstant()) return false; return true; @@ -355,7 +392,7 @@ bool DIVariable::Verify() const { bool DILocation::Verify() const { if (!DbgNode) return false; - + return DbgNode->getNumOperands() == 4; } @@ -378,7 +415,7 @@ uint64_t DIDerivedType::getOriginalTypeSize() const { Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type || Tag == dwarf::DW_TAG_restrict_type) { DIType BaseType = getTypeDerivedFrom(); - // If this type is not derived from any type then take conservative + // If this type is not derived from any type then take conservative // approach. if (!BaseType.isValid()) return getSizeInBits(); @@ -387,17 +424,17 @@ uint64_t DIDerivedType::getOriginalTypeSize() const { else return BaseType.getSizeInBits(); } - + return getSizeInBits(); } -/// isInlinedFnArgument - Return trule if this variable provides debugging +/// isInlinedFnArgument - Return true if this variable provides debugging /// information for an inlined function arguments. bool DIVariable::isInlinedFnArgument(const Function *CurFn) { assert(CurFn && "Invalid function"); if (!getContext().isSubprogram()) return false; - // This variable is not inlined function argument if its scope + // This variable is not inlined function argument if its scope // does not describe current function. return !(DISubprogram(getContext()).describes(CurFn)); } @@ -416,7 +453,7 @@ bool DISubprogram::describes(const Function *F) { return false; } -unsigned DISubprogram::isOptimized() const { +unsigned DISubprogram::isOptimized() const { assert (DbgNode && "Invalid subprogram descriptor!"); if (DbgNode->getNumOperands() == 16) return getUnsignedField(15); @@ -426,7 +463,7 @@ unsigned DISubprogram::isOptimized() const { StringRef DIScope::getFilename() const { if (!DbgNode) return StringRef(); - if (isLexicalBlock()) + if (isLexicalBlock()) return DILexicalBlock(DbgNode).getFilename(); if (isSubprogram()) return DISubprogram(DbgNode).getFilename(); @@ -445,7 +482,7 @@ StringRef DIScope::getFilename() const { StringRef DIScope::getDirectory() const { if (!DbgNode) return StringRef(); - if (isLexicalBlock()) + if (isLexicalBlock()) return DILexicalBlock(DbgNode).getDirectory(); if (isSubprogram()) return DISubprogram(DbgNode).getDirectory(); @@ -899,7 +936,26 @@ DICompositeType DIFactory::CreateCompositeType(unsigned Tag, ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang), ContainingType }; - return DICompositeType(MDNode::get(VMContext, &Elts[0], 13)); + + MDNode *Node = MDNode::get(VMContext, &Elts[0], 13); + // Create a named metadata so that we do not lose this enum info. + if (Tag == dwarf::DW_TAG_enumeration_type) { + NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.enum"); + NMD->addOperand(Node); + } + return DICompositeType(Node); +} + + +/// CreateTemporaryType - Create a temporary forward-declared type. +DIType DIFactory::CreateTemporaryType() { + // Give the temporary MDNode a tag. It doesn't matter what tag we + // use here as long as DIType accepts it. 
+ Value *Elts[] = { + GetTagConstant(DW_TAG_base_type) + }; + MDNode *Node = MDNode::getTemporary(VMContext, Elts, array_lengthof(Elts)); + return DIType(Node); } @@ -915,8 +971,8 @@ DICompositeType DIFactory::CreateCompositeTypeEx(unsigned Tag, unsigned Flags, DIType DerivedFrom, DIArray Elements, - unsigned RuntimeLang) { - + unsigned RuntimeLang, + MDNode *ContainingType) { Value *Elts[] = { GetTagConstant(Tag), Context, @@ -929,9 +985,16 @@ DICompositeType DIFactory::CreateCompositeTypeEx(unsigned Tag, ConstantInt::get(Type::getInt32Ty(VMContext), Flags), DerivedFrom, Elements, - ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang) + ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang), + ContainingType }; - return DICompositeType(MDNode::get(VMContext, &Elts[0], 12)); + MDNode *Node = MDNode::get(VMContext, &Elts[0], 13); + // Create a named metadata so that we do not lose this enum info. + if (Tag == dwarf::DW_TAG_enumeration_type) { + NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.enum"); + NMD->addOperand(Node); + } + return DICompositeType(Node); } @@ -980,8 +1043,8 @@ DISubprogram DIFactory::CreateSubprogram(DIDescriptor Context, } /// CreateSubprogramDefinition - Create new subprogram descriptor for the -/// given declaration. -DISubprogram DIFactory::CreateSubprogramDefinition(DISubprogram &SPDeclaration) { +/// given declaration. +DISubprogram DIFactory::CreateSubprogramDefinition(DISubprogram &SPDeclaration){ if (SPDeclaration.isDefinition()) return DISubprogram(SPDeclaration); @@ -1046,6 +1109,38 @@ DIFactory::CreateGlobalVariable(DIDescriptor Context, StringRef Name, return DIGlobalVariable(Node); } +/// CreateGlobalVariable - Create a new descriptor for the specified constant. +DIGlobalVariable +DIFactory::CreateGlobalVariable(DIDescriptor Context, StringRef Name, + StringRef DisplayName, + StringRef LinkageName, + DIFile F, + unsigned LineNo, DIType Ty,bool isLocalToUnit, + bool isDefinition, llvm::Constant *Val) { + Value *Elts[] = { + GetTagConstant(dwarf::DW_TAG_variable), + llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)), + Context, + MDString::get(VMContext, Name), + MDString::get(VMContext, DisplayName), + MDString::get(VMContext, LinkageName), + F, + ConstantInt::get(Type::getInt32Ty(VMContext), LineNo), + Ty, + ConstantInt::get(Type::getInt1Ty(VMContext), isLocalToUnit), + ConstantInt::get(Type::getInt1Ty(VMContext), isDefinition), + Val + }; + + Value *const *Vs = &Elts[0]; + MDNode *Node = MDNode::get(VMContext,Vs, 12); + + // Create a named metadata so that we do not lose this mdnode. + NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv"); + NMD->addOperand(Node); + + return DIGlobalVariable(Node); +} /// CreateVariable - Create a new descriptor for the specified variable. 
DIVariable DIFactory::CreateVariable(unsigned Tag, DIDescriptor Context, @@ -1073,10 +1168,10 @@ DIVariable DIFactory::CreateVariable(unsigned Tag, DIDescriptor Context, char One = '\1'; if (FName.startswith(StringRef(&One, 1))) FName = FName.substr(1); - NamedMDNode *FnLocals = M.getNamedMetadata(Twine("llvm.dbg.lv.", FName)); - if (!FnLocals) - FnLocals = NamedMDNode::Create(VMContext, Twine("llvm.dbg.lv.", FName), - NULL, 0, &M); + + SmallString<32> Out; + NamedMDNode *FnLocals = + M.getOrInsertNamedMetadata(Twine("llvm.dbg.lv.", FName).toStringRef(Out)); FnLocals->addOperand(Node); } return DIVariable(Node); @@ -1089,7 +1184,7 @@ DIVariable DIFactory::CreateComplexVariable(unsigned Tag, DIDescriptor Context, const std::string &Name, DIFile F, unsigned LineNo, - DIType Ty, + DIType Ty, SmallVector<Value *, 9> &addr) { SmallVector<Value *, 9> Elts; Elts.push_back(GetTagConstant(Tag)); @@ -1107,14 +1202,19 @@ DIVariable DIFactory::CreateComplexVariable(unsigned Tag, DIDescriptor Context, /// CreateLexicalBlock - This creates a descriptor for a lexical block with the /// specified parent VMContext. DILexicalBlock DIFactory::CreateLexicalBlock(DIDescriptor Context, - unsigned LineNo, unsigned Col) { + DIFile F, unsigned LineNo, + unsigned Col) { + // Defeat MDNode uniquing for lexical blocks. + static unsigned int unique_id = 0; Value *Elts[] = { GetTagConstant(dwarf::DW_TAG_lexical_block), Context, ConstantInt::get(Type::getInt32Ty(VMContext), LineNo), - ConstantInt::get(Type::getInt32Ty(VMContext), Col) + ConstantInt::get(Type::getInt32Ty(VMContext), Col), + F, + ConstantInt::get(Type::getInt32Ty(VMContext), unique_id++) }; - return DILexicalBlock(MDNode::get(VMContext, &Elts[0], 4)); + return DILexicalBlock(MDNode::get(VMContext, &Elts[0], 6)); } /// CreateNameSpace - This creates a new descriptor for a namespace @@ -1174,7 +1274,7 @@ Instruction *DIFactory::InsertDeclare(Value *Storage, DIVariable D, // If this block already has a terminator then insert this intrinsic // before the terminator.
- if (TerminatorInst *T = InsertAtEnd->getTerminator()) + if (TerminatorInst *T = InsertAtEnd->getTerminator()) return CallInst::Create(DeclareFn, Args, Args+2, "", T); else return CallInst::Create(DeclareFn, Args, Args+2, "", InsertAtEnd);} @@ -1203,7 +1303,7 @@ Instruction *DIFactory::InsertDbgValueIntrinsic(Value *V, uint64_t Offset, if (!ValueFn) ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value); - Value *Args[] = { MDNode::get(V->getContext(), &V, 1), + Value *Args[] = { MDNode::get(V->getContext(), &V, 1), ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset), D }; return CallInst::Create(ValueFn, Args, Args+3, "", InsertAtEnd); @@ -1221,21 +1321,21 @@ void DebugInfoFinder::processModule(Module &M) { ++BI) { if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(BI)) processDeclare(DDI); - + DebugLoc Loc = BI->getDebugLoc(); if (Loc.isUnknown()) continue; - + LLVMContext &Ctx = BI->getContext(); DIDescriptor Scope(Loc.getScope(Ctx)); - + if (Scope.isCompileUnit()) addCompileUnit(DICompileUnit(Scope)); else if (Scope.isSubprogram()) processSubprogram(DISubprogram(Scope)); else if (Scope.isLexicalBlock()) processLexicalBlock(DILexicalBlock(Scope)); - + if (MDNode *IA = Loc.getInlinedAt(Ctx)) processLocation(DILocation(IA)); } @@ -1380,7 +1480,7 @@ static Value *findDbgGlobalDeclare(GlobalVariable *V) { return 0; for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) { - DIDescriptor DIG(cast_or_null<MDNode>(NMD->getOperand(i))); + DIDescriptor DIG(cast<MDNode>(NMD->getOperand(i))); if (!DIG.isGlobalVariable()) continue; if (DIGlobalVariable(DIG).getGlobal() == V) @@ -1393,16 +1493,16 @@ static Value *findDbgGlobalDeclare(GlobalVariable *V) { /// It looks through pointer casts too. static const DbgDeclareInst *findDbgDeclare(const Value *V) { V = V->stripPointerCasts(); - + if (!isa<Instruction>(V) && !isa<Argument>(V)) return 0; - + const Function *F = NULL; if (const Instruction *I = dyn_cast<Instruction>(V)) F = I->getParent()->getParent(); else if (const Argument *A = dyn_cast<Argument>(V)) F = A->getParent(); - + for (Function::const_iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI) for (BasicBlock::const_iterator BI = (*FI).begin(), BE = (*FI).end(); BI != BE; ++BI) @@ -1460,10 +1560,10 @@ DISubprogram llvm::getDISubprogram(const MDNode *Scope) { DIDescriptor D(Scope); if (D.isSubprogram()) return DISubprogram(Scope); - + if (D.isLexicalBlock()) return getDISubprogram(DILexicalBlock(Scope).getContext()); - + return DISubprogram(); } @@ -1471,9 +1571,9 @@ DISubprogram llvm::getDISubprogram(const MDNode *Scope) { DICompositeType llvm::getDICompositeType(DIType T) { if (T.isCompositeType()) return DICompositeType(T); - + if (T.isDerivedType()) return getDICompositeType(DIDerivedType(T).getTypeDerivedFrom()); - + return DICompositeType(); } diff --git a/lib/Analysis/DomPrinter.cpp b/lib/Analysis/DomPrinter.cpp index d95c3761bee62..9f340942f2ccb 100644 --- a/lib/Analysis/DomPrinter.cpp +++ b/lib/Analysis/DomPrinter.cpp @@ -86,99 +86,100 @@ namespace { struct DomViewer : public DOTGraphTraitsViewer<DominatorTree, false> { static char ID; - DomViewer() : DOTGraphTraitsViewer<DominatorTree, false>("dom", &ID){} + DomViewer() : DOTGraphTraitsViewer<DominatorTree, false>("dom", ID){} }; struct DomOnlyViewer : public DOTGraphTraitsViewer<DominatorTree, true> { static char ID; - DomOnlyViewer() : DOTGraphTraitsViewer<DominatorTree, true>("domonly", &ID){} + DomOnlyViewer() : DOTGraphTraitsViewer<DominatorTree, true>("domonly", ID){} }; struct PostDomViewer : public 
DOTGraphTraitsViewer<PostDominatorTree, false> { static char ID; PostDomViewer() : - DOTGraphTraitsViewer<PostDominatorTree, false>("postdom", &ID){} + DOTGraphTraitsViewer<PostDominatorTree, false>("postdom", ID){} }; struct PostDomOnlyViewer : public DOTGraphTraitsViewer<PostDominatorTree, true> { static char ID; PostDomOnlyViewer() : - DOTGraphTraitsViewer<PostDominatorTree, true>("postdomonly", &ID){} + DOTGraphTraitsViewer<PostDominatorTree, true>("postdomonly", ID){} }; } // end anonymous namespace char DomViewer::ID = 0; -RegisterPass<DomViewer> A("view-dom", - "View dominance tree of function"); +INITIALIZE_PASS(DomViewer, "view-dom", + "View dominance tree of function", false, false); char DomOnlyViewer::ID = 0; -RegisterPass<DomOnlyViewer> B("view-dom-only", - "View dominance tree of function " - "(with no function bodies)"); +INITIALIZE_PASS(DomOnlyViewer, "view-dom-only", + "View dominance tree of function (with no function bodies)", + false, false); char PostDomViewer::ID = 0; -RegisterPass<PostDomViewer> C("view-postdom", - "View postdominance tree of function"); +INITIALIZE_PASS(PostDomViewer, "view-postdom", + "View postdominance tree of function", false, false); char PostDomOnlyViewer::ID = 0; -RegisterPass<PostDomOnlyViewer> D("view-postdom-only", - "View postdominance tree of function " - "(with no function bodies)"); +INITIALIZE_PASS(PostDomOnlyViewer, "view-postdom-only", + "View postdominance tree of function " + "(with no function bodies)", + false, false); namespace { struct DomPrinter : public DOTGraphTraitsPrinter<DominatorTree, false> { static char ID; - DomPrinter() : DOTGraphTraitsPrinter<DominatorTree, false>("dom", &ID) {} + DomPrinter() : DOTGraphTraitsPrinter<DominatorTree, false>("dom", ID) {} }; struct DomOnlyPrinter : public DOTGraphTraitsPrinter<DominatorTree, true> { static char ID; - DomOnlyPrinter() : DOTGraphTraitsPrinter<DominatorTree, true>("domonly", &ID) {} + DomOnlyPrinter() : DOTGraphTraitsPrinter<DominatorTree, true>("domonly", ID) {} }; struct PostDomPrinter : public DOTGraphTraitsPrinter<PostDominatorTree, false> { static char ID; PostDomPrinter() : - DOTGraphTraitsPrinter<PostDominatorTree, false>("postdom", &ID) {} + DOTGraphTraitsPrinter<PostDominatorTree, false>("postdom", ID) {} }; struct PostDomOnlyPrinter : public DOTGraphTraitsPrinter<PostDominatorTree, true> { static char ID; PostDomOnlyPrinter() : - DOTGraphTraitsPrinter<PostDominatorTree, true>("postdomonly", &ID) {} + DOTGraphTraitsPrinter<PostDominatorTree, true>("postdomonly", ID) {} }; } // end anonymous namespace char DomPrinter::ID = 0; -RegisterPass<DomPrinter> E("dot-dom", - "Print dominance tree of function " - "to 'dot' file"); +INITIALIZE_PASS(DomPrinter, "dot-dom", + "Print dominance tree of function to 'dot' file", + false, false); char DomOnlyPrinter::ID = 0; -RegisterPass<DomOnlyPrinter> F("dot-dom-only", - "Print dominance tree of function " - "to 'dot' file " - "(with no function bodies)"); +INITIALIZE_PASS(DomOnlyPrinter, "dot-dom-only", + "Print dominance tree of function to 'dot' file " + "(with no function bodies)", + false, false); char PostDomPrinter::ID = 0; -RegisterPass<PostDomPrinter> G("dot-postdom", - "Print postdominance tree of function " - "to 'dot' file"); +INITIALIZE_PASS(PostDomPrinter, "dot-postdom", + "Print postdominance tree of function to 'dot' file", + false, false); char PostDomOnlyPrinter::ID = 0; -RegisterPass<PostDomOnlyPrinter> H("dot-postdom-only", - "Print postdominance tree of function " - "to 'dot' file " - "(with no function 
bodies)"); +INITIALIZE_PASS(PostDomOnlyPrinter, "dot-postdom-only", + "Print postdominance tree of function to 'dot' file " + "(with no function bodies)", + false, false); // Create methods available outside of this file, to use them // "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp index 65c7c6efd8027..b3635283fda58 100644 --- a/lib/Analysis/IPA/CallGraph.cpp +++ b/lib/Analysis/IPA/CallGraph.cpp @@ -42,7 +42,7 @@ class BasicCallGraph : public ModulePass, public CallGraph { public: static char ID; // Class identification, replacement for typeinfo - BasicCallGraph() : ModulePass(&ID), Root(0), + BasicCallGraph() : ModulePass(ID), Root(0), ExternalCallingNode(0), CallsExternalNode(0) {} // runOnModule - Compute the call graph for the specified module. @@ -86,8 +86,8 @@ public: /// an analysis interface through multiple inheritance. If needed, it should /// override this to adjust the this pointer as needed for the specified pass /// info. - virtual void *getAdjustedAnalysisPointer(const PassInfo *PI) { - if (PI->isPassID(&CallGraph::ID)) + virtual void *getAdjustedAnalysisPointer(AnalysisID PI) { + if (PI == &CallGraph::ID) return (CallGraph*)this; return this; } @@ -145,8 +145,8 @@ private: for (Function::iterator BB = F->begin(), BBE = F->end(); BB != BBE; ++BB) for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE; ++II) { - CallSite CS = CallSite::get(II); - if (CS.getInstruction() && !isa<DbgInfoIntrinsic>(II)) { + CallSite CS(cast<Value>(II)); + if (CS && !isa<DbgInfoIntrinsic>(II)) { const Function *Callee = CS.getCalledFunction(); if (Callee) Node->addCalledFunction(CS, getOrInsertFunction(Callee)); @@ -172,9 +172,8 @@ private: } //End anonymous namespace static RegisterAnalysisGroup<CallGraph> X("Call Graph"); -static RegisterPass<BasicCallGraph> -Y("basiccg", "Basic CallGraph Construction", false, true); -static RegisterAnalysisGroup<CallGraph, true> Z(Y); +INITIALIZE_AG_PASS(BasicCallGraph, CallGraph, "basiccg", + "Basic CallGraph Construction", false, true, true); char CallGraph::ID = 0; char BasicCallGraph::ID = 0; diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp index 0c01ee5b82848..b7a27cb288d92 100644 --- a/lib/Analysis/IPA/CallGraphSCCPass.cpp +++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp @@ -45,7 +45,7 @@ class CGPassManager : public ModulePass, public PMDataManager { public: static char ID; explicit CGPassManager(int Depth) - : ModulePass(&ID), PMDataManager(Depth) { } + : ModulePass(ID), PMDataManager(Depth) { } /// run - Execute all of the passes scheduled for execution. Keep track of /// whether any of the passes modifies the module, and if so, return true. @@ -209,7 +209,7 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC, // If the call edge is not from a call or invoke, then the function // pass RAUW'd a call with another value. This can happen when // constant folding happens of well known functions etc. 
- CallSite::get(I->first).getInstruction() == 0) { + !CallSite(I->first)) { assert(!CheckingMode && "CallGraphSCCPass did not update the CallGraph correctly!"); @@ -245,8 +245,8 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC, for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - CallSite CS = CallSite::get(I); - if (!CS.getInstruction() || isa<DbgInfoIntrinsic>(I)) continue; + CallSite CS(cast<Value>(I)); + if (!CS || isa<DbgInfoIntrinsic>(I)) continue; // If this call site already existed in the callgraph, just verify it // matches up to expectations and remove it from CallSites. @@ -582,9 +582,9 @@ namespace { public: static char ID; - PrintCallGraphPass() : CallGraphSCCPass(&ID), Out(dbgs()) {} + PrintCallGraphPass() : CallGraphSCCPass(ID), Out(dbgs()) {} PrintCallGraphPass(const std::string &B, raw_ostream &o) - : CallGraphSCCPass(&ID), Banner(B), Out(o) {} + : CallGraphSCCPass(ID), Banner(B), Out(o) {} virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); diff --git a/lib/Analysis/IPA/FindUsedTypes.cpp b/lib/Analysis/IPA/FindUsedTypes.cpp index c4fb0b9a4e3dd..8eed9d6f68bc5 100644 --- a/lib/Analysis/IPA/FindUsedTypes.cpp +++ b/lib/Analysis/IPA/FindUsedTypes.cpp @@ -23,8 +23,8 @@ using namespace llvm; char FindUsedTypes::ID = 0; -static RegisterPass<FindUsedTypes> -X("print-used-types", "Find Used Types", false, true); +INITIALIZE_PASS(FindUsedTypes, "print-used-types", + "Find Used Types", false, true); // IncorporateType - Incorporate one type and all of its subtypes into the // collection of used types. diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp index f13deea41d4ed..6759b0afdce39 100644 --- a/lib/Analysis/IPA/GlobalsModRef.cpp +++ b/lib/Analysis/IPA/GlobalsModRef.cpp @@ -47,14 +47,15 @@ namespace { /// GlobalInfo - Maintain mod/ref info for all of the globals without /// addresses taken that are read or written (transitively) by this /// function. - std::map<GlobalValue*, unsigned> GlobalInfo; + std::map<const GlobalValue*, unsigned> GlobalInfo; /// MayReadAnyGlobal - May read global variables, but it is not known which. bool MayReadAnyGlobal; - unsigned getInfoForGlobal(GlobalValue *GV) const { + unsigned getInfoForGlobal(const GlobalValue *GV) const { unsigned Effect = MayReadAnyGlobal ? AliasAnalysis::Ref : 0; - std::map<GlobalValue*, unsigned>::const_iterator I = GlobalInfo.find(GV); + std::map<const GlobalValue*, unsigned>::const_iterator I = + GlobalInfo.find(GV); if (I != GlobalInfo.end()) Effect |= I->second; return Effect; @@ -71,23 +72,23 @@ namespace { class GlobalsModRef : public ModulePass, public AliasAnalysis { /// NonAddressTakenGlobals - The globals that do not have their addresses /// taken. - std::set<GlobalValue*> NonAddressTakenGlobals; + std::set<const GlobalValue*> NonAddressTakenGlobals; /// IndirectGlobals - The memory pointed to by this global is known to be /// 'owned' by the global. - std::set<GlobalValue*> IndirectGlobals; + std::set<const GlobalValue*> IndirectGlobals; /// AllocsForIndirectGlobals - If an instruction allocates memory for an /// indirect global, this map indicates which one. - std::map<Value*, GlobalValue*> AllocsForIndirectGlobals; + std::map<const Value*, const GlobalValue*> AllocsForIndirectGlobals; /// FunctionInfo - For each function, keep track of what globals are /// modified or read. 
- std::map<Function*, FunctionRecord> FunctionInfo; + std::map<const Function*, FunctionRecord> FunctionInfo; public: static char ID; - GlobalsModRef() : ModulePass(&ID) {} + GlobalsModRef() : ModulePass(ID) {} bool runOnModule(Module &M) { InitializeAliasAnalysis(this); // set up super class @@ -107,39 +108,39 @@ namespace { // AliasResult alias(const Value *V1, unsigned V1Size, const Value *V2, unsigned V2Size); - ModRefResult getModRefInfo(CallSite CS, Value *P, unsigned Size); - ModRefResult getModRefInfo(CallSite CS1, CallSite CS2) { - return AliasAnalysis::getModRefInfo(CS1,CS2); + ModRefResult getModRefInfo(ImmutableCallSite CS, + const Value *P, unsigned Size); + ModRefResult getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { + return AliasAnalysis::getModRefInfo(CS1, CS2); } /// getModRefBehavior - Return the behavior of the specified function if /// called from the specified call site. The call site may be null in which /// case the most generic behavior of this function should be returned. - ModRefBehavior getModRefBehavior(Function *F, - std::vector<PointerAccessInfo> *Info) { + ModRefBehavior getModRefBehavior(const Function *F) { if (FunctionRecord *FR = getFunctionInfo(F)) { if (FR->FunctionEffect == 0) return DoesNotAccessMemory; else if ((FR->FunctionEffect & Mod) == 0) return OnlyReadsMemory; } - return AliasAnalysis::getModRefBehavior(F, Info); + return AliasAnalysis::getModRefBehavior(F); } /// getModRefBehavior - Return the behavior of the specified function if /// called from the specified call site. The call site may be null in which /// case the most generic behavior of this function should be returned. - ModRefBehavior getModRefBehavior(CallSite CS, - std::vector<PointerAccessInfo> *Info) { - Function* F = CS.getCalledFunction(); - if (!F) return AliasAnalysis::getModRefBehavior(CS, Info); + ModRefBehavior getModRefBehavior(ImmutableCallSite CS) { + const Function* F = CS.getCalledFunction(); + if (!F) return AliasAnalysis::getModRefBehavior(CS); if (FunctionRecord *FR = getFunctionInfo(F)) { if (FR->FunctionEffect == 0) return DoesNotAccessMemory; else if ((FR->FunctionEffect & Mod) == 0) return OnlyReadsMemory; } - return AliasAnalysis::getModRefBehavior(CS, Info); + return AliasAnalysis::getModRefBehavior(CS); } virtual void deleteValue(Value *V); @@ -149,8 +150,8 @@ namespace { /// an analysis interface through multiple inheritance. If needed, it /// should override this to adjust the this pointer as needed for the /// specified pass info. - virtual void *getAdjustedAnalysisPointer(const PassInfo *PI) { - if (PI->isPassID(&AliasAnalysis::ID)) + virtual void *getAdjustedAnalysisPointer(AnalysisID PI) { + if (PI == &AliasAnalysis::ID) return (AliasAnalysis*)this; return this; } @@ -158,8 +159,9 @@ namespace { private: /// getFunctionInfo - Return the function info for the function, or null if /// we don't have anything useful to say about it. 
- FunctionRecord *getFunctionInfo(Function *F) { - std::map<Function*, FunctionRecord>::iterator I = FunctionInfo.find(F); + FunctionRecord *getFunctionInfo(const Function *F) { + std::map<const Function*, FunctionRecord>::iterator I = + FunctionInfo.find(F); if (I != FunctionInfo.end()) return &I->second; return 0; @@ -175,9 +177,9 @@ namespace { } char GlobalsModRef::ID = 0; -static RegisterPass<GlobalsModRef> -X("globalsmodref-aa", "Simple mod/ref analysis for globals", false, true); -static RegisterAnalysisGroup<AliasAnalysis> Y(X); +INITIALIZE_AG_PASS(GlobalsModRef, AliasAnalysis, + "globalsmodref-aa", "Simple mod/ref analysis for globals", + false, true, false); Pass *llvm::createGlobalsModRefPass() { return new GlobalsModRef(); } @@ -409,7 +411,7 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) { FunctionEffect |= CalleeFR->FunctionEffect; // Incorporate callee's effects on globals into our info. - for (std::map<GlobalValue*, unsigned>::iterator GI = + for (std::map<const GlobalValue*, unsigned>::iterator GI = CalleeFR->GlobalInfo.begin(), E = CalleeFR->GlobalInfo.end(); GI != E; ++GI) FR.GlobalInfo[GI->first] |= GI->second; @@ -477,13 +479,13 @@ AliasAnalysis::AliasResult GlobalsModRef::alias(const Value *V1, unsigned V1Size, const Value *V2, unsigned V2Size) { // Get the base object these pointers point to. - Value *UV1 = const_cast<Value*>(V1->getUnderlyingObject()); - Value *UV2 = const_cast<Value*>(V2->getUnderlyingObject()); + const Value *UV1 = V1->getUnderlyingObject(); + const Value *UV2 = V2->getUnderlyingObject(); // If either of the underlying values is a global, they may be non-addr-taken // globals, which we can answer queries about. - GlobalValue *GV1 = dyn_cast<GlobalValue>(UV1); - GlobalValue *GV2 = dyn_cast<GlobalValue>(UV2); + const GlobalValue *GV1 = dyn_cast<GlobalValue>(UV1); + const GlobalValue *GV2 = dyn_cast<GlobalValue>(UV2); if (GV1 || GV2) { // If the global's address is taken, pretend we don't know it's a pointer to // the global. @@ -503,12 +505,12 @@ GlobalsModRef::alias(const Value *V1, unsigned V1Size, // so, we may be able to handle this. First check to see if the base pointer // is a direct load from an indirect global. GV1 = GV2 = 0; - if (LoadInst *LI = dyn_cast<LoadInst>(UV1)) + if (const LoadInst *LI = dyn_cast<LoadInst>(UV1)) if (GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0))) if (IndirectGlobals.count(GV)) GV1 = GV; - if (LoadInst *LI = dyn_cast<LoadInst>(UV2)) - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0))) + if (const LoadInst *LI = dyn_cast<LoadInst>(UV2)) + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0))) if (IndirectGlobals.count(GV)) GV2 = GV; @@ -530,16 +532,17 @@ GlobalsModRef::alias(const Value *V1, unsigned V1Size, } AliasAnalysis::ModRefResult -GlobalsModRef::getModRefInfo(CallSite CS, Value *P, unsigned Size) { +GlobalsModRef::getModRefInfo(ImmutableCallSite CS, + const Value *P, unsigned Size) { unsigned Known = ModRef; // If we are asking for mod/ref info of a direct call with a pointer to a // global we are tracking, return information if we have it. 
- if (GlobalValue *GV = dyn_cast<GlobalValue>(P->getUnderlyingObject())) + if (const GlobalValue *GV = dyn_cast<GlobalValue>(P->getUnderlyingObject())) if (GV->hasLocalLinkage()) - if (Function *F = CS.getCalledFunction()) + if (const Function *F = CS.getCalledFunction()) if (NonAddressTakenGlobals.count(GV)) - if (FunctionRecord *FR = getFunctionInfo(F)) + if (const FunctionRecord *FR = getFunctionInfo(F)) Known = FR->getInfoForGlobal(GV); if (Known == NoModRef) @@ -558,7 +561,7 @@ void GlobalsModRef::deleteValue(Value *V) { // any AllocRelatedValues for it. if (IndirectGlobals.erase(GV)) { // Remove any entries in AllocsForIndirectGlobals for this global. - for (std::map<Value*, GlobalValue*>::iterator + for (std::map<const Value*, const GlobalValue*>::iterator I = AllocsForIndirectGlobals.begin(), E = AllocsForIndirectGlobals.end(); I != E; ) { if (I->second == GV) { diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp index 2c997dae5859c..cdf667ad6eed9 100644 --- a/lib/Analysis/IVUsers.cpp +++ b/lib/Analysis/IVUsers.cpp @@ -21,7 +21,6 @@ #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Assembly/AsmAnnotationWriter.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -29,8 +28,7 @@ using namespace llvm; char IVUsers::ID = 0; -static RegisterPass<IVUsers> -X("iv-users", "Induction Variable Users", false, true); +INITIALIZE_PASS(IVUsers, "iv-users", "Induction Variable Users", false, true); Pass *llvm::createIVUsersPass() { return new IVUsers(); @@ -39,27 +37,31 @@ Pass *llvm::createIVUsersPass() { /// isInteresting - Test whether the given expression is "interesting" when /// used by the given expression, within the context of analyzing the /// given loop. -static bool isInteresting(const SCEV *S, const Instruction *I, const Loop *L) { - // Anything loop-invariant is interesting. - if (!isa<SCEVUnknown>(S) && S->isLoopInvariant(L)) - return true; - +static bool isInteresting(const SCEV *S, const Instruction *I, const Loop *L, + ScalarEvolution *SE) { // An addrec is interesting if it's affine or if it has an interesting start. if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Keep things simple. Don't touch loop-variant strides. if (AR->getLoop() == L) return AR->isAffine() || !L->contains(I); - // Otherwise recurse to see if the start value is interesting. - return isInteresting(AR->getStart(), I, L); + // Otherwise recurse to see if the start value is interesting, and that + // the step value is not interesting, since we don't yet know how to + // do effective SCEV expansions for addrecs with interesting steps. + return isInteresting(AR->getStart(), I, L, SE) && + !isInteresting(AR->getStepRecurrence(*SE), I, L, SE); } - // An add is interesting if any of its operands is. + // An add is interesting if exactly one of its operands is interesting. if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + bool AnyInterestingYet = false; for (SCEVAddExpr::op_iterator OI = Add->op_begin(), OE = Add->op_end(); OI != OE; ++OI) - if (isInteresting(*OI, I, L)) - return true; - return false; + if (isInteresting(*OI, I, L, SE)) { + if (AnyInterestingYet) + return false; + AnyInterestingYet = true; + } + return AnyInterestingYet; } // Nothing else is interesting here. 
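The isInteresting rewrite above tightens the add rule: where any interesting operand used to qualify the whole add, now exactly one may be, presumably because expanding an add of two separate induction expressions has no good lowering (the addrec case states this outright for interesting steps). Below is a toy model of just that rule; the plain Expr struct stands in for SCEV and nothing here is the LLVM API.

#include <cstdio>
#include <vector>

// A leaf is "interesting" or not (think: an affine addrec in the current
// loop); a node with operands models a SCEVAddExpr.
struct Expr {
  bool Interesting;
  std::vector<const Expr *> AddOps; // non-empty => this node is an add
};

static bool isInteresting(const Expr *S) {
  if (S->AddOps.empty())
    return S->Interesting;
  // An add is interesting iff exactly one operand is: with two or more,
  // expansion would have to materialize several induction expressions.
  bool AnyInterestingYet = false;
  for (const Expr *Op : S->AddOps)
    if (isInteresting(Op)) {
      if (AnyInterestingYet)
        return false;
      AnyInterestingYet = true;
    }
  return AnyInterestingYet;
}

int main() {
  Expr IV1 = {true, {}};             // interesting leaf
  Expr IV2 = {true, {}};             // another interesting leaf
  Expr Inv = {false, {}};            // uninteresting (e.g. loop-invariant)
  Expr One = {false, {&IV1, &Inv}};  // exactly one interesting -> kept
  Expr Two = {false, {&IV1, &IV2}};  // two interesting -> now rejected
  std::printf("%d %d\n", isInteresting(&One) ? 1 : 0,
              isInteresting(&Two) ? 1 : 0);  // prints "1 0"
}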
@@ -85,7 +87,7 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) { // If we've come to an uninteresting expression, stop the traversal and // call this a user. - if (!isInteresting(ISE, I, L)) + if (!isInteresting(ISE, I, L, SE)) return false; SmallPtrSet<Instruction *, 4> UniqueUsers; @@ -141,7 +143,7 @@ IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand) { } IVUsers::IVUsers() - : LoopPass(&ID) { + : LoopPass(ID) { } void IVUsers::getAnalysisUsage(AnalysisUsage &AU) const { @@ -176,9 +178,6 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const { } OS << ":\n"; - // Use a default AssemblyAnnotationWriter to suppress the default info - // comments, which aren't relevant here. - AssemblyAnnotationWriter Annotator; for (ilist<IVStrideUse>::const_iterator UI = IVUses.begin(), E = IVUses.end(); UI != E; ++UI) { OS << " "; @@ -192,7 +191,7 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const { OS << ")"; } OS << " in "; - UI->getUser()->print(OS, &Annotator); + UI->getUser()->print(OS); OS << '\n'; } } diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index b1df517c2a94f..3e550f35c2553 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -152,14 +152,14 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB) { if (isa<CallInst>(II) || isa<InvokeInst>(II)) { if (isa<DbgInfoIntrinsic>(II)) continue; // Debug intrinsics don't count as size. - - CallSite CS = CallSite::get(const_cast<Instruction*>(&*II)); - + + ImmutableCallSite CS(cast<Instruction>(II)); + // If this function contains a call to setjmp or _setjmp, never inline // it. This is a hack because we depend on the user marking their local // variables as volatile if they are live across a setjmp call, and they // probably won't do this in callers. - if (Function *F = CS.getCalledFunction()) { + if (const Function *F = CS.getCalledFunction()) { if (F->isDeclaration() && (F->getName() == "setjmp" || F->getName() == "_setjmp")) callsSetJmp = true; diff --git a/lib/Analysis/InstCount.cpp b/lib/Analysis/InstCount.cpp index bb2cf53c85efd..dcbcac005a2fc 100644 --- a/lib/Analysis/InstCount.cpp +++ b/lib/Analysis/InstCount.cpp @@ -51,7 +51,7 @@ namespace { } public: static char ID; // Pass identification, replacement for typeid - InstCount() : FunctionPass(&ID) {} + InstCount() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F); @@ -64,8 +64,8 @@ namespace { } char InstCount::ID = 0; -static RegisterPass<InstCount> -X("instcount", "Counts the various types of Instructions", false, true); +INITIALIZE_PASS(InstCount, "instcount", + "Counts the various types of Instructions", false, true); FunctionPass *llvm::createInstCountPass() { return new InstCount(); } diff --git a/lib/Analysis/IntervalPartition.cpp b/lib/Analysis/IntervalPartition.cpp index 1f17b77a5b96f..1c9e14884316b 100644 --- a/lib/Analysis/IntervalPartition.cpp +++ b/lib/Analysis/IntervalPartition.cpp @@ -16,8 +16,8 @@ using namespace llvm; char IntervalPartition::ID = 0; -static RegisterPass<IntervalPartition> -X("intervals", "Interval Partition Construction", true, true); +INITIALIZE_PASS(IntervalPartition, "intervals", + "Interval Partition Construction", true, true); //===----------------------------------------------------------------------===// // IntervalPartition Implementation @@ -91,7 +91,7 @@ bool IntervalPartition::runOnFunction(Function &F) { // distinguish it from a copy constructor. Always pass in false for now. 
// IntervalPartition::IntervalPartition(IntervalPartition &IP, bool) - : FunctionPass(&ID) { + : FunctionPass(ID) { assert(IP.getRootInterval() && "Cannot operate on empty IntervalPartitions!"); // Pass false to intervals_begin because we take ownership of its memory diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index ff9026bede97e..e32dbc4447135 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -19,16 +19,18 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Target/TargetData.h" #include "llvm/Support/CFG.h" +#include "llvm/Support/ConstantRange.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" using namespace llvm; char LazyValueInfo::ID = 0; -static RegisterPass<LazyValueInfo> -X("lazy-value-info", "Lazy Value Information Analysis", false, true); +INITIALIZE_PASS(LazyValueInfo, "lazy-value-info", + "Lazy Value Information Analysis", false, true); namespace llvm { FunctionPass *createLazyValueInfoPass() { return new LazyValueInfo(); } @@ -50,12 +52,15 @@ class LVILatticeVal { enum LatticeValueTy { /// undefined - This LLVM Value has no known value yet. undefined, + /// constant - This LLVM Value has a specific constant value. constant, - /// notconstant - This LLVM value is known to not have the specified value. notconstant, + /// constantrange - This LLVM Value is known to fall within the specified range of values. + constantrange, + /// overdefined - This instruction is not known to be constant, and we know /// it has a value. overdefined }; /// Val: This stores the current lattice value along with the Constant* for /// the constant if this is a 'constant' or 'notconstant' value.
- PointerIntPair<Constant *, 2, LatticeValueTy> Val; + LatticeValueTy Tag; + Constant *Val; + ConstantRange Range; public: - LVILatticeVal() : Val(0, undefined) {} + LVILatticeVal() : Tag(undefined), Val(0), Range(1, true) {} static LVILatticeVal get(Constant *C) { LVILatticeVal Res; - Res.markConstant(C); + if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) + Res.markConstantRange(ConstantRange(CI->getValue(), CI->getValue()+1)); + else if (!isa<UndefValue>(C)) + Res.markConstant(C); return Res; } static LVILatticeVal getNot(Constant *C) { LVILatticeVal Res; - Res.markNotConstant(C); + if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) + Res.markConstantRange(ConstantRange(CI->getValue()+1, CI->getValue())); + else + Res.markNotConstant(C); + return Res; + } + static LVILatticeVal getRange(ConstantRange CR) { + LVILatticeVal Res; + Res.markConstantRange(CR); return Res; } - bool isUndefined() const { return Val.getInt() == undefined; } - bool isConstant() const { return Val.getInt() == constant; } - bool isNotConstant() const { return Val.getInt() == notconstant; } - bool isOverdefined() const { return Val.getInt() == overdefined; } + bool isUndefined() const { return Tag == undefined; } + bool isConstant() const { return Tag == constant; } + bool isNotConstant() const { return Tag == notconstant; } + bool isConstantRange() const { return Tag == constantrange; } + bool isOverdefined() const { return Tag == overdefined; } Constant *getConstant() const { assert(isConstant() && "Cannot get the constant of a non-constant!"); - return Val.getPointer(); + return Val; } Constant *getNotConstant() const { assert(isNotConstant() && "Cannot get the constant of a non-notconstant!"); - return Val.getPointer(); + return Val; + } + + ConstantRange getConstantRange() const { + assert(isConstantRange() && + "Cannot get the constant-range of a non-constant-range!"); + return Range; } /// markOverdefined - Return true if this is a change in status. bool markOverdefined() { if (isOverdefined()) return false; - Val.setInt(overdefined); + Tag = overdefined; return true; } @@ -110,9 +135,9 @@ public: } assert(isUndefined()); - Val.setInt(constant); + Tag = constant; assert(V && "Marking constant with NULL"); - Val.setPointer(V); + Val = V; return true; } @@ -128,9 +153,29 @@ public: else assert(isUndefined()); - Val.setInt(notconstant); + Tag = notconstant; assert(V && "Marking constant with NULL"); - Val.setPointer(V); + Val = V; + return true; + } + + /// markConstantRange - Return true if this is a change in status. 
+ bool markConstantRange(const ConstantRange NewR) { + if (isConstantRange()) { + if (NewR.isEmptySet()) + return markOverdefined(); + + bool changed = Range != NewR; + Range = NewR; + return changed; + } + + assert(isUndefined()); + if (NewR.isEmptySet()) + return markOverdefined(); + + Tag = constantrange; + Range = NewR; return true; } @@ -147,20 +192,39 @@ public: isa<ConstantExpr>(RHS.getNotConstant())) return markOverdefined(); return false; - } - if (isConstant()) { + } else if (isConstant()) { if (getConstant() == RHS.getNotConstant() || isa<ConstantExpr>(RHS.getNotConstant()) || isa<ConstantExpr>(getConstant())) return markOverdefined(); return markNotConstant(RHS.getNotConstant()); + } else if (isConstantRange()) { + return markOverdefined(); } assert(isUndefined() && "Unexpected lattice"); return markNotConstant(RHS.getNotConstant()); } + if (RHS.isConstantRange()) { + if (isConstantRange()) { + ConstantRange NewR = Range.unionWith(RHS.getConstantRange()); + if (NewR.isFullSet()) + return markOverdefined(); + else + return markConstantRange(NewR); + } else if (!isUndefined()) { + return markOverdefined(); + } + + assert(isUndefined() && "Unexpected lattice"); + return markConstantRange(RHS.getConstantRange()); + } + // RHS must be a constant, we must be undef, constant, or notconstant. + assert(!isConstantRange() && + "Constant and ConstantRange cannot be merged."); + if (isUndefined()) return markConstant(RHS.getConstant()); @@ -191,6 +255,9 @@ raw_ostream &operator<<(raw_ostream &OS, const LVILatticeVal &Val) { if (Val.isNotConstant()) return OS << "notconstant<" << *Val.getNotConstant() << '>'; + else if (Val.isConstantRange()) + return OS << "constantrange<" << Val.getConstantRange().getLower() << ", " + << Val.getConstantRange().getUpper() << '>'; return OS << "constant<" << *Val.getConstant() << '>'; } } @@ -206,17 +273,41 @@ namespace { public: /// BlockCacheEntryTy - This is a computed lattice value at the end of the /// specified basic block for a Value* that depends on context. - typedef std::pair<BasicBlock*, LVILatticeVal> BlockCacheEntryTy; + typedef std::pair<AssertingVH<BasicBlock>, LVILatticeVal> BlockCacheEntryTy; /// ValueCacheEntryTy - This is all of the cached block information for /// exactly one Value*. The entries are sorted by the BasicBlock* of the /// entries, allowing us to do a lookup with a binary search. - typedef std::vector<BlockCacheEntryTy> ValueCacheEntryTy; + typedef std::map<AssertingVH<BasicBlock>, LVILatticeVal> ValueCacheEntryTy; private: + /// LVIValueHandle - A callback value handle that updates the cache when + /// values are erased. + struct LVIValueHandle : public CallbackVH { + LazyValueInfoCache *Parent; + + LVIValueHandle(Value *V, LazyValueInfoCache *P) + : CallbackVH(V), Parent(P) { } + + void deleted(); + void allUsesReplacedWith(Value* V) { + deleted(); + } + + LVIValueHandle &operator=(Value *V) { + return *this = LVIValueHandle(V, Parent); + } + }; + /// ValueCache - This is all of the cached information for all values, /// mapped from Value* to key information. - DenseMap<Value*, ValueCacheEntryTy> ValueCache; + std::map<LVIValueHandle, ValueCacheEntryTy> ValueCache; + + /// OverDefinedCache - This tracks, on a per-block basis, the set of + /// values that are over-defined at the end of that block. This is required + /// for cache updating.
+ std::set<std::pair<AssertingVH<BasicBlock>, Value*> > OverDefinedCache; + public: /// getValueInBlock - This is the query interface to determine the lattice @@ -226,29 +317,23 @@ namespace { /// getValueOnEdge - This is the query interface to determine the lattice /// value for the specified Value* that is true on the specified edge. LVILatticeVal getValueOnEdge(Value *V, BasicBlock *FromBB,BasicBlock *ToBB); - }; -} // end anonymous namespace - -namespace { - struct BlockCacheEntryComparator { - static int Compare(const void *LHSv, const void *RHSv) { - const LazyValueInfoCache::BlockCacheEntryTy *LHS = - static_cast<const LazyValueInfoCache::BlockCacheEntryTy *>(LHSv); - const LazyValueInfoCache::BlockCacheEntryTy *RHS = - static_cast<const LazyValueInfoCache::BlockCacheEntryTy *>(RHSv); - if (LHS->first < RHS->first) - return -1; - if (LHS->first > RHS->first) - return 1; - return 0; - } - bool operator()(const LazyValueInfoCache::BlockCacheEntryTy &LHS, - const LazyValueInfoCache::BlockCacheEntryTy &RHS) const { - return LHS.first < RHS.first; + /// threadEdge - This is the update interface to inform the cache that an + /// edge from PredBB to OldSucc has been threaded to be from PredBB to + /// NewSucc. + void threadEdge(BasicBlock *PredBB,BasicBlock *OldSucc,BasicBlock *NewSucc); + + /// eraseBlock - This is part of the update interface to inform the cache + /// that a block has been deleted. + void eraseBlock(BasicBlock *BB); + + /// clear - Empty the cache. + void clear() { + ValueCache.clear(); + OverDefinedCache.clear(); } }; -} +} // end anonymous namespace //===----------------------------------------------------------------------===// // LVIQuery Impl @@ -267,78 +352,87 @@ namespace { /// This is the current value being queried for. Value *Val; + /// This is a pointer to the owning cache, for recursive queries. + LazyValueInfoCache &Parent; + /// This is all of the cached information about this value. ValueCacheEntryTy &Cache; + /// This tracks, for each block, what values are overdefined. + std::set<std::pair<AssertingVH<BasicBlock>, Value*> > &OverDefinedCache; + /// NewBlocks - This is a mapping of the new BasicBlocks which have been /// added to cache but that are not in sorted order. - DenseMap<BasicBlock*, LVILatticeVal> NewBlockInfo; + DenseSet<BasicBlock*> NewBlockInfo; + public: - LVIQuery(Value *V, ValueCacheEntryTy &VC) : Val(V), Cache(VC) { + LVIQuery(Value *V, LazyValueInfoCache &P, + ValueCacheEntryTy &VC, + std::set<std::pair<AssertingVH<BasicBlock>, Value*> > &ODC) + : Val(V), Parent(P), Cache(VC), OverDefinedCache(ODC) { } ~LVIQuery() { // When the query is done, insert the newly discovered facts into the // cache in sorted order. if (NewBlockInfo.empty()) return; - - // Grow the cache to exactly fit the new data. - Cache.reserve(Cache.size() + NewBlockInfo.size()); - // If we only have one new entry, insert it instead of doing a full-on - // sort. - if (NewBlockInfo.size() == 1) { - BlockCacheEntryTy Entry = *NewBlockInfo.begin(); - ValueCacheEntryTy::iterator I = - std::lower_bound(Cache.begin(), Cache.end(), Entry, - BlockCacheEntryComparator()); - assert((I == Cache.end() || I->first != Entry.first) && - "Entry already in map!"); - - Cache.insert(I, Entry); - return; + for (DenseSet<BasicBlock*>::iterator I = NewBlockInfo.begin(), + E = NewBlockInfo.end(); I != E; ++I) { + if (Cache[*I].isOverdefined()) + OverDefinedCache.insert(std::make_pair(*I, Val)); } - - // TODO: If we only have two new elements, INSERT them both. 
- - Cache.insert(Cache.end(), NewBlockInfo.begin(), NewBlockInfo.end()); - array_pod_sort(Cache.begin(), Cache.end(), - BlockCacheEntryComparator::Compare); - } LVILatticeVal getBlockValue(BasicBlock *BB); LVILatticeVal getEdgeValue(BasicBlock *FromBB, BasicBlock *ToBB); private: - LVILatticeVal &getCachedEntryForBlock(BasicBlock *BB); + LVILatticeVal getCachedEntryForBlock(BasicBlock *BB); }; } // end anonymous namespace -/// getCachedEntryForBlock - See if we already have a value for this block. If -/// so, return it, otherwise create a new entry in the NewBlockInfo map to use. -LVILatticeVal &LVIQuery::getCachedEntryForBlock(BasicBlock *BB) { - - // Do a binary search to see if we already have an entry for this block in - // the cache set. If so, find it. - if (!Cache.empty()) { - ValueCacheEntryTy::iterator Entry = - std::lower_bound(Cache.begin(), Cache.end(), - BlockCacheEntryTy(BB, LVILatticeVal()), - BlockCacheEntryComparator()); - if (Entry != Cache.end() && Entry->first == BB) - return Entry->second; +void LazyValueInfoCache::LVIValueHandle::deleted() { + for (std::set<std::pair<AssertingVH<BasicBlock>, Value*> >::iterator + I = Parent->OverDefinedCache.begin(), + E = Parent->OverDefinedCache.end(); + I != E; ) { + std::set<std::pair<AssertingVH<BasicBlock>, Value*> >::iterator tmp = I; + ++I; + if (tmp->second == getValPtr()) + Parent->OverDefinedCache.erase(tmp); } - // Otherwise, check to see if it's in NewBlockInfo or create a new entry if - // not. - return NewBlockInfo[BB]; + // This erasure deallocates *this, so it MUST happen after we're done + // using any and all members of *this. + Parent->ValueCache.erase(*this); +} + +void LazyValueInfoCache::eraseBlock(BasicBlock *BB) { + for (std::set<std::pair<AssertingVH<BasicBlock>, Value*> >::iterator + I = OverDefinedCache.begin(), E = OverDefinedCache.end(); I != E; ) { + std::set<std::pair<AssertingVH<BasicBlock>, Value*> >::iterator tmp = I; + ++I; + if (tmp->first == BB) + OverDefinedCache.erase(tmp); + } + + for (std::map<LVIValueHandle, ValueCacheEntryTy>::iterator + I = ValueCache.begin(), E = ValueCache.end(); I != E; ++I) + I->second.erase(BB); +} + +/// getCachedEntryForBlock - See if we already have a value for this block. If +/// so, return it, otherwise create a new entry in the Cache map to use. +LVILatticeVal LVIQuery::getCachedEntryForBlock(BasicBlock *BB) { + NewBlockInfo.insert(BB); + return Cache[BB]; } LVILatticeVal LVIQuery::getBlockValue(BasicBlock *BB) { // See if we already have a value for this block. - LVILatticeVal &BBLV = getCachedEntryForBlock(BB); + LVILatticeVal BBLV = getCachedEntryForBlock(BB); // If we've already computed this block's value, return it. if (!BBLV.isUndefined()) { @@ -350,13 +444,28 @@ LVILatticeVal LVIQuery::getBlockValue(BasicBlock *BB) { // lattice value to overdefined, so that cycles will terminate and be // conservatively correct. BBLV.markOverdefined(); + Cache[BB] = BBLV; - // If V is live into BB, see if our predecessors know anything about it. Instruction *BBI = dyn_cast<Instruction>(Val); if (BBI == 0 || BBI->getParent() != BB) { LVILatticeVal Result; // Start Undefined. - unsigned NumPreds = 0; + // If this is a pointer, and there's a load from that pointer in this BB, + // then we know that the pointer can't be NULL. 
+ bool NotNull = false; + if (Val->getType()->isPointerTy()) { + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();BI != BE;++BI){ + LoadInst *L = dyn_cast<LoadInst>(BI); + if (L && L->getPointerAddressSpace() == 0 && + L->getPointerOperand()->getUnderlyingObject() == + Val->getUnderlyingObject()) { + NotNull = true; + break; + } + } + } + + unsigned NumPreds = 0; // Loop over all of our predecessors, merging what we know from them into // result. for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { @@ -367,11 +476,19 @@ LVILatticeVal LVIQuery::getBlockValue(BasicBlock *BB) { if (Result.isOverdefined()) { DEBUG(dbgs() << " compute BB '" << BB->getName() << "' - overdefined because of pred.\n"); + // If we previously determined that this is a pointer that can't be null + // then return that rather than giving up entirely. + if (NotNull) { + const PointerType *PTy = cast<PointerType>(Val->getType()); + Result = LVILatticeVal::getNot(ConstantPointerNull::get(PTy)); + } + return Result; } ++NumPreds; } + // If this is the entry block, we must be asking about an argument. The // value is overdefined. if (NumPreds == 0 && BB == &BB->getParent()->front()) { @@ -382,24 +499,123 @@ LVILatticeVal LVIQuery::getBlockValue(BasicBlock *BB) { // Return the merged value, which is more precise than 'overdefined'. assert(!Result.isOverdefined()); - return getCachedEntryForBlock(BB) = Result; + return Cache[BB] = Result; } // If this value is defined by an instruction in this block, we have to // process it here somehow or return overdefined. if (PHINode *PN = dyn_cast<PHINode>(BBI)) { - (void)PN; - // TODO: PHI Translation in preds. - } else { + LVILatticeVal Result; // Start Undefined. + // Loop over all of our predecessors, merging what we know from them into + // result. + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + Value* PhiVal = PN->getIncomingValueForBlock(*PI); + Result.mergeIn(Parent.getValueOnEdge(PhiVal, *PI, BB)); + + // If we hit overdefined, exit early. The BlockVals entry is already set + // to overdefined. + if (Result.isOverdefined()) { + DEBUG(dbgs() << " compute BB '" << BB->getName() + << "' - overdefined because of pred.\n"); + return Result; + } + } + + // Return the merged value, which is more precise than 'overdefined'. + assert(!Result.isOverdefined()); + return Cache[BB] = Result; } - - DEBUG(dbgs() << " compute BB '" << BB->getName() - << "' - overdefined because inst def found.\n"); + assert(Cache[BB].isOverdefined() && "Recursive query changed our cache?"); + + // We can only analyze the definitions of certain classes of instructions + // (integral binops and casts at the moment), so bail if this isn't one. LVILatticeVal Result; - Result.markOverdefined(); - return getCachedEntryForBlock(BB) = Result; + if ((!isa<BinaryOperator>(BBI) && !isa<CastInst>(BBI)) || + !BBI->getType()->isIntegerTy()) { + DEBUG(dbgs() << " compute BB '" << BB->getName() + << "' - overdefined because inst def found.\n"); + Result.markOverdefined(); + return Result; + } + + // FIXME: We're currently limited to binops with a constant RHS. This should + // be improved. + BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI); + if (BO && !isa<ConstantInt>(BO->getOperand(1))) { + DEBUG(dbgs() << " compute BB '" << BB->getName() + << "' - overdefined because inst def found.\n"); + + Result.markOverdefined(); + return Result; + } + + // Figure out the range of the LHS. If that fails, bail. 
+ LVILatticeVal LHSVal = Parent.getValueInBlock(BBI->getOperand(0), BB); + if (!LHSVal.isConstantRange()) { + Result.markOverdefined(); + return Result; + } + + ConstantInt *RHS = 0; + ConstantRange LHSRange = LHSVal.getConstantRange(); + ConstantRange RHSRange(1); + const IntegerType *ResultTy = cast<IntegerType>(BBI->getType()); + if (isa<BinaryOperator>(BBI)) { + RHS = dyn_cast<ConstantInt>(BBI->getOperand(1)); + if (!RHS) { + Result.markOverdefined(); + return Result; + } + + RHSRange = ConstantRange(RHS->getValue(), RHS->getValue()+1); + } + + // NOTE: We're currently limited by the set of operations that ConstantRange + // can evaluate symbolically. Enhancing that set will allow us to analyze + // more definitions. + switch (BBI->getOpcode()) { + case Instruction::Add: + Result.markConstantRange(LHSRange.add(RHSRange)); + break; + case Instruction::Sub: + Result.markConstantRange(LHSRange.sub(RHSRange)); + break; + case Instruction::Mul: + Result.markConstantRange(LHSRange.multiply(RHSRange)); + break; + case Instruction::UDiv: + Result.markConstantRange(LHSRange.udiv(RHSRange)); + break; + case Instruction::Shl: + Result.markConstantRange(LHSRange.shl(RHSRange)); + break; + case Instruction::LShr: + Result.markConstantRange(LHSRange.lshr(RHSRange)); + break; + case Instruction::Trunc: + Result.markConstantRange(LHSRange.truncate(ResultTy->getBitWidth())); + break; + case Instruction::SExt: + Result.markConstantRange(LHSRange.signExtend(ResultTy->getBitWidth())); + break; + case Instruction::ZExt: + Result.markConstantRange(LHSRange.zeroExtend(ResultTy->getBitWidth())); + break; + case Instruction::BitCast: + Result.markConstantRange(LHSRange); + break; + + // Unhandled instructions are overdefined. + default: + DEBUG(dbgs() << " compute BB '" << BB->getName() + << "' - overdefined because inst def found.\n"); + Result.markOverdefined(); + break; + } + + return Cache[BB] = Result; } @@ -420,28 +636,57 @@ LVILatticeVal LVIQuery::getEdgeValue(BasicBlock *BBFrom, BasicBlock *BBTo) { // it is. if (BI->getCondition() == Val) return LVILatticeVal::get(ConstantInt::get( - Type::getInt1Ty(Val->getContext()), isTrueDest)); + Type::getInt1Ty(Val->getContext()), isTrueDest)); // If the condition of the branch is an equality comparison, we may be // able to infer the value. - if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) - if (ICI->isEquality() && ICI->getOperand(0) == Val && - isa<Constant>(ICI->getOperand(1))) { + ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition()); + if (ICI && ICI->getOperand(0) == Val && + isa<Constant>(ICI->getOperand(1))) { + if (ICI->isEquality()) { // We know that V has the RHS constant if this is a true SETEQ or // false SETNE. if (isTrueDest == (ICI->getPredicate() == ICmpInst::ICMP_EQ)) return LVILatticeVal::get(cast<Constant>(ICI->getOperand(1))); return LVILatticeVal::getNot(cast<Constant>(ICI->getOperand(1))); } + + if (ConstantInt *CI = dyn_cast<ConstantInt>(ICI->getOperand(1))) { + // Calculate the range of values that would satisfy the comparison. + ConstantRange CmpRange(CI->getValue(), CI->getValue()+1); + ConstantRange TrueValues = + ConstantRange::makeICmpRegion(ICI->getPredicate(), CmpRange); + + // If we're interested in the false dest, invert the condition. + if (!isTrueDest) TrueValues = TrueValues.inverse(); + + // Figure out the possible values of the query BEFORE this branch.
+ LVILatticeVal InBlock = getBlockValue(BBFrom); + if (!InBlock.isConstantRange()) + return LVILatticeVal::getRange(TrueValues); + + // Find all potential values that satisfy both the input and output + // conditions. + ConstantRange PossibleValues = + TrueValues.intersectWith(InBlock.getConstantRange()); + + return LVILatticeVal::getRange(PossibleValues); + } + } } } // If the edge was formed by a switch on the value, then we may know exactly // what it is. if (SwitchInst *SI = dyn_cast<SwitchInst>(BBFrom->getTerminator())) { - // If BBTo is the default destination of the switch, we don't know anything. - // Given a more powerful range analysis we could know stuff. - if (SI->getCondition() == Val && SI->getDefaultDest() != BBTo) { + if (SI->getCondition() == Val) { + // We don't know anything in the default case. + if (SI->getDefaultDest() == BBTo) { + LVILatticeVal Result; + Result.markOverdefined(); + return Result; + } + // We only know something if there is exactly one value that goes from // BBFrom to BBTo. unsigned NumEdges = 0; @@ -474,7 +719,9 @@ LVILatticeVal LazyValueInfoCache::getValueInBlock(Value *V, BasicBlock *BB) { DEBUG(dbgs() << "LVI Getting block end value " << *V << " at '" << BB->getName() << "'\n"); - LVILatticeVal Result = LVIQuery(V, ValueCache[V]).getBlockValue(BB); + LVILatticeVal Result = LVIQuery(V, *this, + ValueCache[LVIValueHandle(V, this)], + OverDefinedCache).getBlockValue(BB); DEBUG(dbgs() << " Result = " << Result << "\n"); return Result; @@ -488,24 +735,80 @@ getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB) { DEBUG(dbgs() << "LVI Getting edge value " << *V << " from '" << FromBB->getName() << "' to '" << ToBB->getName() << "'\n"); + LVILatticeVal Result = - LVIQuery(V, ValueCache[V]).getEdgeValue(FromBB, ToBB); + LVIQuery(V, *this, ValueCache[LVIValueHandle(V, this)], + OverDefinedCache).getEdgeValue(FromBB, ToBB); DEBUG(dbgs() << " Result = " << Result << "\n"); return Result; } +void LazyValueInfoCache::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, + BasicBlock *NewSucc) { + // When an edge in the graph has been threaded, values that we could not + // determine a value for before (i.e. were marked overdefined) may be possible + // to solve now. We do NOT try to proactively update these values. Instead, + // we clear their entries from the cache, and allow lazy updating to recompute + // them when needed. + + // The updating process is fairly simple: we need to drop cached info + // for all values that were marked overdefined in OldSucc, and for those same + // values in any successor of OldSucc (except NewSucc) in which they were + // also marked overdefined. + std::vector<BasicBlock*> worklist; + worklist.push_back(OldSucc); + + DenseSet<Value*> ClearSet; + for (std::set<std::pair<AssertingVH<BasicBlock>, Value*> >::iterator + I = OverDefinedCache.begin(), E = OverDefinedCache.end(); I != E; ++I) { + if (I->first == OldSucc) + ClearSet.insert(I->second); + } + + // Use a worklist to perform a depth-first search of OldSucc's successors. + // NOTE: We do not need a visited list since any blocks we have already + // visited will have had their overdefined markers cleared already, and we + // thus won't loop to their successors. + while (!worklist.empty()) { + BasicBlock *ToUpdate = worklist.back(); + worklist.pop_back(); + + // Skip blocks only accessible through NewSucc.
+ if (ToUpdate == NewSucc) continue; + + bool changed = false; + for (DenseSet<Value*>::iterator I = ClearSet.begin(),E = ClearSet.end(); + I != E; ++I) { + // If a value was marked overdefined in OldSucc, and is here too... + std::set<std::pair<AssertingVH<BasicBlock>, Value*> >::iterator OI = + OverDefinedCache.find(std::make_pair(ToUpdate, *I)); + if (OI == OverDefinedCache.end()) continue; + + // Remove it from the caches. + ValueCacheEntryTy &Entry = ValueCache[LVIValueHandle(*I, this)]; + ValueCacheEntryTy::iterator CI = Entry.find(ToUpdate); + + assert(CI != Entry.end() && "Couldn't find entry to update?"); + Entry.erase(CI); + OverDefinedCache.erase(OI); + + // If we removed anything, then we potentially need to update + // the block's successors too. + changed = true; + } + + if (!changed) continue; + + worklist.insert(worklist.end(), succ_begin(ToUpdate), succ_end(ToUpdate)); + } +} + //===----------------------------------------------------------------------===// // LazyValueInfo Impl //===----------------------------------------------------------------------===// -bool LazyValueInfo::runOnFunction(Function &F) { - TD = getAnalysisIfAvailable<TargetData>(); - // Fully lazy. - return false; -} - /// getCache - This lazily constructs the LazyValueInfoCache. static LazyValueInfoCache &getCache(void *&PImpl) { if (!PImpl) @@ -513,6 +816,15 @@ static LazyValueInfoCache &getCache(void *&PImpl) { return *static_cast<LazyValueInfoCache*>(PImpl); } +bool LazyValueInfo::runOnFunction(Function &F) { + if (PImpl) + getCache(PImpl).clear(); + + TD = getAnalysisIfAvailable<TargetData>(); + // Fully lazy. + return false; +} + void LazyValueInfo::releaseMemory() { // If the cache was allocated, free it. if (PImpl) { @@ -526,6 +838,11 @@ Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB) { if (Result.isConstant()) return Result.getConstant(); + else if (Result.isConstantRange()) { + ConstantRange CR = Result.getConstantRange(); + if (const APInt *SingleVal = CR.getSingleElement()) + return ConstantInt::get(V->getContext(), *SingleVal); + } return 0; } @@ -537,6 +854,11 @@ Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB, if (Result.isConstant()) return Result.getConstant(); + else if (Result.isConstantRange()) { + ConstantRange CR = Result.getConstantRange(); + if (const APInt *SingleVal = CR.getSingleElement()) + return ConstantInt::get(V->getContext(), *SingleVal); + } return 0; } @@ -557,6 +879,36 @@ LazyValueInfo::getPredicateOnEdge(unsigned Pred, Value *V, Constant *C, return Unknown; } + if (Result.isConstantRange()) { + ConstantInt *CI = dyn_cast<ConstantInt>(C); + if (!CI) return Unknown; + + ConstantRange CR = Result.getConstantRange(); + if (Pred == ICmpInst::ICMP_EQ) { + if (!CR.contains(CI->getValue())) + return False; + + if (CR.isSingleElement() && CR.contains(CI->getValue())) + return True; + } else if (Pred == ICmpInst::ICMP_NE) { + if (!CR.contains(CI->getValue())) + return True; + + if (CR.isSingleElement() && CR.contains(CI->getValue())) + return False; + } + + // Handle more complex predicates. + ConstantRange RHS(CI->getValue(), CI->getValue()+1); + ConstantRange TrueValues = ConstantRange::makeICmpRegion(Pred, RHS); + if (CR.intersectWith(TrueValues).isEmptySet()) + return False; + else if (TrueValues.contains(CR)) + return True; + + return Unknown; + } + if (Result.isNotConstant()) { // If this is an equality comparison, we can try to fold it knowing that // "V != C1".
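As an aside, the constant-range tri-state used in getPredicateOnEdge reduces to three ConstantRange operations. Here is a minimal standalone sketch, not part of the patch, that mirrors the hunk above using only ConstantRange calls that appear in it; the helper name evalPredicate is hypothetical.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/ConstantRange.h"
using namespace llvm;

// Decide predicate Pred for a value known to lie in CR, compared against the
// constant C. Returns 1 = always true, 0 = always false, -1 = unknown.
static int evalPredicate(unsigned Pred, const ConstantRange &CR,
                         const APInt &C) {
  ConstantRange RHS(C, C + 1);                        // the singleton set {C}
  ConstantRange TrueValues = ConstantRange::makeICmpRegion(Pred, RHS);
  if (CR.intersectWith(TrueValues).isEmptySet())
    return 0;    // no value in CR can satisfy the predicate
  if (TrueValues.contains(CR))
    return 1;    // every value in CR satisfies it
  return -1;     // CR straddles the boundary; give up
}

For example, with CR = [0, 4) over i8, ICMP_ULT against 10 yields 1 (the whole range is below 10), while ICMP_EQ against 9 yields 0 (the intersection is empty).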
@@ -579,4 +931,11 @@ LazyValueInfo::getPredicateOnEdge(unsigned Pred, Value *V, Constant *C, return Unknown; } +void LazyValueInfo::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, + BasicBlock* NewSucc) { + if (PImpl) getCache(PImpl).threadEdge(PredBB, OldSucc, NewSucc); +} +void LazyValueInfo::eraseBlock(BasicBlock *BB) { + if (PImpl) getCache(PImpl).eraseBlock(BB); +} diff --git a/lib/Analysis/LibCallAliasAnalysis.cpp b/lib/Analysis/LibCallAliasAnalysis.cpp index 7419659298902..7f51202ecb558 100644 --- a/lib/Analysis/LibCallAliasAnalysis.cpp +++ b/lib/Analysis/LibCallAliasAnalysis.cpp @@ -20,11 +20,8 @@ using namespace llvm; // Register this pass... char LibCallAliasAnalysis::ID = 0; -static RegisterPass<LibCallAliasAnalysis> -X("libcall-aa", "LibCall Alias Analysis", false, true); - -// Declare that we implement the AliasAnalysis interface -static RegisterAnalysisGroup<AliasAnalysis> Y(X); +INITIALIZE_AG_PASS(LibCallAliasAnalysis, AliasAnalysis, "libcall-aa", + "LibCall Alias Analysis", false, true, false); FunctionPass *llvm::createLibCallAliasAnalysisPass(LibCallInfo *LCI) { return new LibCallAliasAnalysis(LCI); @@ -46,7 +43,7 @@ void LibCallAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { /// vs the specified pointer/size. AliasAnalysis::ModRefResult LibCallAliasAnalysis::AnalyzeLibCallDetails(const LibCallFunctionInfo *FI, - CallSite CS, Value *P, + ImmutableCallSite CS, const Value *P, unsigned Size) { // If we have a function, check to see what kind of mod/ref effects it // has. Start by including any info globally known about the function. @@ -120,13 +117,14 @@ LibCallAliasAnalysis::AnalyzeLibCallDetails(const LibCallFunctionInfo *FI, // specified memory object. // AliasAnalysis::ModRefResult -LibCallAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) { +LibCallAliasAnalysis::getModRefInfo(ImmutableCallSite CS, + const Value *P, unsigned Size) { ModRefResult MRInfo = ModRef; // If this is a direct call to a function that LCI knows about, get the // information about the runtime function. if (LCI) { - if (Function *F = CS.getCalledFunction()) { + if (const Function *F = CS.getCalledFunction()) { if (const LibCallFunctionInfo *FI = LCI->getFunctionInfo(F)) { MRInfo = ModRefResult(MRInfo & AnalyzeLibCallDetails(FI, CS, P, Size)); if (MRInfo == NoModRef) return NoModRef; diff --git a/lib/Analysis/LibCallSemantics.cpp b/lib/Analysis/LibCallSemantics.cpp index e0060c3e89b1a..81b0f46f3740e 100644 --- a/lib/Analysis/LibCallSemantics.cpp +++ b/lib/Analysis/LibCallSemantics.cpp @@ -40,7 +40,8 @@ const LibCallLocationInfo &LibCallInfo::getLocationInfo(unsigned LocID) const { /// getFunctionInfo - Return the LibCallFunctionInfo object corresponding to /// the specified function if we have it. If not, return null. 
-const LibCallFunctionInfo *LibCallInfo::getFunctionInfo(Function *F) const { +const LibCallFunctionInfo * +LibCallInfo::getFunctionInfo(const Function *F) const { StringMap<const LibCallFunctionInfo*> *Map = getMap(Impl); /// If this is the first time we are querying for this info, lazily construct diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp index 9f1b30d2cf45c..a9d972435f5fb 100644 --- a/lib/Analysis/Lint.cpp +++ b/lib/Analysis/Lint.cpp @@ -108,7 +108,7 @@ namespace { raw_string_ostream MessagesStr; static char ID; // Pass identification, replacement for typeid - Lint() : FunctionPass(&ID), MessagesStr(Messages) {} + Lint() : FunctionPass(ID), MessagesStr(Messages) {} virtual bool runOnFunction(Function &F); @@ -167,8 +167,7 @@ namespace { } char Lint::ID = 0; -static RegisterPass<Lint> -X("lint", "Statically lint-checks LLVM IR", false, true); +INITIALIZE_PASS(Lint, "lint", "Statically lint-checks LLVM IR", false, true); // Assert - We know that cond should be true, if not print an error message. #define Assert(C, M) \ @@ -247,8 +246,7 @@ void Lint::visitCallSite(CallSite CS) { // where nothing is known. if (Formal->hasNoAliasAttr() && Actual->getType()->isPointerTy()) for (CallSite::arg_iterator BI = CS.arg_begin(); BI != AE; ++BI) { - Assert1(AI == BI || - AA->alias(*AI, ~0u, *BI, ~0u) != AliasAnalysis::MustAlias, + Assert1(AI == BI || AA->alias(*AI, *BI) != AliasAnalysis::MustAlias, "Unusual: noalias argument aliases another argument", &I); } @@ -520,6 +518,9 @@ void Lint::visitVAArgInst(VAArgInst &I) { void Lint::visitIndirectBrInst(IndirectBrInst &I) { visitMemoryReference(I, I.getAddress(), ~0u, 0, 0, MemRef::Branchee); + + Assert1(I.getNumDestinations() != 0, + "Undefined behavior: indirectbr with no destinations", &I); } void Lint::visitExtractElementInst(ExtractElementInst &I) { diff --git a/lib/Analysis/LiveValues.cpp b/lib/Analysis/LiveValues.cpp index 23964ffc457ed..0225f4fa25486 100644 --- a/lib/Analysis/LiveValues.cpp +++ b/lib/Analysis/LiveValues.cpp @@ -22,10 +22,10 @@ namespace llvm { } char LiveValues::ID = 0; -static RegisterPass<LiveValues> -X("live-values", "Value Liveness Analysis", false, true); +INITIALIZE_PASS(LiveValues, "live-values", + "Value Liveness Analysis", false, true); -LiveValues::LiveValues() : FunctionPass(&ID) {} +LiveValues::LiveValues() : FunctionPass(ID) {} void LiveValues::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<DominatorTree>(); diff --git a/lib/Analysis/LoopDependenceAnalysis.cpp b/lib/Analysis/LoopDependenceAnalysis.cpp index e1019474cf431..82c02dcd13425 100644 --- a/lib/Analysis/LoopDependenceAnalysis.cpp +++ b/lib/Analysis/LoopDependenceAnalysis.cpp @@ -46,8 +46,8 @@ LoopPass *llvm::createLoopDependenceAnalysisPass() { return new LoopDependenceAnalysis(); } -static RegisterPass<LoopDependenceAnalysis> -R("lda", "Loop Dependence Analysis", false, true); +INITIALIZE_PASS(LoopDependenceAnalysis, "lda", + "Loop Dependence Analysis", false, true); char LoopDependenceAnalysis::ID = 0; //===----------------------------------------------------------------------===// diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index 818d0a9dd1146..46219d1b6f55c 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -38,8 +38,7 @@ VerifyLoopInfoX("verify-loop-info", cl::location(VerifyLoopInfo), cl::desc("Verify loop info (time consuming)")); char LoopInfo::ID = 0; -static RegisterPass<LoopInfo> -X("loops", "Natural Loop Information", true, true); +INITIALIZE_PASS(LoopInfo, "loops", "Natural 
Loop Information", true, true); //===----------------------------------------------------------------------===// // Loop implementation @@ -124,14 +123,13 @@ PHINode *Loop::getCanonicalInductionVariable() const { BasicBlock *H = getHeader(); BasicBlock *Incoming = 0, *Backedge = 0; - typedef GraphTraits<Inverse<BasicBlock*> > InvBlockTraits; - InvBlockTraits::ChildIteratorType PI = InvBlockTraits::child_begin(H); - assert(PI != InvBlockTraits::child_end(H) && + pred_iterator PI = pred_begin(H); + assert(PI != pred_end(H) && "Loop must have at least one backedge!"); Backedge = *PI++; - if (PI == InvBlockTraits::child_end(H)) return 0; // dead loop + if (PI == pred_end(H)) return 0; // dead loop Incoming = *PI++; - if (PI != InvBlockTraits::child_end(H)) return 0; // multiple backedges? + if (PI != pred_end(H)) return 0; // multiple backedges? if (contains(Incoming)) { if (contains(Backedge)) @@ -157,18 +155,6 @@ PHINode *Loop::getCanonicalInductionVariable() const { return 0; } -/// getCanonicalInductionVariableIncrement - Return the LLVM value that holds -/// the canonical induction variable value for the "next" iteration of the -/// loop. This always succeeds if getCanonicalInductionVariable succeeds. -/// -Instruction *Loop::getCanonicalInductionVariableIncrement() const { - if (PHINode *PN = getCanonicalInductionVariable()) { - bool P1InLoop = contains(PN->getIncomingBlock(1)); - return cast<Instruction>(PN->getIncomingValue(P1InLoop)); - } - return 0; -} - /// getTripCount - Return a loop-invariant LLVM value indicating the number of /// times the loop will be executed. Note that this means that the backedge /// of the loop executes N-1 times. If the trip-count cannot be determined, @@ -180,12 +166,12 @@ Instruction *Loop::getCanonicalInductionVariableIncrement() const { Value *Loop::getTripCount() const { // Canonical loops will end with a 'cmp ne I, V', where I is the incremented // canonical induction variable and V is the trip count of the loop. - Instruction *Inc = getCanonicalInductionVariableIncrement(); - if (Inc == 0) return 0; - PHINode *IV = cast<PHINode>(Inc->getOperand(0)); + PHINode *IV = getCanonicalInductionVariable(); + if (IV == 0 || IV->getNumIncomingValues() != 2) return 0; - BasicBlock *BackedgeBlock = - IV->getIncomingBlock(contains(IV->getIncomingBlock(1))); + bool P0InLoop = contains(IV->getIncomingBlock(0)); + Value *Inc = IV->getIncomingValue(!P0InLoop); + BasicBlock *BackedgeBlock = IV->getIncomingBlock(!P0InLoop); if (BranchInst *BI = dyn_cast<BranchInst>(BackedgeBlock->getTerminator())) if (BI->isConditional()) { @@ -341,16 +327,12 @@ Loop::getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const { BasicBlock *current = *BI; switchExitBlocks.clear(); - typedef GraphTraits<BasicBlock *> BlockTraits; - typedef GraphTraits<Inverse<BasicBlock *> > InvBlockTraits; - for (BlockTraits::ChildIteratorType I = - BlockTraits::child_begin(*BI), E = BlockTraits::child_end(*BI); - I != E; ++I) { + for (succ_iterator I = succ_begin(*BI), E = succ_end(*BI); I != E; ++I) { // If block is inside the loop then it is not a exit block. 
if (std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I)) continue; - InvBlockTraits::ChildIteratorType PI = InvBlockTraits::child_begin(*I); + pred_iterator PI = pred_begin(*I); BasicBlock *firstPred = *PI; // If current basic block is this exit block's first predecessor @@ -363,8 +345,7 @@ Loop::getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const { // If a terminator has more than two successors, for example SwitchInst, // then it is possible that there are multiple edges from current block // to one exit block. - if (std::distance(BlockTraits::child_begin(current), - BlockTraits::child_end(current)) <= 2) { + if (std::distance(succ_begin(current), succ_end(current)) <= 2) { ExitBlocks.push_back(*I); continue; } diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp index 2727d2f9465c1..15d4db8f5f98e 100644 --- a/lib/Analysis/LoopPass.cpp +++ b/lib/Analysis/LoopPass.cpp @@ -30,9 +30,9 @@ private: public: static char ID; - PrintLoopPass() : LoopPass(&ID), Out(dbgs()) {} + PrintLoopPass() : LoopPass(ID), Out(dbgs()) {} PrintLoopPass(const std::string &B, raw_ostream &o) - : LoopPass(&ID), Banner(B), Out(o) {} + : LoopPass(ID), Banner(B), Out(o) {} virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); @@ -59,7 +59,7 @@ char PrintLoopPass::ID = 0; char LPPassManager::ID = 0; LPPassManager::LPPassManager(int Depth) - : FunctionPass(&ID), PMDataManager(Depth) { + : FunctionPass(ID), PMDataManager(Depth) { skipThisLoop = false; redoThisLoop = false; LI = NULL; @@ -183,7 +183,7 @@ void LPPassManager::redoLoop(Loop *L) { void LPPassManager::cloneBasicBlockSimpleAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { - LoopPass *LP = (LoopPass *)getContainedPass(Index); + LoopPass *LP = getContainedPass(Index); LP->cloneBasicBlockAnalysis(From, To, L); } } @@ -198,7 +198,7 @@ void LPPassManager::deleteSimpleAnalysisValue(Value *V, Loop *L) { } } for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { - LoopPass *LP = (LoopPass *)getContainedPass(Index); + LoopPass *LP = getContainedPass(Index); LP->deleteAnalysisValue(V, L); } } @@ -240,7 +240,7 @@ bool LPPassManager::runOnFunction(Function &F) { I != E; ++I) { Loop *L = *I; for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { - LoopPass *P = (LoopPass*)getContainedPass(Index); + LoopPass *P = getContainedPass(Index); Changed |= P->doInitialization(L, *this); } } @@ -254,7 +254,7 @@ bool LPPassManager::runOnFunction(Function &F) { // Run all passes on the current Loop. for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { - LoopPass *P = (LoopPass*)getContainedPass(Index); + LoopPass *P = getContainedPass(Index); dumpPassInfo(P, EXECUTION_MSG, ON_LOOP_MSG, CurrentLoop->getHeader()->getName()); @@ -320,7 +320,7 @@ bool LPPassManager::runOnFunction(Function &F) { // Finalization for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { - LoopPass *P = (LoopPass *)getContainedPass(Index); + LoopPass *P = getContainedPass(Index); Changed |= P->doFinalization(); } diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 1f54d740db9de..d18d5ce0ea4cb 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -46,11 +46,11 @@ STATISTIC(NumCacheCompleteNonLocalPtr, char MemoryDependenceAnalysis::ID = 0; // Register this pass...
-static RegisterPass<MemoryDependenceAnalysis> X("memdep", - "Memory Dependence Analysis", false, true); +INITIALIZE_PASS(MemoryDependenceAnalysis, "memdep", + "Memory Dependence Analysis", false, true); MemoryDependenceAnalysis::MemoryDependenceAnalysis() -: FunctionPass(&ID), PredCache(0) { +: FunctionPass(ID), PredCache(0) { } MemoryDependenceAnalysis::~MemoryDependenceAnalysis() { } @@ -120,33 +120,21 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall, Pointer = CI->getArgOperand(0); // calls to free() erase the entire structure PointerSize = ~0ULL; - } else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) { + } else if (CallSite InstCS = cast<Value>(Inst)) { // Debug intrinsics don't cause dependences. if (isa<DbgInfoIntrinsic>(Inst)) continue; - CallSite InstCS = CallSite::get(Inst); // If these two calls do not interfere, look past it. switch (AA->getModRefInfo(CS, InstCS)) { case AliasAnalysis::NoModRef: - // If the two calls don't interact (e.g. InstCS is readnone) keep - // scanning. + // If the two calls are the same, return InstCS as a Def, so that + // CS can be found redundant and eliminated. + if (isReadOnlyCall && InstCS.onlyReadsMemory() && + CS.getInstruction()->isIdenticalToWhenDefined(Inst)) + return MemDepResult::getDef(Inst); + + // Otherwise if the two calls don't interact (e.g. InstCS is readnone) + // keep scanning. continue; - case AliasAnalysis::Ref: - // If the two calls read the same memory locations and CS is a readonly - // function, then we have two cases: 1) the calls may not interfere with - // each other at all. 2) the calls may produce the same value. In case - // #1 we want to ignore the values, in case #2, we want to return Inst - // as a Def dependence. This allows us to CSE in cases like: - // X = strlen(P); - // memchr(...); - // Y = strlen(P); // Y = X - if (isReadOnlyCall) { - if (CS.getCalledFunction() != 0 && - CS.getCalledFunction() == InstCS.getCalledFunction()) - return MemDepResult::getDef(Inst); - // Ignore unrelated read/read call dependences. - continue; - } - // FALL THROUGH default: return MemDepResult::getClobber(Inst); } @@ -196,8 +184,7 @@ getPointerDependencyFrom(Value *MemPtr, uint64_t MemSize, bool isLoad, // FIXME: This only considers queries directly on the invariant-tagged // pointer, not on query pointers that are indexed off of them. It'd // be nice to handle that at some point. - AliasAnalysis::AliasResult R = - AA->alias(II->getArgOperand(2), ~0U, MemPtr, ~0U); + AliasAnalysis::AliasResult R = AA->alias(II->getArgOperand(2), MemPtr); if (R == AliasAnalysis::MustAlias) { InvariantTag = II->getArgOperand(0); continue; @@ -209,8 +196,7 @@ getPointerDependencyFrom(Value *MemPtr, uint64_t MemSize, bool isLoad, // FIXME: This only considers queries directly on the invariant-tagged // pointer, not on query pointers that are indexed off of them. It'd // be nice to handle that at some point. 
- AliasAnalysis::AliasResult R = - AA->alias(II->getArgOperand(1), ~0U, MemPtr, ~0U); + AliasAnalysis::AliasResult R = AA->alias(II->getArgOperand(1), MemPtr); if (R == AliasAnalysis::MustAlias) return MemDepResult::getDef(II); } @@ -387,7 +373,7 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) { MemSize = cast<ConstantInt>(II->getArgOperand(1))->getZExtValue(); break; default: - CallSite QueryCS = CallSite::get(QueryInst); + CallSite QueryCS(QueryInst); bool isReadOnly = AA->onlyReadsMemory(QueryCS); LocalCache = getCallSiteDependencyFrom(QueryCS, isReadOnly, ScanPos, QueryParent); diff --git a/lib/Analysis/ModuleDebugInfoPrinter.cpp b/lib/Analysis/ModuleDebugInfoPrinter.cpp index 556d4c8aab54a..2cc1c2aa005ca 100644 --- a/lib/Analysis/ModuleDebugInfoPrinter.cpp +++ b/lib/Analysis/ModuleDebugInfoPrinter.cpp @@ -30,7 +30,7 @@ namespace { DebugInfoFinder Finder; public: static char ID; // Pass identification, replacement for typeid - ModuleDebugInfoPrinter() : ModulePass(&ID) {} + ModuleDebugInfoPrinter() : ModulePass(ID) {} virtual bool runOnModule(Module &M); @@ -42,9 +42,8 @@ namespace { } char ModuleDebugInfoPrinter::ID = 0; -static RegisterPass<ModuleDebugInfoPrinter> -X("module-debuginfo", - "Decodes module-level debug info", false, true); +INITIALIZE_PASS(ModuleDebugInfoPrinter, "module-debuginfo", + "Decodes module-level debug info", false, true); ModulePass *llvm::createModuleDebugInfoPrinterPass() { return new ModuleDebugInfoPrinter(); diff --git a/lib/Analysis/PointerTracking.cpp b/lib/Analysis/PointerTracking.cpp index 14df0b7198791..07f46824700a8 100644 --- a/lib/Analysis/PointerTracking.cpp +++ b/lib/Analysis/PointerTracking.cpp @@ -28,7 +28,7 @@ using namespace llvm; char PointerTracking::ID = 0; -PointerTracking::PointerTracking() : FunctionPass(&ID) {} +PointerTracking::PointerTracking() : FunctionPass(ID) {} bool PointerTracking::runOnFunction(Function &F) { predCache.clear(); @@ -144,6 +144,55 @@ const SCEV *PointerTracking::computeAllocationCount(Value *P, return SE->getCouldNotCompute(); } +Value *PointerTracking::computeAllocationCountValue(Value *P, const Type *&Ty) const +{ + Value *V = P->stripPointerCasts(); + if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) { + Ty = AI->getAllocatedType(); + // arraySize elements of type Ty. + return AI->getArraySize(); + } + + if (CallInst *CI = extractMallocCall(V)) { + Ty = getMallocAllocatedType(CI); + if (!Ty) + return 0; + Value *arraySize = getMallocArraySize(CI, TD); + if (!arraySize) { + Ty = Type::getInt8Ty(P->getContext()); + return CI->getArgOperand(0); + } + // arraySize elements of type Ty. + return arraySize; + } + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) { + if (GV->hasDefinitiveInitializer()) { + Constant *C = GV->getInitializer(); + if (const ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) { + Ty = ATy->getElementType(); + return ConstantInt::get(Type::getInt32Ty(P->getContext()), + ATy->getNumElements()); + } + } + Ty = cast<PointerType>(GV->getType())->getElementType(); + return ConstantInt::get(Type::getInt32Ty(P->getContext()), 1); + //TODO: implement more tracking for globals + } + + if (CallInst *CI = dyn_cast<CallInst>(V)) { + CallSite CS(CI); + Function *F = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); + if (F == reallocFunc) { + Ty = Type::getInt8Ty(P->getContext()); + // realloc allocates arg1 bytes. + return CS.getArgument(1); + } + } + + return 0; +} + // Calculates the number of elements of type Ty allocated for P. 
const SCEV *PointerTracking::computeAllocationCountForType(Value *P, const Type *Ty) @@ -263,5 +312,5 @@ void PointerTracking::print(raw_ostream &OS, const Module* M) const { } } -static RegisterPass<PointerTracking> X("pointertracking", - "Track pointer bounds", false, true); +INITIALIZE_PASS(PointerTracking, "pointertracking", + "Track pointer bounds", false, true); diff --git a/lib/Analysis/PostDominators.cpp b/lib/Analysis/PostDominators.cpp index 7354afa181b2a..cbe8d1867e4f1 100644 --- a/lib/Analysis/PostDominators.cpp +++ b/lib/Analysis/PostDominators.cpp @@ -28,8 +28,8 @@ using namespace llvm; char PostDominatorTree::ID = 0; char PostDominanceFrontier::ID = 0; -static RegisterPass<PostDominatorTree> -F("postdomtree", "Post-Dominator Tree Construction", true, true); +INITIALIZE_PASS(PostDominatorTree, "postdomtree", + "Post-Dominator Tree Construction", true, true); bool PostDominatorTree::runOnFunction(Function &F) { DT->recalculate(F); @@ -53,8 +53,8 @@ FunctionPass* llvm::createPostDomTree() { // PostDominanceFrontier Implementation //===----------------------------------------------------------------------===// -static RegisterPass<PostDominanceFrontier> -H("postdomfrontier", "Post-Dominance Frontier Construction", true, true); +INITIALIZE_PASS(PostDominanceFrontier, "postdomfrontier", + "Post-Dominance Frontier Construction", true, true); const DominanceFrontier::DomSetType & PostDominanceFrontier::calculate(const PostDominatorTree &DT, diff --git a/lib/Analysis/ProfileEstimatorPass.cpp b/lib/Analysis/ProfileEstimatorPass.cpp index da4ce47692624..ecc0a1845307a 100644 --- a/lib/Analysis/ProfileEstimatorPass.cpp +++ b/lib/Analysis/ProfileEstimatorPass.cpp @@ -39,7 +39,7 @@ namespace { public: static char ID; // Class identification, replacement for typeinfo explicit ProfileEstimatorPass(const double execcount = 0) - : FunctionPass(&ID), ExecCount(execcount) { + : FunctionPass(ID), ExecCount(execcount) { if (execcount == 0) ExecCount = LoopWeight; } @@ -59,8 +59,8 @@ namespace { /// an analysis interface through multiple inheritance. If needed, it /// should override this to adjust the this pointer as needed for the /// specified pass info. 
- virtual void *getAdjustedAnalysisPointer(const PassInfo *PI) { - if (PI->isPassID(&ProfileInfo::ID)) + virtual void *getAdjustedAnalysisPointer(AnalysisID PI) { + if (PI == &ProfileInfo::ID) return (ProfileInfo*)this; return this; } @@ -72,13 +72,11 @@ namespace { } // End of anonymous namespace char ProfileEstimatorPass::ID = 0; -static RegisterPass<ProfileEstimatorPass> -X("profile-estimator", "Estimate profiling information", false, true); - -static RegisterAnalysisGroup<ProfileInfo> Y(X); +INITIALIZE_AG_PASS(ProfileEstimatorPass, ProfileInfo, "profile-estimator", + "Estimate profiling information", false, true, false); namespace llvm { - const PassInfo *ProfileEstimatorPassID = &X; + char &ProfileEstimatorPassID = ProfileEstimatorPass::ID; FunctionPass *createProfileEstimatorPass() { return new ProfileEstimatorPass(); diff --git a/lib/Analysis/ProfileInfo.cpp b/lib/Analysis/ProfileInfo.cpp index 8d2712fd6e063..fc7f28662c017 100644 --- a/lib/Analysis/ProfileInfo.cpp +++ b/lib/Analysis/ProfileInfo.cpp @@ -1076,14 +1076,14 @@ raw_ostream& operator<<(raw_ostream &O, std::pair<const MachineBasicBlock *, con namespace { struct NoProfileInfo : public ImmutablePass, public ProfileInfo { static char ID; // Class identification, replacement for typeinfo - NoProfileInfo() : ImmutablePass(&ID) {} + NoProfileInfo() : ImmutablePass(ID) {} /// getAdjustedAnalysisPointer - This method is used when a pass implements /// an analysis interface through multiple inheritance. If needed, it /// should override this to adjust the this pointer as needed for the /// specified pass info. - virtual void *getAdjustedAnalysisPointer(const PassInfo *PI) { - if (PI->isPassID(&ProfileInfo::ID)) + virtual void *getAdjustedAnalysisPointer(AnalysisID PI) { + if (PI == &ProfileInfo::ID) return (ProfileInfo*)this; return this; } @@ -1096,10 +1096,7 @@ namespace { char NoProfileInfo::ID = 0; // Register this pass... -static RegisterPass<NoProfileInfo> -X("no-profile", "No Profile Information", false, true); - -// Declare that we implement the ProfileInfo interface -static RegisterAnalysisGroup<ProfileInfo, true> Y(X); +INITIALIZE_AG_PASS(NoProfileInfo, ProfileInfo, "no-profile", + "No Profile Information", false, true, true); ImmutablePass *llvm::createNoProfileInfoPass() { return new NoProfileInfo(); } diff --git a/lib/Analysis/ProfileInfoLoaderPass.cpp b/lib/Analysis/ProfileInfoLoaderPass.cpp index 8ea4ecf54f98f..d325b574e8482 100644 --- a/lib/Analysis/ProfileInfoLoaderPass.cpp +++ b/lib/Analysis/ProfileInfoLoaderPass.cpp @@ -45,7 +45,7 @@ namespace { public: static char ID; // Class identification, replacement for typeinfo explicit LoaderPass(const std::string &filename = "") - : ModulePass(&ID), Filename(filename) { + : ModulePass(ID), Filename(filename) { if (filename.empty()) Filename = ProfileInfoFilename; } @@ -67,8 +67,8 @@ namespace { /// an analysis interface through multiple inheritance. If needed, it /// should override this to adjust the this pointer as needed for the /// specified pass info. 
- virtual void *getAdjustedAnalysisPointer(const PassInfo *PI) { - if (PI->isPassID(&ProfileInfo::ID)) + virtual void *getAdjustedAnalysisPointer(AnalysisID PI) { + if (PI == &ProfileInfo::ID) return (ProfileInfo*)this; return this; } @@ -79,12 +79,10 @@ } // End of anonymous namespace char LoaderPass::ID = 0; -static RegisterPass<LoaderPass> -X("profile-loader", "Load profile information from llvmprof.out", false, true); +INITIALIZE_AG_PASS(LoaderPass, ProfileInfo, "profile-loader", + "Load profile information from llvmprof.out", false, true, false); -static RegisterAnalysisGroup<ProfileInfo> Y(X); - -const PassInfo *llvm::ProfileLoaderPassID = &X; +char &llvm::ProfileLoaderPassID = LoaderPass::ID; ModulePass *llvm::createProfileLoaderPass() { return new LoaderPass(); } diff --git a/lib/Analysis/ProfileVerifierPass.cpp b/lib/Analysis/ProfileVerifierPass.cpp index 5d87e14a97b4f..3f01b2d592bc4 100644 --- a/lib/Analysis/ProfileVerifierPass.cpp +++ b/lib/Analysis/ProfileVerifierPass.cpp @@ -59,10 +59,10 @@ namespace llvm { public: static char ID; // Class identification, replacement for typeinfo - explicit ProfileVerifierPassT () : FunctionPass(&ID) { + explicit ProfileVerifierPassT () : FunctionPass(ID) { DisableAssertions = ProfileVerifierDisableAssertions; } - explicit ProfileVerifierPassT (bool da) : FunctionPass(&ID), + explicit ProfileVerifierPassT (bool da) : FunctionPass(ID), DisableAssertions(da) { } @@ -366,8 +366,8 @@ namespace llvm { char ProfileVerifierPassT<FType, BType>::ID = 0; } -static RegisterPass<ProfileVerifierPass> -X("profile-verifier", "Verify profiling information", false, true); +INITIALIZE_PASS(ProfileVerifierPass, "profile-verifier", + "Verify profiling information", false, true); namespace llvm { FunctionPass *createProfileVerifierPass() { diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp new file mode 100644 index 0000000000000..abc057a773a9f --- /dev/null +++ b/lib/Analysis/RegionInfo.cpp @@ -0,0 +1,749 @@ +//===- RegionInfo.cpp - SESE region detection analysis --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Detects single entry single exit regions in the control flow graph. +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RegionIterator.h" + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Analysis/LoopInfo.h" + +#define DEBUG_TYPE "region" +#include "llvm/Support/Debug.h" + +#include <set> +#include <algorithm> + +using namespace llvm; + +// Always verify if expensive checking is enabled. +#ifdef XDEBUG +static bool VerifyRegionInfo = true; +#else +static bool VerifyRegionInfo = false; +#endif + +static cl::opt<bool,true> +VerifyRegionInfoX("verify-region-info", cl::location(VerifyRegionInfo), + cl::desc("Verify region info (time consuming)")); + +STATISTIC(numRegions, "The # of regions"); +STATISTIC(numSimpleRegions, "The # of simple regions"); + +//===----------------------------------------------------------------------===// +/// PrintStyle - Print region in different ways.
+enum PrintStyle { PrintNone, PrintBB, PrintRN }; + +cl::opt<enum PrintStyle> printStyle("print-region-style", cl::Hidden, + cl::desc("style of printing regions"), + cl::values( + clEnumValN(PrintNone, "none", "print no details"), + clEnumValN(PrintBB, "bb", "print regions in detail with block_iterator"), + clEnumValN(PrintRN, "rn", "print regions in detail with element_iterator"), + clEnumValEnd)); +//===----------------------------------------------------------------------===// +/// Region Implementation +Region::Region(BasicBlock *Entry, BasicBlock *Exit, RegionInfo* RInfo, + DominatorTree *dt, Region *Parent) + : RegionNode(Parent, Entry, 1), RI(RInfo), DT(dt), exit(Exit) {} + +Region::~Region() { + // Free the cached nodes. + for (BBNodeMapT::iterator it = BBNodeMap.begin(), + ie = BBNodeMap.end(); it != ie; ++it) + delete it->second; + + // Only clean the cache for this Region. Caches of child Regions will be + // cleaned when the child Regions are deleted. + BBNodeMap.clear(); + + for (iterator I = begin(), E = end(); I != E; ++I) + delete *I; +} + +bool Region::contains(const BasicBlock *B) const { + BasicBlock *BB = const_cast<BasicBlock*>(B); + + assert(DT->getNode(BB) && "BB not part of the dominance tree"); + + BasicBlock *entry = getEntry(), *exit = getExit(); + + // Toplevel region. + if (!exit) + return true; + + return (DT->dominates(entry, BB) + && !(DT->dominates(exit, BB) && DT->dominates(entry, exit))); +} + +bool Region::contains(const Loop *L) const { + // BBs that are not part of any loop are elements of the Loop + // described by the NULL pointer. This loop is not part of any region, + // except if the region describes the whole function. + if (L == 0) + return getExit() == 0; + + if (!contains(L->getHeader())) + return false; + + SmallVector<BasicBlock *, 8> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + for (SmallVectorImpl<BasicBlock*>::iterator BI = ExitingBlocks.begin(), + BE = ExitingBlocks.end(); BI != BE; ++BI) + if (!contains(*BI)) + return false; + + return true; +} + +Loop *Region::outermostLoopInRegion(Loop *L) const { + if (!contains(L)) + return 0; + + while (L && contains(L->getParentLoop())) { + L = L->getParentLoop(); + } + + return L; +} + +Loop *Region::outermostLoopInRegion(LoopInfo *LI, BasicBlock* BB) const { + assert(LI && BB && "LI and BB cannot be null!"); + Loop *L = LI->getLoopFor(BB); + return outermostLoopInRegion(L); +} + +bool Region::isSimple() const { + bool isSimple = true; + bool found = false; + + BasicBlock *entry = getEntry(), *exit = getExit(); + + // TopLevelRegion + if (!exit) + return false; + + for (pred_iterator PI = pred_begin(entry), PE = pred_end(entry); PI != PE; + ++PI) { + BasicBlock *Pred = *PI; + if (DT->getNode(Pred) && !contains(Pred)) { + if (found) { + isSimple = false; + break; + } + found = true; + } + } + + found = false; + + for (pred_iterator PI = pred_begin(exit), PE = pred_end(exit); PI != PE; + ++PI) + if (contains(*PI)) { + if (found) { + isSimple = false; + break; + } + found = true; + } + + return isSimple; +} + +std::string Region::getNameStr() const { + std::string exitName; + std::string entryName; + + if (getEntry()->getName().empty()) { + raw_string_ostream OS(entryName); + + WriteAsOperand(OS, getEntry(), false); + entryName = OS.str(); + } else + entryName = getEntry()->getNameStr(); + + if (getExit()) { + if (getExit()->getName().empty()) { + raw_string_ostream OS(exitName); + + WriteAsOperand(OS, getExit(), false); + exitName = OS.str(); + } else + exitName = 
getExit()->getNameStr(); + } else + exitName = "<Function Return>"; + + return entryName + " => " + exitName; +} + +void Region::verifyBBInRegion(BasicBlock *BB) const { + if (!contains(BB)) + llvm_unreachable("Broken region found!"); + + BasicBlock *entry = getEntry(), *exit = getExit(); + + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) + if (!contains(*SI) && exit != *SI) + llvm_unreachable("Broken region found!"); + + if (entry != BB) + for (pred_iterator SI = pred_begin(BB), SE = pred_end(BB); SI != SE; ++SI) + if (!contains(*SI)) + llvm_unreachable("Broken region found!"); +} + +void Region::verifyWalk(BasicBlock *BB, std::set<BasicBlock*> *visited) const { + BasicBlock *exit = getExit(); + + visited->insert(BB); + + verifyBBInRegion(BB); + + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) + if (*SI != exit && visited->find(*SI) == visited->end()) + verifyWalk(*SI, visited); +} + +void Region::verifyRegion() const { + // Only do verification when user wants to, otherwise this expensive + // check will be invoked by PassManager. + if (!VerifyRegionInfo) return; + + std::set<BasicBlock*> visited; + verifyWalk(getEntry(), &visited); +} + +void Region::verifyRegionNest() const { + for (Region::const_iterator RI = begin(), RE = end(); RI != RE; ++RI) + (*RI)->verifyRegionNest(); + + verifyRegion(); +} + +Region::block_iterator Region::block_begin() { + return GraphTraits<FlatIt<Region*> >::nodes_begin(this); +} + +Region::block_iterator Region::block_end() { + return GraphTraits<FlatIt<Region*> >::nodes_end(this); +} + +Region::const_block_iterator Region::block_begin() const { + return GraphTraits<FlatIt<const Region*> >::nodes_begin(this); +} + +Region::const_block_iterator Region::block_end() const { + return GraphTraits<FlatIt<const Region*> >::nodes_end(this); +} + +Region::element_iterator Region::element_begin() { + return GraphTraits<Region*>::nodes_begin(this); +} + +Region::element_iterator Region::element_end() { + return GraphTraits<Region*>::nodes_end(this); +} + +Region::const_element_iterator Region::element_begin() const { + return GraphTraits<const Region*>::nodes_begin(this); +} + +Region::const_element_iterator Region::element_end() const { + return GraphTraits<const Region*>::nodes_end(this); +} + +Region* Region::getSubRegionNode(BasicBlock *BB) const { + Region *R = RI->getRegionFor(BB); + + if (!R || R == this) + return 0; + + // If we pass the BB out of this region, that means our code is broken. 
+ assert(contains(R) && "BB not in current region!"); + + while (contains(R->getParent()) && R->getParent() != this) + R = R->getParent(); + + if (R->getEntry() != BB) + return 0; + + return R; +} + +RegionNode* Region::getBBNode(BasicBlock *BB) const { + assert(contains(BB) && "Cannot get BB node out of this region!"); + + BBNodeMapT::const_iterator at = BBNodeMap.find(BB); + + if (at != BBNodeMap.end()) + return at->second; + + RegionNode *NewNode = new RegionNode(const_cast<Region*>(this), BB); + BBNodeMap.insert(std::make_pair(BB, NewNode)); + return NewNode; +} + +RegionNode* Region::getNode(BasicBlock *BB) const { + assert(contains(BB) && "Cannot get BB node out of this region!"); + if (Region* Child = getSubRegionNode(BB)) + return Child->getNode(); + + return getBBNode(BB); +} + +void Region::transferChildrenTo(Region *To) { + for (iterator I = begin(), E = end(); I != E; ++I) { + (*I)->parent = To; + To->children.push_back(*I); + } + children.clear(); +} + +void Region::addSubRegion(Region *SubRegion) { + assert(SubRegion->parent == 0 && "SubRegion already has a parent!"); + SubRegion->parent = this; + // Set up the region node. + assert(std::find(children.begin(), children.end(), SubRegion) == children.end() + && "Node already exists!"); + children.push_back(SubRegion); +} + + +Region *Region::removeSubRegion(Region *Child) { + assert(Child->parent == this && "Child is not a child of this region!"); + Child->parent = 0; + RegionSet::iterator I = std::find(children.begin(), children.end(), Child); + assert(I != children.end() && "Region does not exist. Unable to remove."); + children.erase(children.begin()+(I-begin())); + return Child; +} + +unsigned Region::getDepth() const { + unsigned Depth = 0; + + for (Region *R = parent; R != 0; R = R->parent) + ++Depth; + + return Depth; +} + +void Region::print(raw_ostream &OS, bool print_tree, unsigned level) const { + if (print_tree) + OS.indent(level*2) << "[" << level << "] " << getNameStr(); + else + OS.indent(level*2) << getNameStr(); + + OS << "\n"; + + + if (printStyle != PrintNone) { + OS.indent(level*2) << "{\n"; + OS.indent(level*2 + 2); + + if (printStyle == PrintBB) { + for (const_block_iterator I = block_begin(), E = block_end(); I!=E; ++I) + OS << **I << ", "; // TODO: remove the last "," + } else if (printStyle == PrintRN) { + for (const_element_iterator I = element_begin(), E = element_end(); I!=E; ++I) + OS << **I << ", "; // TODO: remove the last "," + } + + OS << "\n"; + } + + if (print_tree) + for (const_iterator RI = begin(), RE = end(); RI != RE; ++RI) + (*RI)->print(OS, print_tree, level+1); + + if (printStyle != PrintNone) + OS.indent(level*2) << "} \n"; +} + +void Region::dump() const { + print(dbgs(), true, getDepth()); +} + +void Region::clearNodeCache() { + BBNodeMap.clear(); + for (Region::iterator RI = begin(), RE = end(); RI != RE; ++RI) + (*RI)->clearNodeCache(); +} + +//===----------------------------------------------------------------------===// +// RegionInfo implementation +// + +bool RegionInfo::isCommonDomFrontier(BasicBlock *BB, BasicBlock *entry, + BasicBlock *exit) const { + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { + BasicBlock *P = *PI; + if (DT->dominates(entry, P) && !DT->dominates(exit, P)) + return false; + } + return true; +} + +bool RegionInfo::isRegion(BasicBlock *entry, BasicBlock *exit) const { + assert(entry && exit && "entry and exit must not be null!"); + typedef DominanceFrontier::DomSetType DST; + + DST *entrySuccs = &DF->find(entry)->second; + + // Exit 
is the header of a loop that contains the entry. In this case, + // the dominance frontier must only contain the exit. + if (!DT->dominates(entry, exit)) { + for (DST::iterator SI = entrySuccs->begin(), SE = entrySuccs->end(); + SI != SE; ++SI) + if (*SI != exit && *SI != entry) + return false; + + return true; + } + + DST *exitSuccs = &DF->find(exit)->second; + + // Do not allow edges leaving the region. + for (DST::iterator SI = entrySuccs->begin(), SE = entrySuccs->end(); + SI != SE; ++SI) { + if (*SI == exit || *SI == entry) + continue; + if (exitSuccs->find(*SI) == exitSuccs->end()) + return false; + if (!isCommonDomFrontier(*SI, entry, exit)) + return false; + } + + // Do not allow edges pointing into the region. + for (DST::iterator SI = exitSuccs->begin(), SE = exitSuccs->end(); + SI != SE; ++SI) + if (DT->properlyDominates(entry, *SI) && *SI != exit) + return false; + + + return true; +} + +void RegionInfo::insertShortCut(BasicBlock *entry, BasicBlock *exit, + BBtoBBMap *ShortCut) const { + assert(entry && exit && "entry and exit must not be null!"); + + BBtoBBMap::iterator e = ShortCut->find(exit); + + if (e == ShortCut->end()) + // No further region at exit available. + (*ShortCut)[entry] = exit; + else { + // We found a region e that starts at exit. Therefore (entry, e->second) + // is also a region, that is larger than (entry, exit). Insert the + // larger one. + BasicBlock *BB = e->second; + (*ShortCut)[entry] = BB; + } +} + +DomTreeNode* RegionInfo::getNextPostDom(DomTreeNode* N, + BBtoBBMap *ShortCut) const { + BBtoBBMap::iterator e = ShortCut->find(N->getBlock()); + + if (e == ShortCut->end()) + return N->getIDom(); + + return PDT->getNode(e->second)->getIDom(); +} + +bool RegionInfo::isTrivialRegion(BasicBlock *entry, BasicBlock *exit) const { + assert(entry && exit && "entry and exit must not be null!"); + + unsigned num_successors = succ_end(entry) - succ_begin(entry); + + if (num_successors <= 1 && exit == *(succ_begin(entry))) + return true; + + return false; +} + +void RegionInfo::updateStatistics(Region *R) { + ++numRegions; + + // TODO: Slow. Should only be enabled if -stats is used. + if (R->isSimple()) ++numSimpleRegions; +} + +Region *RegionInfo::createRegion(BasicBlock *entry, BasicBlock *exit) { + assert(entry && exit && "entry and exit must not be null!"); + + if (isTrivialRegion(entry, exit)) + return 0; + + Region *region = new Region(entry, exit, this, DT); + BBtoRegion.insert(std::make_pair(entry, region)); + + #ifdef XDEBUG + region->verifyRegion(); + #else + DEBUG(region->verifyRegion()); + #endif + + updateStatistics(region); + return region; +} + +void RegionInfo::findRegionsWithEntry(BasicBlock *entry, BBtoBBMap *ShortCut) { + assert(entry); + + DomTreeNode *N = PDT->getNode(entry); + + if (!N) + return; + + Region *lastRegion= 0; + BasicBlock *lastExit = entry; + + // As only a BasicBlock that postdominates entry can finish a region, walk the + // post dominance tree upwards. + while ((N = getNextPostDom(N, ShortCut))) { + BasicBlock *exit = N->getBlock(); + + if (!exit) + break; + + if (isRegion(entry, exit)) { + Region *newRegion = createRegion(entry, exit); + + if (lastRegion) + newRegion->addSubRegion(lastRegion); + + lastRegion = newRegion; + lastExit = exit; + } + + // This can never be a region, so stop the search. + if (!DT->dominates(entry, exit)) + break; + } + + // Tried to create regions from entry to lastExit. Next time take a + // shortcut from entry to lastExit. 
+ if (lastExit != entry) + insertShortCut(entry, lastExit, ShortCut); +} + +void RegionInfo::scanForRegions(Function &F, BBtoBBMap *ShortCut) { + BasicBlock *entry = &(F.getEntryBlock()); + DomTreeNode *N = DT->getNode(entry); + + // Iterate over the dominance tree in post order to start with the small + // regions from the bottom of the dominance tree. If the small regions are + // detected first, detection of bigger regions is faster, as we can jump + // over the small regions. + for (po_iterator<DomTreeNode*> FI = po_begin(N), FE = po_end(N); FI != FE; + ++FI) { + findRegionsWithEntry(FI->getBlock(), ShortCut); + } +} + +Region *RegionInfo::getTopMostParent(Region *region) { + while (region->parent) + region = region->getParent(); + + return region; +} + +void RegionInfo::buildRegionsTree(DomTreeNode *N, Region *region) { + BasicBlock *BB = N->getBlock(); + + // Passed region exit + while (BB == region->getExit()) + region = region->getParent(); + + BBtoRegionMap::iterator it = BBtoRegion.find(BB); + + // This basic block is a start block of a region. It is already in the + // BBtoRegion relation. Only the child basic blocks have to be updated. + if (it != BBtoRegion.end()) { + Region *newRegion = it->second; + region->addSubRegion(getTopMostParent(newRegion)); + region = newRegion; + } else { + BBtoRegion[BB] = region; + } + + for (DomTreeNode::iterator CI = N->begin(), CE = N->end(); CI != CE; ++CI) + buildRegionsTree(*CI, region); +} + +void RegionInfo::releaseMemory() { + BBtoRegion.clear(); + if (TopLevelRegion) + delete TopLevelRegion; + TopLevelRegion = 0; +} + +RegionInfo::RegionInfo() : FunctionPass(ID) { + TopLevelRegion = 0; +} + +RegionInfo::~RegionInfo() { + releaseMemory(); +} + +void RegionInfo::Calculate(Function &F) { + // ShortCut is a map in which, for every BB, the exit of the largest region + // starting with BB is stored. These regions can be treated as single BBs. + // This improves performance on linear CFGs. + BBtoBBMap ShortCut; + + scanForRegions(F, &ShortCut); + BasicBlock *BB = &F.getEntryBlock(); + buildRegionsTree(DT->getNode(BB), TopLevelRegion); +} + +bool RegionInfo::runOnFunction(Function &F) { + releaseMemory(); + + DT = &getAnalysis<DominatorTree>(); + PDT = &getAnalysis<PostDominatorTree>(); + DF = &getAnalysis<DominanceFrontier>(); + + TopLevelRegion = new Region(&F.getEntryBlock(), 0, this, DT, 0); + updateStatistics(TopLevelRegion); + + Calculate(F); + + return false; +} + +void RegionInfo::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequiredTransitive<DominatorTree>(); + AU.addRequired<PostDominatorTree>(); + AU.addRequired<DominanceFrontier>(); +} + +void RegionInfo::print(raw_ostream &OS, const Module *) const { + OS << "Region tree:\n"; + TopLevelRegion->print(OS, true, 0); + OS << "End region tree\n"; +} + +void RegionInfo::verifyAnalysis() const { + // Only do verification when user wants to, otherwise this expensive check + // will be invoked by PMDataManager::verifyPreservedAnalysis when + // a regionpass (marked PreservedAll) finishes. + if (!VerifyRegionInfo) return; + + TopLevelRegion->verifyRegionNest(); +} + +// Region pass manager support. +Region *RegionInfo::getRegionFor(BasicBlock *BB) const { + BBtoRegionMap::const_iterator I = + BBtoRegion.find(BB); + return I != BBtoRegion.end() ? 
I->second : 0; +} + +Region *RegionInfo::operator[](BasicBlock *BB) const { + return getRegionFor(BB); +} + + +BasicBlock *RegionInfo::getMaxRegionExit(BasicBlock *BB) const { + BasicBlock *Exit = NULL; + + while (true) { + // Get largest region that starts at BB. + Region *R = getRegionFor(BB); + while (R && R->getParent() && R->getParent()->getEntry() == BB) + R = R->getParent(); + + // Get the single exit of BB. + if (R && R->getEntry() == BB) + Exit = R->getExit(); + else if (++succ_begin(BB) == succ_end(BB)) + Exit = *succ_begin(BB); + else // No single exit exists. + return Exit; + + // Get largest region that starts at Exit. + Region *ExitR = getRegionFor(Exit); + while (ExitR && ExitR->getParent() + && ExitR->getParent()->getEntry() == Exit) + ExitR = ExitR->getParent(); + + for (pred_iterator PI = pred_begin(Exit), PE = pred_end(Exit); PI != PE; + ++PI) + if (!R->contains(*PI) && !ExitR->contains(*PI)) + break; + + // This stops infinite cycles. + if (DT->dominates(Exit, BB)) + break; + + BB = Exit; + } + + return Exit; +} + +Region* +RegionInfo::getCommonRegion(Region *A, Region *B) const { + assert(A && B && "One of the Regions is NULL"); + + if (A->contains(B)) return A; + + while (!B->contains(A)) + B = B->getParent(); + + return B; +} + +Region* +RegionInfo::getCommonRegion(SmallVectorImpl<Region*> &Regions) const { + Region* ret = Regions.back(); + Regions.pop_back(); + + for (SmallVectorImpl<Region*>::const_iterator I = Regions.begin(), + E = Regions.end(); I != E; ++I) + ret = getCommonRegion(ret, *I); + + return ret; +} + +Region* +RegionInfo::getCommonRegion(SmallVectorImpl<BasicBlock*> &BBs) const { + Region* ret = getRegionFor(BBs.back()); + BBs.pop_back(); + + for (SmallVectorImpl<BasicBlock*>::const_iterator I = BBs.begin(), + E = BBs.end(); I != E; ++I) + ret = getCommonRegion(ret, getRegionFor(*I)); + + return ret; +} + +char RegionInfo::ID = 0; +INITIALIZE_PASS(RegionInfo, "regions", + "Detect single entry single exit regions", true, true); + +// Create methods available outside of this file, to be used in +// "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by +// the link time optimization. + +namespace llvm { + FunctionPass *createRegionInfoPass() { + return new RegionInfo(); + } +} + diff --git a/lib/Analysis/RegionPrinter.cpp b/lib/Analysis/RegionPrinter.cpp new file mode 100644 index 0000000000000..fee5c1bae9764 --- /dev/null +++ b/lib/Analysis/RegionPrinter.cpp @@ -0,0 +1,186 @@ +//===- RegionPrinter.cpp - Print regions tree pass ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Print out the region tree of a function using dotty/graphviz.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/RegionPrinter.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/DOTGraphTraitsPass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+/// onlySimpleRegions - Show only the simple regions in the RegionViewer.
+static cl::opt<bool>
+onlySimpleRegions("only-simple-regions",
+                  cl::desc("Show only simple regions in the graphviz viewer"),
+                  cl::Hidden,
+                  cl::init(false));
+
+namespace llvm {
+template<>
+struct DOTGraphTraits<RegionNode*> : public DefaultDOTGraphTraits {
+
+  DOTGraphTraits (bool isSimple = false)
+    : DefaultDOTGraphTraits(isSimple) {}
+
+  std::string getNodeLabel(RegionNode *Node, RegionNode *Graph) {
+
+    if (!Node->isSubRegion()) {
+      BasicBlock *BB = Node->getNodeAs<BasicBlock>();
+
+      if (isSimple())
+        return DOTGraphTraits<const Function*>
+          ::getSimpleNodeLabel(BB, BB->getParent());
+      else
+        return DOTGraphTraits<const Function*>
+          ::getCompleteNodeLabel(BB, BB->getParent());
+    }
+
+    return "Not implemented";
+  }
+};
+
+template<>
+struct DOTGraphTraits<RegionInfo*> : public DOTGraphTraits<RegionNode*> {
+
+  DOTGraphTraits (bool isSimple = false)
+    : DOTGraphTraits<RegionNode*>(isSimple) {}
+
+  static std::string getGraphName(RegionInfo *DT) {
+    return "Region Graph";
+  }
+
+  std::string getNodeLabel(RegionNode *Node, RegionInfo *G) {
+    return DOTGraphTraits<RegionNode*>::getNodeLabel(Node,
+                                                     G->getTopLevelRegion());
+  }
+
+  // Print the cluster of the subregions. This groups the single basic blocks
+  // and adds a different background color for each group.
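+  // (Each nesting depth picks a different entry of the 12-color "paired12"
+  // scheme set up in addCustomGraphFeatures below, so regions at the same
+  // depth share a background color.)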
+ static void printRegionCluster(const Region *R, GraphWriter<RegionInfo*> &GW, + unsigned depth = 0) { + raw_ostream &O = GW.getOStream(); + O.indent(2 * depth) << "subgraph cluster_" << static_cast<const void*>(R) + << " {\n"; + O.indent(2 * (depth + 1)) << "label = \"\";\n"; + + if (!onlySimpleRegions || R->isSimple()) { + O.indent(2 * (depth + 1)) << "style = filled;\n"; + O.indent(2 * (depth + 1)) << "color = " + << ((R->getDepth() * 2 % 12) + 1) << "\n"; + + } else { + O.indent(2 * (depth + 1)) << "style = solid;\n"; + O.indent(2 * (depth + 1)) << "color = " + << ((R->getDepth() * 2 % 12) + 2) << "\n"; + } + + for (Region::const_iterator RI = R->begin(), RE = R->end(); RI != RE; ++RI) + printRegionCluster(*RI, GW, depth + 1); + + RegionInfo *RI = R->getRegionInfo(); + + for (Region::const_block_iterator BI = R->block_begin(), + BE = R->block_end(); BI != BE; ++BI) { + BasicBlock *BB = (*BI)->getNodeAs<BasicBlock>(); + if (RI->getRegionFor(BB) == R) + O.indent(2 * (depth + 1)) << "Node" + << static_cast<const void*>(RI->getTopLevelRegion()->getBBNode(BB)) + << ";\n"; + } + + O.indent(2 * depth) << "}\n"; + } + + static void addCustomGraphFeatures(const RegionInfo* RI, + GraphWriter<RegionInfo*> &GW) { + raw_ostream &O = GW.getOStream(); + O << "\tcolorscheme = \"paired12\"\n"; + printRegionCluster(RI->getTopLevelRegion(), GW, 4); + } +}; +} //end namespace llvm + +namespace { + +struct RegionViewer + : public DOTGraphTraitsViewer<RegionInfo, false> { + static char ID; + RegionViewer() : DOTGraphTraitsViewer<RegionInfo, false>("reg", ID){} +}; + +char RegionViewer::ID = 0; +INITIALIZE_PASS(RegionViewer, "view-regions", "View regions of function", + true, true); + +struct RegionOnlyViewer + : public DOTGraphTraitsViewer<RegionInfo, true> { + static char ID; + RegionOnlyViewer() : DOTGraphTraitsViewer<RegionInfo, true>("regonly", ID){} +}; + +char RegionOnlyViewer::ID = 0; +INITIALIZE_PASS(RegionOnlyViewer, "view-regions-only", + "View regions of function (with no function bodies)", + true, true); + +struct RegionPrinter + : public DOTGraphTraitsPrinter<RegionInfo, false> { + static char ID; + RegionPrinter() : + DOTGraphTraitsPrinter<RegionInfo, false>("reg", ID) {} +}; +} //end anonymous namespace + +char RegionPrinter::ID = 0; +INITIALIZE_PASS(RegionPrinter, "dot-regions", + "Print regions of function to 'dot' file", true, true); + +namespace { + +struct RegionOnlyPrinter + : public DOTGraphTraitsPrinter<RegionInfo, true> { + static char ID; + RegionOnlyPrinter() : + DOTGraphTraitsPrinter<RegionInfo, true>("reg", ID) {} +}; + +} + +char RegionOnlyPrinter::ID = 0; +INITIALIZE_PASS(RegionOnlyPrinter, "dot-regions-only", + "Print regions of function to 'dot' file " + "(with no function bodies)", + true, true); + +FunctionPass* llvm::createRegionViewerPass() { + return new RegionViewer(); +} + +FunctionPass* llvm::createRegionOnlyViewerPass() { + return new RegionOnlyViewer(); +} + +FunctionPass* llvm::createRegionPrinterPass() { + return new RegionPrinter(); +} + +FunctionPass* llvm::createRegionOnlyPrinterPass() { + return new RegionOnlyPrinter(); +} + diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 413b3b47f92a4..b892d85f9f4a2 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -103,8 +103,8 @@ MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden, "derived loop"), cl::init(100)); -static RegisterPass<ScalarEvolution> -R("scalar-evolution", "Scalar Evolution Analysis", false, true); 
+INITIALIZE_PASS(ScalarEvolution, "scalar-evolution",
+                "Scalar Evolution Analysis", false, true);
 char ScalarEvolution::ID = 0;
 
 //===----------------------------------------------------------------------===//
@@ -251,28 +251,59 @@ void SCEVCommutativeExpr::print(raw_ostream &OS) const {
   OS << "(";
   for (op_iterator I = op_begin(), E = op_end(); I != E; ++I) {
     OS << **I;
-    if (next(I) != E)
+    if (llvm::next(I) != E)
       OS << OpStr;
   }
   OS << ")";
 }
 
 bool SCEVNAryExpr::dominates(BasicBlock *BB, DominatorTree *DT) const {
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-    if (!getOperand(i)->dominates(BB, DT))
+  for (op_iterator I = op_begin(), E = op_end(); I != E; ++I)
+    if (!(*I)->dominates(BB, DT))
       return false;
-  }
   return true;
 }
 
 bool SCEVNAryExpr::properlyDominates(BasicBlock *BB, DominatorTree *DT) const {
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-    if (!getOperand(i)->properlyDominates(BB, DT))
+  for (op_iterator I = op_begin(), E = op_end(); I != E; ++I)
+    if (!(*I)->properlyDominates(BB, DT))
       return false;
-  }
   return true;
 }
 
+bool SCEVNAryExpr::isLoopInvariant(const Loop *L) const {
+  for (op_iterator I = op_begin(), E = op_end(); I != E; ++I)
+    if (!(*I)->isLoopInvariant(L))
+      return false;
+  return true;
+}
+
+// hasComputableLoopEvolution - N-ary expressions have computable loop
+// evolutions iff they have at least one operand that varies with the loop,
+// and all varying operands are themselves computable.
+bool SCEVNAryExpr::hasComputableLoopEvolution(const Loop *L) const {
+  bool HasVarying = false;
+  for (op_iterator I = op_begin(), E = op_end(); I != E; ++I) {
+    const SCEV *S = *I;
+    if (!S->isLoopInvariant(L)) {
+      if (S->hasComputableLoopEvolution(L))
+        HasVarying = true;
+      else
+        return false;
+    }
+  }
+  return HasVarying;
+}
+
+bool SCEVNAryExpr::hasOperand(const SCEV *O) const {
+  for (op_iterator I = op_begin(), E = op_end(); I != E; ++I) {
+    const SCEV *S = *I;
+    if (O == S || S->hasOperand(O))
+      return true;
+  }
+  return false;
+}
+
 bool SCEVUDivExpr::dominates(BasicBlock *BB, DominatorTree *DT) const {
   return LHS->dominates(BB, DT) && RHS->dominates(BB, DT);
 }
@@ -303,10 +334,14 @@ bool SCEVAddRecExpr::isLoopInvariant(const Loop *QueryLoop) const {
   if (QueryLoop->contains(L))
     return false;
 
+  // This recurrence is invariant w.r.t. QueryLoop if L contains QueryLoop.
+  if (L->contains(QueryLoop))
+    return true;
+
   // This recurrence is variant w.r.t. QueryLoop if any of its operands
   // are variant.
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
-    if (!getOperand(i)->isLoopInvariant(QueryLoop))
+  for (op_iterator I = op_begin(), E = op_end(); I != E; ++I)
+    if (!(*I)->isLoopInvariant(QueryLoop))
       return false;
 
   // Otherwise it's loop-invariant.
@@ -337,12 +372,36 @@ void SCEVAddRecExpr::print(raw_ostream &OS) const {
   OS << ">";
 }
 
+void SCEVUnknown::deleted() {
+  // Clear this SCEVUnknown from ValuesAtScopes.
+  SE->ValuesAtScopes.erase(this);
+
+  // Remove this SCEVUnknown from the uniquing map.
+  SE->UniqueSCEVs.RemoveNode(this);
+
+  // Release the value.
+  setValPtr(0);
+}
+
+void SCEVUnknown::allUsesReplacedWith(Value *New) {
+  // Clear this SCEVUnknown from ValuesAtScopes.
+  SE->ValuesAtScopes.erase(this);
+
+  // Remove this SCEVUnknown from the uniquing map.
+  SE->UniqueSCEVs.RemoveNode(this);
+
+  // Update this SCEVUnknown to point to the new value. This is needed
+  // because there may still be outstanding SCEVs that point to
+  // this SCEVUnknown.
+ setValPtr(New); +} + bool SCEVUnknown::isLoopInvariant(const Loop *L) const { // All non-instruction values are loop invariant. All instructions are loop // invariant if they are not contained in the specified loop. // Instructions are never considered invariant in the function body // (null loop) because they are defined within the "loop". - if (Instruction *I = dyn_cast<Instruction>(V)) + if (Instruction *I = dyn_cast<Instruction>(getValue())) return L && !L->contains(I); return true; } @@ -360,11 +419,11 @@ bool SCEVUnknown::properlyDominates(BasicBlock *BB, DominatorTree *DT) const { } const Type *SCEVUnknown::getType() const { - return V->getType(); + return getValue()->getType(); } bool SCEVUnknown::isSizeOf(const Type *&AllocTy) const { - if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(V)) + if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue())) if (VCE->getOpcode() == Instruction::PtrToInt) if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0))) if (CE->getOpcode() == Instruction::GetElementPtr && @@ -381,7 +440,7 @@ bool SCEVUnknown::isSizeOf(const Type *&AllocTy) const { } bool SCEVUnknown::isAlignOf(const Type *&AllocTy) const { - if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(V)) + if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue())) if (VCE->getOpcode() == Instruction::PtrToInt) if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0))) if (CE->getOpcode() == Instruction::GetElementPtr && @@ -406,7 +465,7 @@ bool SCEVUnknown::isAlignOf(const Type *&AllocTy) const { } bool SCEVUnknown::isOffsetOf(const Type *&CTy, Constant *&FieldNo) const { - if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(V)) + if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue())) if (VCE->getOpcode() == Instruction::PtrToInt) if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0))) if (CE->getOpcode() == Instruction::GetElementPtr && @@ -448,166 +507,183 @@ void SCEVUnknown::print(raw_ostream &OS) const { } // Otherwise just print it normally. 
-  WriteAsOperand(OS, V, false);
+  WriteAsOperand(OS, getValue(), false);
 }
 
 //===----------------------------------------------------------------------===//
 //                           SCEV Utilities
 //===----------------------------------------------------------------------===//
 
-static bool CompareTypes(const Type *A, const Type *B) {
-  if (A->getTypeID() != B->getTypeID())
-    return A->getTypeID() < B->getTypeID();
-  if (const IntegerType *AI = dyn_cast<IntegerType>(A)) {
-    const IntegerType *BI = cast<IntegerType>(B);
-    return AI->getBitWidth() < BI->getBitWidth();
-  }
-  if (const PointerType *AI = dyn_cast<PointerType>(A)) {
-    const PointerType *BI = cast<PointerType>(B);
-    return CompareTypes(AI->getElementType(), BI->getElementType());
-  }
-  if (const ArrayType *AI = dyn_cast<ArrayType>(A)) {
-    const ArrayType *BI = cast<ArrayType>(B);
-    if (AI->getNumElements() != BI->getNumElements())
-      return AI->getNumElements() < BI->getNumElements();
-    return CompareTypes(AI->getElementType(), BI->getElementType());
-  }
-  if (const VectorType *AI = dyn_cast<VectorType>(A)) {
-    const VectorType *BI = cast<VectorType>(B);
-    if (AI->getNumElements() != BI->getNumElements())
-      return AI->getNumElements() < BI->getNumElements();
-    return CompareTypes(AI->getElementType(), BI->getElementType());
-  }
-  if (const StructType *AI = dyn_cast<StructType>(A)) {
-    const StructType *BI = cast<StructType>(B);
-    if (AI->getNumElements() != BI->getNumElements())
-      return AI->getNumElements() < BI->getNumElements();
-    for (unsigned i = 0, e = AI->getNumElements(); i != e; ++i)
-      if (CompareTypes(AI->getElementType(i), BI->getElementType(i)) ||
-          CompareTypes(BI->getElementType(i), AI->getElementType(i)))
-        return CompareTypes(AI->getElementType(i), BI->getElementType(i));
-  }
-  return false;
-}
-
 namespace {
   /// SCEVComplexityCompare - Return true if the complexity of the LHS is less
   /// than the complexity of the RHS.  This comparator is used to canonicalize
   /// expressions.
   class SCEVComplexityCompare {
-    LoopInfo *LI;
+    const LoopInfo *const LI;
   public:
-    explicit SCEVComplexityCompare(LoopInfo *li) : LI(li) {}
+    explicit SCEVComplexityCompare(const LoopInfo *li) : LI(li) {}
 
+    // Return true if LHS is less than RHS, false otherwise.
     bool operator()(const SCEV *LHS, const SCEV *RHS) const {
+      return compare(LHS, RHS) < 0;
+    }
+
+    // Return negative, zero, or positive, if LHS is less than, equal to, or
+    // greater than RHS, respectively. A three-way result allows recursive
+    // comparisons to be more efficient.
+    int compare(const SCEV *LHS, const SCEV *RHS) const {
      // Fast-path: SCEVs are uniqued so we can do a quick equality check.
      if (LHS == RHS)
-        return false;
+        return 0;
 
      // Primarily, sort the SCEVs by their getSCEVType().
-      if (LHS->getSCEVType() != RHS->getSCEVType())
-        return LHS->getSCEVType() < RHS->getSCEVType();
+      unsigned LType = LHS->getSCEVType(), RType = RHS->getSCEVType();
+      if (LType != RType)
+        return (int)LType - (int)RType;
 
      // Aside from the getSCEVType() ordering, the particular ordering
      // isn't very important except that it's beneficial to be consistent,
      // so that (a + b) and (b + a) don't end up as different expressions.
-
-      // Sort SCEVUnknown values with some loose heuristics. TODO: This is
-      // not as complete as it could be.
-      if (const SCEVUnknown *LU = dyn_cast<SCEVUnknown>(LHS)) {
+      switch (LType) {
+      case scUnknown: {
+        const SCEVUnknown *LU = cast<SCEVUnknown>(LHS);
        const SCEVUnknown *RU = cast<SCEVUnknown>(RHS);
 
+        // Sort SCEVUnknown values with some loose heuristics.
TODO: This is + // not as complete as it could be. + const Value *LV = LU->getValue(), *RV = RU->getValue(); + // Order pointer values after integer values. This helps SCEVExpander // form GEPs. - if (LU->getType()->isPointerTy() && !RU->getType()->isPointerTy()) - return false; - if (RU->getType()->isPointerTy() && !LU->getType()->isPointerTy()) - return true; + bool LIsPointer = LV->getType()->isPointerTy(), + RIsPointer = RV->getType()->isPointerTy(); + if (LIsPointer != RIsPointer) + return (int)LIsPointer - (int)RIsPointer; // Compare getValueID values. - if (LU->getValue()->getValueID() != RU->getValue()->getValueID()) - return LU->getValue()->getValueID() < RU->getValue()->getValueID(); + unsigned LID = LV->getValueID(), + RID = RV->getValueID(); + if (LID != RID) + return (int)LID - (int)RID; // Sort arguments by their position. - if (const Argument *LA = dyn_cast<Argument>(LU->getValue())) { - const Argument *RA = cast<Argument>(RU->getValue()); - return LA->getArgNo() < RA->getArgNo(); + if (const Argument *LA = dyn_cast<Argument>(LV)) { + const Argument *RA = cast<Argument>(RV); + unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo(); + return (int)LArgNo - (int)RArgNo; } - // For instructions, compare their loop depth, and their opcode. - // This is pretty loose. - if (Instruction *LV = dyn_cast<Instruction>(LU->getValue())) { - Instruction *RV = cast<Instruction>(RU->getValue()); + // For instructions, compare their loop depth, and their operand + // count. This is pretty loose. + if (const Instruction *LInst = dyn_cast<Instruction>(LV)) { + const Instruction *RInst = cast<Instruction>(RV); // Compare loop depths. - if (LI->getLoopDepth(LV->getParent()) != - LI->getLoopDepth(RV->getParent())) - return LI->getLoopDepth(LV->getParent()) < - LI->getLoopDepth(RV->getParent()); - - // Compare opcodes. - if (LV->getOpcode() != RV->getOpcode()) - return LV->getOpcode() < RV->getOpcode(); + const BasicBlock *LParent = LInst->getParent(), + *RParent = RInst->getParent(); + if (LParent != RParent) { + unsigned LDepth = LI->getLoopDepth(LParent), + RDepth = LI->getLoopDepth(RParent); + if (LDepth != RDepth) + return (int)LDepth - (int)RDepth; + } // Compare the number of operands. - if (LV->getNumOperands() != RV->getNumOperands()) - return LV->getNumOperands() < RV->getNumOperands(); + unsigned LNumOps = LInst->getNumOperands(), + RNumOps = RInst->getNumOperands(); + return (int)LNumOps - (int)RNumOps; } - return false; + return 0; } - // Compare constant values. - if (const SCEVConstant *LC = dyn_cast<SCEVConstant>(LHS)) { + case scConstant: { + const SCEVConstant *LC = cast<SCEVConstant>(LHS); const SCEVConstant *RC = cast<SCEVConstant>(RHS); - if (LC->getValue()->getBitWidth() != RC->getValue()->getBitWidth()) - return LC->getValue()->getBitWidth() < RC->getValue()->getBitWidth(); - return LC->getValue()->getValue().ult(RC->getValue()->getValue()); + + // Compare constant values. + const APInt &LA = LC->getValue()->getValue(); + const APInt &RA = RC->getValue()->getValue(); + unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth(); + if (LBitWidth != RBitWidth) + return (int)LBitWidth - (int)RBitWidth; + return LA.ult(RA) ? -1 : 1; } - // Compare addrec loop depths. 
- if (const SCEVAddRecExpr *LA = dyn_cast<SCEVAddRecExpr>(LHS)) { + case scAddRecExpr: { + const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS); const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS); - if (LA->getLoop()->getLoopDepth() != RA->getLoop()->getLoopDepth()) - return LA->getLoop()->getLoopDepth() < RA->getLoop()->getLoopDepth(); + + // Compare addrec loop depths. + const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); + if (LLoop != RLoop) { + unsigned LDepth = LLoop->getLoopDepth(), + RDepth = RLoop->getLoopDepth(); + if (LDepth != RDepth) + return (int)LDepth - (int)RDepth; + } + + // Addrec complexity grows with operand count. + unsigned LNumOps = LA->getNumOperands(), RNumOps = RA->getNumOperands(); + if (LNumOps != RNumOps) + return (int)LNumOps - (int)RNumOps; + + // Lexicographically compare. + for (unsigned i = 0; i != LNumOps; ++i) { + long X = compare(LA->getOperand(i), RA->getOperand(i)); + if (X != 0) + return X; + } + + return 0; } - // Lexicographically compare n-ary expressions. - if (const SCEVNAryExpr *LC = dyn_cast<SCEVNAryExpr>(LHS)) { + case scAddExpr: + case scMulExpr: + case scSMaxExpr: + case scUMaxExpr: { + const SCEVNAryExpr *LC = cast<SCEVNAryExpr>(LHS); const SCEVNAryExpr *RC = cast<SCEVNAryExpr>(RHS); - for (unsigned i = 0, e = LC->getNumOperands(); i != e; ++i) { - if (i >= RC->getNumOperands()) - return false; - if (operator()(LC->getOperand(i), RC->getOperand(i))) - return true; - if (operator()(RC->getOperand(i), LC->getOperand(i))) - return false; + + // Lexicographically compare n-ary expressions. + unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands(); + for (unsigned i = 0; i != LNumOps; ++i) { + if (i >= RNumOps) + return 1; + long X = compare(LC->getOperand(i), RC->getOperand(i)); + if (X != 0) + return X; } - return LC->getNumOperands() < RC->getNumOperands(); + return (int)LNumOps - (int)RNumOps; } - // Lexicographically compare udiv expressions. - if (const SCEVUDivExpr *LC = dyn_cast<SCEVUDivExpr>(LHS)) { + case scUDivExpr: { + const SCEVUDivExpr *LC = cast<SCEVUDivExpr>(LHS); const SCEVUDivExpr *RC = cast<SCEVUDivExpr>(RHS); - if (operator()(LC->getLHS(), RC->getLHS())) - return true; - if (operator()(RC->getLHS(), LC->getLHS())) - return false; - if (operator()(LC->getRHS(), RC->getRHS())) - return true; - if (operator()(RC->getRHS(), LC->getRHS())) - return false; - return false; + + // Lexicographically compare udiv expressions. + long X = compare(LC->getLHS(), RC->getLHS()); + if (X != 0) + return X; + return compare(LC->getRHS(), RC->getRHS()); } - // Compare cast expressions by operand. - if (const SCEVCastExpr *LC = dyn_cast<SCEVCastExpr>(LHS)) { + case scTruncate: + case scZeroExtend: + case scSignExtend: { + const SCEVCastExpr *LC = cast<SCEVCastExpr>(LHS); const SCEVCastExpr *RC = cast<SCEVCastExpr>(RHS); - return operator()(LC->getOperand(), RC->getOperand()); + + // Compare cast expressions by operand. + return compare(LC->getOperand(), RC->getOperand()); + } + + default: + break; } llvm_unreachable("Unknown SCEV kind!"); - return false; + return 0; } }; } @@ -628,8 +704,9 @@ static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops, if (Ops.size() == 2) { // This is the common case, which also happens to be trivially simple. // Special case it. 
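    // For example (illustrative): with Ops = {%x, 2}, the constant sorts
    // first, since scConstant precedes scUnknown in the SCEVTypes ordering,
    // so the pair becomes {2, %x}.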
-    if (SCEVComplexityCompare(LI)(Ops[1], Ops[0]))
-      std::swap(Ops[0], Ops[1]);
+    const SCEV *&LHS = Ops[0], *&RHS = Ops[1];
+    if (SCEVComplexityCompare(LI)(RHS, LHS))
+      std::swap(LHS, RHS);
     return;
   }
 
@@ -845,6 +922,13 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op,
       return getAddRecExpr(Operands, AddRec->getLoop());
   }
 
+  // As a special case, fold trunc(undef) to undef. We don't want to
+  // know too much about SCEVUnknowns, but this special case is handy
+  // and harmless.
+  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Op))
+    if (isa<UndefValue>(U->getValue()))
+      return getSCEV(UndefValue::get(Ty));
+
   // The cast wasn't folded; create an explicit cast node. We can reuse
   // the existing insert position since if we get here, we won't have
   // made any changes which would invalidate it.
@@ -1163,6 +1247,13 @@ const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op,
     return getAddRecExpr(Ops, AR->getLoop());
   }
 
+  // As a special case, fold anyext(undef) to undef. We don't want to
+  // know too much about SCEVUnknowns, but this special case is handy
+  // and harmless.
+  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Op))
+    if (isa<UndefValue>(U->getValue()))
+      return getSCEV(UndefValue::get(Ty));
+
   // If the expression is obviously signed, use the sext cast value.
   if (isa<SCEVSMaxExpr>(Op))
     return SExt;
@@ -1287,8 +1378,9 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
   // If HasNSW is true and all the operands are non-negative, infer HasNUW.
   if (!HasNUW && HasNSW) {
     bool All = true;
-    for (unsigned i = 0, e = Ops.size(); i != e; ++i)
-      if (!isKnownNonNegative(Ops[i])) {
+    for (SmallVectorImpl<const SCEV *>::const_iterator I = Ops.begin(),
+         E = Ops.end(); I != E; ++I)
+      if (!isKnownNonNegative(*I)) {
        All = false;
        break;
      }
@@ -1321,22 +1413,29 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
     if (Ops.size() == 1) return Ops[0];
   }
 
-  // Okay, check to see if the same value occurs in the operand list twice.  If
-  // so, merge them together into a multiply expression.  Since we sorted the
-  // list, these values are required to be adjacent.
+  // Okay, check to see if the same value occurs in the operand list more than
+  // once.  If so, merge them together into a multiply expression.  Since we
+  // sorted the list, these values are required to be adjacent.
   const Type *Ty = Ops[0]->getType();
-  for (unsigned i = 0, e = Ops.size()-1; i != e; ++i)
+  bool FoundMatch = false;
+  for (unsigned i = 0, e = Ops.size(); i != e-1; ++i)
     if (Ops[i] == Ops[i+1]) {  // X + Y + Y  -->  X + Y*2
-      // Found a match, merge the two values into a multiply, and add any
-      // remaining values to the result.
-      const SCEV *Two = getConstant(Ty, 2);
-      const SCEV *Mul = getMulExpr(Ops[i], Two);
-      if (Ops.size() == 2)
+      // Scan ahead to count how many equal operands there are.
+      unsigned Count = 2;
+      while (i+Count != e && Ops[i+Count] == Ops[i])
+        ++Count;
+      // Merge the values into a multiply.
+      const SCEV *Scale = getConstant(Ty, Count);
+      const SCEV *Mul = getMulExpr(Scale, Ops[i]);
+      if (Ops.size() == Count)
        return Mul;
-      Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
-      Ops.push_back(Mul);
-      return getAddExpr(Ops, HasNUW, HasNSW);
+      Ops[i] = Mul;
+      Ops.erase(Ops.begin()+i+1, Ops.begin()+i+Count);
+      --i; e -= Count - 1;
+      FoundMatch = true;
    }
+  if (FoundMatch)
+    return getAddExpr(Ops, HasNUW, HasNSW);
 
  // Check for truncates.
If all the operands are truncated from the same // type, see if factoring out the truncate would permit the result to be @@ -1433,7 +1532,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, // re-generate the operands list. Group the operands by constant scale, // to avoid multiplying by the same constant scale multiple times. std::map<APInt, SmallVector<const SCEV *, 4>, APIntCompare> MulOpLists; - for (SmallVector<const SCEV *, 8>::iterator I = NewOps.begin(), + for (SmallVector<const SCEV *, 8>::const_iterator I = NewOps.begin(), E = NewOps.end(); I != E; ++I) MulOpLists[M.find(*I)->second].push_back(*I); // Re-generate the operands list. @@ -1460,20 +1559,23 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, const SCEVMulExpr *Mul = cast<SCEVMulExpr>(Ops[Idx]); for (unsigned MulOp = 0, e = Mul->getNumOperands(); MulOp != e; ++MulOp) { const SCEV *MulOpSCEV = Mul->getOperand(MulOp); + if (isa<SCEVConstant>(MulOpSCEV)) + continue; for (unsigned AddOp = 0, e = Ops.size(); AddOp != e; ++AddOp) - if (MulOpSCEV == Ops[AddOp] && !isa<SCEVConstant>(Ops[AddOp])) { + if (MulOpSCEV == Ops[AddOp]) { // Fold W + X + (X * Y * Z) --> W + (X * ((Y*Z)+1)) const SCEV *InnerMul = Mul->getOperand(MulOp == 0); if (Mul->getNumOperands() != 2) { // If the multiply has more than two operands, we must get the // Y*Z term. - SmallVector<const SCEV *, 4> MulOps(Mul->op_begin(), Mul->op_end()); - MulOps.erase(MulOps.begin()+MulOp); + SmallVector<const SCEV *, 4> MulOps(Mul->op_begin(), + Mul->op_begin()+MulOp); + MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end()); InnerMul = getMulExpr(MulOps); } const SCEV *One = getConstant(Ty, 1); - const SCEV *AddOne = getAddExpr(InnerMul, One); - const SCEV *OuterMul = getMulExpr(AddOne, Ops[AddOp]); + const SCEV *AddOne = getAddExpr(One, InnerMul); + const SCEV *OuterMul = getMulExpr(AddOne, MulOpSCEV); if (Ops.size() == 2) return OuterMul; if (AddOp < Idx) { Ops.erase(Ops.begin()+AddOp); @@ -1500,15 +1602,15 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, const SCEV *InnerMul1 = Mul->getOperand(MulOp == 0); if (Mul->getNumOperands() != 2) { SmallVector<const SCEV *, 4> MulOps(Mul->op_begin(), - Mul->op_end()); - MulOps.erase(MulOps.begin()+MulOp); + Mul->op_begin()+MulOp); + MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end()); InnerMul1 = getMulExpr(MulOps); } const SCEV *InnerMul2 = OtherMul->getOperand(OMulOp == 0); if (OtherMul->getNumOperands() != 2) { SmallVector<const SCEV *, 4> MulOps(OtherMul->op_begin(), - OtherMul->op_end()); - MulOps.erase(MulOps.begin()+OMulOp); + OtherMul->op_begin()+OMulOp); + MulOps.append(OtherMul->op_begin()+OMulOp+1, OtherMul->op_end()); InnerMul2 = getMulExpr(MulOps); } const SCEV *InnerMulSum = getAddExpr(InnerMul1,InnerMul2); @@ -1574,30 +1676,31 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, // there are multiple AddRec's with the same loop induction variable being // added together. If so, we can fold them. 
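  // For example (illustrative): {1,+,2}<L> + {10,+,3}<L> folds operand by
  // operand to {11,+,5}<L>.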
for (unsigned OtherIdx = Idx+1; - OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);++OtherIdx) - if (OtherIdx != Idx) { - const SCEVAddRecExpr *OtherAddRec = cast<SCEVAddRecExpr>(Ops[OtherIdx]); - if (AddRecLoop == OtherAddRec->getLoop()) { - // Other + {A,+,B} + {C,+,D} --> Other + {A+C,+,B+D} - SmallVector<const SCEV *, 4> NewOps(AddRec->op_begin(), - AddRec->op_end()); - for (unsigned i = 0, e = OtherAddRec->getNumOperands(); i != e; ++i) { - if (i >= NewOps.size()) { - NewOps.append(OtherAddRec->op_begin()+i, - OtherAddRec->op_end()); - break; + OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); + ++OtherIdx) + if (AddRecLoop == cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) { + // Other + {A,+,B}<L> + {C,+,D}<L> --> Other + {A+C,+,B+D}<L> + SmallVector<const SCEV *, 4> AddRecOps(AddRec->op_begin(), + AddRec->op_end()); + for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); + ++OtherIdx) + if (const SCEVAddRecExpr *OtherAddRec = + dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx])) + if (OtherAddRec->getLoop() == AddRecLoop) { + for (unsigned i = 0, e = OtherAddRec->getNumOperands(); + i != e; ++i) { + if (i >= AddRecOps.size()) { + AddRecOps.append(OtherAddRec->op_begin()+i, + OtherAddRec->op_end()); + break; + } + AddRecOps[i] = getAddExpr(AddRecOps[i], + OtherAddRec->getOperand(i)); + } + Ops.erase(Ops.begin() + OtherIdx); --OtherIdx; } - NewOps[i] = getAddExpr(NewOps[i], OtherAddRec->getOperand(i)); - } - const SCEV *NewAddRec = getAddRecExpr(NewOps, AddRecLoop); - - if (Ops.size() == 2) return NewAddRec; - - Ops.erase(Ops.begin()+Idx); - Ops.erase(Ops.begin()+OtherIdx-1); - Ops.push_back(NewAddRec); - return getAddExpr(Ops); - } + Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop); + return getAddExpr(Ops); } // Otherwise couldn't fold anything into this recurrence. Move onto the @@ -1633,17 +1736,18 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, assert(!Ops.empty() && "Cannot get empty mul!"); if (Ops.size() == 1) return Ops[0]; #ifndef NDEBUG + const Type *ETy = getEffectiveSCEVType(Ops[0]->getType()); for (unsigned i = 1, e = Ops.size(); i != e; ++i) - assert(getEffectiveSCEVType(Ops[i]->getType()) == - getEffectiveSCEVType(Ops[0]->getType()) && + assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy && "SCEVMulExpr operand types don't match!"); #endif // If HasNSW is true and all the operands are non-negative, infer HasNUW. if (!HasNUW && HasNSW) { bool All = true; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (!isKnownNonNegative(Ops[i])) { + for (SmallVectorImpl<const SCEV *>::const_iterator I = Ops.begin(), + E = Ops.end(); I != E; ++I) + if (!isKnownNonNegative(*I)) { All = false; break; } @@ -1740,8 +1844,9 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, // they are loop invariant w.r.t. the recurrence. SmallVector<const SCEV *, 8> LIOps; const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]); + const Loop *AddRecLoop = AddRec->getLoop(); for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (Ops[i]->isLoopInvariant(AddRec->getLoop())) { + if (Ops[i]->isLoopInvariant(AddRecLoop)) { LIOps.push_back(Ops[i]); Ops.erase(Ops.begin()+i); --i; --e; @@ -1758,7 +1863,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, // Build the new addrec. Propagate the NUW and NSW flags if both the // outer mul and the inner addrec are guaranteed to have no overflow. 
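      // For example (illustrative): 3 * {2,+,4}<L> becomes {6,+,12}<L>; each
      // operand of the recurrence is scaled by the loop-invariant factor.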
- const SCEV *NewRec = getAddRecExpr(NewOps, AddRec->getLoop(), + const SCEV *NewRec = getAddRecExpr(NewOps, AddRecLoop, HasNUW && AddRec->hasNoUnsignedWrap(), HasNSW && AddRec->hasNoSignedWrap()); @@ -1778,28 +1883,30 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, // there are multiple AddRec's with the same loop induction variable being // multiplied together. If so, we can fold them. for (unsigned OtherIdx = Idx+1; - OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);++OtherIdx) - if (OtherIdx != Idx) { - const SCEVAddRecExpr *OtherAddRec = cast<SCEVAddRecExpr>(Ops[OtherIdx]); - if (AddRec->getLoop() == OtherAddRec->getLoop()) { - // F * G --> {A,+,B} * {C,+,D} --> {A*C,+,F*D + G*B + B*D} - const SCEVAddRecExpr *F = AddRec, *G = OtherAddRec; - const SCEV *NewStart = getMulExpr(F->getStart(), - G->getStart()); - const SCEV *B = F->getStepRecurrence(*this); - const SCEV *D = G->getStepRecurrence(*this); - const SCEV *NewStep = getAddExpr(getMulExpr(F, D), - getMulExpr(G, B), - getMulExpr(B, D)); - const SCEV *NewAddRec = getAddRecExpr(NewStart, NewStep, - F->getLoop()); - if (Ops.size() == 2) return NewAddRec; - - Ops.erase(Ops.begin()+Idx); - Ops.erase(Ops.begin()+OtherIdx-1); - Ops.push_back(NewAddRec); - return getMulExpr(Ops); - } + OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); + ++OtherIdx) + if (AddRecLoop == cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) { + // F * G, where F = {A,+,B}<L> and G = {C,+,D}<L> --> + // {A*C,+,F*D + G*B + B*D}<L> + for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); + ++OtherIdx) + if (const SCEVAddRecExpr *OtherAddRec = + dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx])) + if (OtherAddRec->getLoop() == AddRecLoop) { + const SCEVAddRecExpr *F = AddRec, *G = OtherAddRec; + const SCEV *NewStart = getMulExpr(F->getStart(), G->getStart()); + const SCEV *B = F->getStepRecurrence(*this); + const SCEV *D = G->getStepRecurrence(*this); + const SCEV *NewStep = getAddExpr(getMulExpr(F, D), + getMulExpr(G, B), + getMulExpr(B, D)); + const SCEV *NewAddRec = getAddRecExpr(NewStart, NewStep, + F->getLoop()); + if (Ops.size() == 2) return NewAddRec; + Ops[Idx] = AddRec = cast<SCEVAddRecExpr>(NewAddRec); + Ops.erase(Ops.begin() + OtherIdx); --OtherIdx; + } + return getMulExpr(Ops); } // Otherwise couldn't fold anything into this recurrence. Move onto the @@ -1848,7 +1955,7 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, // TODO: Generalize this to non-constants by using known-bits information. const Type *Ty = LHS->getType(); unsigned LZ = RHSC->getValue()->getValue().countLeadingZeros(); - unsigned MaxShiftAmt = getTypeSizeInBits(Ty) - LZ; + unsigned MaxShiftAmt = getTypeSizeInBits(Ty) - LZ - 1; // For non-power-of-two values, effectively round the value up to the // nearest power of two. 
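    // For example (illustrative): an i32 udiv by 8 has LZ = 28, so
    // MaxShiftAmt = 32 - 28 - 1 = 3, i.e. exactly the right shift by 3.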
    if (!RHSC->getValue()->getValue().isPowerOf2())
@@ -1955,9 +2062,9 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
                                bool HasNUW, bool HasNSW) {
   if (Operands.size() == 1) return Operands[0];
 #ifndef NDEBUG
+  const Type *ETy = getEffectiveSCEVType(Operands[0]->getType());
   for (unsigned i = 1, e = Operands.size(); i != e; ++i)
-    assert(getEffectiveSCEVType(Operands[i]->getType()) ==
-           getEffectiveSCEVType(Operands[0]->getType()) &&
+    assert(getEffectiveSCEVType(Operands[i]->getType()) == ETy &&
            "SCEVAddRecExpr operand types don't match!");
 #endif
 
@@ -1975,8 +2082,9 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
   // If HasNSW is true and all the operands are non-negative, infer HasNUW.
   if (!HasNUW && HasNSW) {
     bool All = true;
-    for (unsigned i = 0, e = Operands.size(); i != e; ++i)
-      if (!isKnownNonNegative(Operands[i])) {
+    for (SmallVectorImpl<const SCEV *>::const_iterator I = Operands.begin(),
+         E = Operands.end(); I != E; ++I)
+      if (!isKnownNonNegative(*I)) {
        All = false;
        break;
      }
@@ -1986,9 +2094,9 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
   // Canonicalize nested AddRecs by nesting them in order of loop depth.
   if (const SCEVAddRecExpr *NestedAR = dyn_cast<SCEVAddRecExpr>(Operands[0])) {
     const Loop *NestedLoop = NestedAR->getLoop();
-    if (L->contains(NestedLoop->getHeader()) ?
+    if (L->contains(NestedLoop) ?
         (L->getLoopDepth() < NestedLoop->getLoopDepth()) :
-        (!NestedLoop->contains(L->getHeader()) &&
+        (!NestedLoop->contains(L) &&
          DT->dominates(L->getHeader(), NestedLoop->getHeader()))) {
       SmallVector<const SCEV *, 4> NestedOperands(NestedAR->op_begin(),
                                                   NestedAR->op_end());
@@ -2055,9 +2163,9 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
   assert(!Ops.empty() && "Cannot get empty smax!");
   if (Ops.size() == 1) return Ops[0];
 #ifndef NDEBUG
+  const Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
   for (unsigned i = 1, e = Ops.size(); i != e; ++i)
-    assert(getEffectiveSCEVType(Ops[i]->getType()) ==
-           getEffectiveSCEVType(Ops[0]->getType()) &&
+    assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
            "SCEVSMaxExpr operand types don't match!");
 #endif
@@ -2160,9 +2268,9 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
   assert(!Ops.empty() && "Cannot get empty umax!");
   if (Ops.size() == 1) return Ops[0];
 #ifndef NDEBUG
+  const Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
   for (unsigned i = 1, e = Ops.size(); i != e; ++i)
-    assert(getEffectiveSCEVType(Ops[i]->getType()) ==
-           getEffectiveSCEVType(Ops[0]->getType()) &&
+    assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
            "SCEVUMaxExpr operand types don't match!");
 #endif
@@ -2326,8 +2434,14 @@ const SCEV *ScalarEvolution::getUnknown(Value *V) {
   ID.AddInteger(scUnknown);
   ID.AddPointer(V);
   void *IP = 0;
-  if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
-  SCEV *S = new (SCEVAllocator) SCEVUnknown(ID.Intern(SCEVAllocator), V);
+  if (SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) {
+    assert(cast<SCEVUnknown>(S)->getValue() == V &&
+           "Stale SCEVUnknown in uniquing map!");
+    return S;
+  }
+  SCEV *S = new (SCEVAllocator) SCEVUnknown(ID.Intern(SCEVAllocator), V, this,
+                                            FirstUnknown);
+  FirstUnknown = cast<SCEVUnknown>(S);
   UniqueSCEVs.InsertNode(S, IP);
   return S;
 }
@@ -2391,10 +2505,15 @@ const SCEV *ScalarEvolution::getCouldNotCompute() {
 const SCEV *ScalarEvolution::getSCEV(Value *V) {
   assert(isSCEVable(V->getType()) && "Value is not SCEVable!");
 
-  std::map<SCEVCallbackVH, const
SCEV *>::iterator I = Scalars.find(V); - if (I != Scalars.end()) return I->second; + ValueExprMapType::const_iterator I = ValueExprMap.find(V); + if (I != ValueExprMap.end()) return I->second; const SCEV *S = createSCEV(V); - Scalars.insert(std::make_pair(SCEVCallbackVH(V, this), S)); + + // The process of creating a SCEV for V may have caused other SCEVs + // to have been created, so it's necessary to insert the new entry + // from scratch, rather than trying to remember the insert position + // above. + ValueExprMap.insert(std::make_pair(SCEVCallbackVH(V, this), S)); return S; } @@ -2428,6 +2547,10 @@ const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) { /// const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS) { + // Fast path: X - X --> 0. + if (LHS == RHS) + return getConstant(LHS->getType(), 0); + // X - Y --> X + -Y return getAddExpr(LHS, getNegativeSCEV(RHS)); } @@ -2570,12 +2693,12 @@ PushDefUseChildren(Instruction *I, // Push the def-use children onto the Worklist stack. for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; ++UI) - Worklist.push_back(cast<Instruction>(UI)); + Worklist.push_back(cast<Instruction>(*UI)); } /// ForgetSymbolicValue - This looks up computed SCEV values for all /// instructions that depend on the given instruction and removes them from -/// the Scalars map if they reference SymName. This is used during PHI +/// the ValueExprMapType map if they reference SymName. This is used during PHI /// resolution. void ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) { @@ -2588,9 +2711,9 @@ ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) { Instruction *I = Worklist.pop_back_val(); if (!Visited.insert(I)) continue; - std::map<SCEVCallbackVH, const SCEV *>::iterator It = - Scalars.find(static_cast<Value *>(I)); - if (It != Scalars.end()) { + ValueExprMapType::iterator It = + ValueExprMap.find(static_cast<Value *>(I)); + if (It != ValueExprMap.end()) { // Short-circuit the def-use traversal if the symbolic name // ceases to appear in expressions. if (It->second != SymName && !It->second->hasOperand(SymName)) @@ -2607,7 +2730,7 @@ ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) { !isa<SCEVUnknown>(It->second) || (I != PN && It->second == SymName)) { ValuesAtScopes.erase(It->second); - Scalars.erase(It); + ValueExprMap.erase(It); } } @@ -2644,9 +2767,9 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { if (BEValueV && StartValueV) { // While we are analyzing this PHI node, handle its value symbolically. const SCEV *SymbolicName = getUnknown(PN); - assert(Scalars.find(PN) == Scalars.end() && + assert(ValueExprMap.find(PN) == ValueExprMap.end() && "PHI node already processed?"); - Scalars.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName)); + ValueExprMap.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName)); // Using this symbolic name for the PHI, analyze the value coming around // the back-edge. @@ -2707,7 +2830,7 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { // to be symbolic. We now need to go back and purge all of the // entries for the scalars that use the symbolic expression. ForgetSymbolicName(PN, SymbolicName); - Scalars[SCEVCallbackVH(PN, this)] = PHISCEV; + ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; return PHISCEV; } } @@ -2732,7 +2855,7 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { // to be symbolic. 
We now need to go back and purge all of the // entries for the scalars that use the symbolic expression. ForgetSymbolicName(PN, SymbolicName); - Scalars[SCEVCallbackVH(PN, this)] = PHISCEV; + ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; return PHISCEV; } } @@ -2777,7 +2900,7 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) { return getUnknown(GEP); const SCEV *TotalOffset = getConstant(IntPtrTy, 0); gep_type_iterator GTI = gep_type_begin(GEP); - for (GetElementPtrInst::op_iterator I = next(GEP->op_begin()), + for (GetElementPtrInst::op_iterator I = llvm::next(GEP->op_begin()), E = GEP->op_end(); I != E; ++I) { Value *Index = *I; @@ -3200,12 +3323,42 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { Operator *U = cast<Operator>(V); switch (Opcode) { - case Instruction::Add: - return getAddExpr(getSCEV(U->getOperand(0)), - getSCEV(U->getOperand(1))); - case Instruction::Mul: - return getMulExpr(getSCEV(U->getOperand(0)), - getSCEV(U->getOperand(1))); + case Instruction::Add: { + // The simple thing to do would be to just call getSCEV on both operands + // and call getAddExpr with the result. However if we're looking at a + // bunch of things all added together, this can be quite inefficient, + // because it leads to N-1 getAddExpr calls for N ultimate operands. + // Instead, gather up all the operands and make a single getAddExpr call. + // LLVM IR canonical form means we need only traverse the left operands. + SmallVector<const SCEV *, 4> AddOps; + AddOps.push_back(getSCEV(U->getOperand(1))); + for (Value *Op = U->getOperand(0); ; Op = U->getOperand(0)) { + unsigned Opcode = Op->getValueID() - Value::InstructionVal; + if (Opcode != Instruction::Add && Opcode != Instruction::Sub) + break; + U = cast<Operator>(Op); + const SCEV *Op1 = getSCEV(U->getOperand(1)); + if (Opcode == Instruction::Sub) + AddOps.push_back(getNegativeSCEV(Op1)); + else + AddOps.push_back(Op1); + } + AddOps.push_back(getSCEV(U->getOperand(0))); + return getAddExpr(AddOps); + } + case Instruction::Mul: { + // See the Add code above. 
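+    // For example (illustrative): ((a*b)*c)*d gathers {d, c, b, a} into
+    // MulOps and issues a single getMulExpr call instead of three nested
+    // ones.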
+ SmallVector<const SCEV *, 4> MulOps; + MulOps.push_back(getSCEV(U->getOperand(1))); + for (Value *Op = U->getOperand(0); + Op->getValueID() == Instruction::Mul + Value::InstructionVal; + Op = U->getOperand(0)) { + U = cast<Operator>(Op); + MulOps.push_back(getSCEV(U->getOperand(1))); + } + MulOps.push_back(getSCEV(U->getOperand(0))); + return getMulExpr(MulOps); + } case Instruction::UDiv: return getUDivExpr(getSCEV(U->getOperand(0)), getSCEV(U->getOperand(1))); @@ -3467,7 +3620,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { const SCEV *LDiff = getMinusSCEV(LA, LS); const SCEV *RDiff = getMinusSCEV(RA, One); if (LDiff == RDiff) - return getAddExpr(getUMaxExpr(LS, One), LDiff); + return getAddExpr(getUMaxExpr(One, LS), LDiff); } break; case ICmpInst::ICMP_EQ: @@ -3482,7 +3635,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { const SCEV *LDiff = getMinusSCEV(LA, One); const SCEV *RDiff = getMinusSCEV(RA, LS); if (LDiff == RDiff) - return getAddExpr(getUMaxExpr(LS, One), LDiff); + return getAddExpr(getUMaxExpr(One, LS), LDiff); } break; default: @@ -3579,9 +3732,9 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { Instruction *I = Worklist.pop_back_val(); if (!Visited.insert(I)) continue; - std::map<SCEVCallbackVH, const SCEV *>::iterator It = - Scalars.find(static_cast<Value *>(I)); - if (It != Scalars.end()) { + ValueExprMapType::iterator It = + ValueExprMap.find(static_cast<Value *>(I)); + if (It != ValueExprMap.end()) { // SCEVUnknown for a PHI either means that it has an unrecognized // structure, or it's a PHI that's in the progress of being computed // by createNodeForPHI. In the former case, additional loop trip @@ -3590,7 +3743,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { // own when it gets to that point. if (!isa<PHINode>(I) || !isa<SCEVUnknown>(It->second)) { ValuesAtScopes.erase(It->second); - Scalars.erase(It); + ValueExprMap.erase(It); } if (PHINode *PN = dyn_cast<PHINode>(I)) ConstantEvolutionLoopExitValue.erase(PN); @@ -3619,11 +3772,10 @@ void ScalarEvolution::forgetLoop(const Loop *L) { Instruction *I = Worklist.pop_back_val(); if (!Visited.insert(I)) continue; - std::map<SCEVCallbackVH, const SCEV *>::iterator It = - Scalars.find(static_cast<Value *>(I)); - if (It != Scalars.end()) { + ValueExprMapType::iterator It = ValueExprMap.find(static_cast<Value *>(I)); + if (It != ValueExprMap.end()) { ValuesAtScopes.erase(It->second); - Scalars.erase(It); + ValueExprMap.erase(It); if (PHINode *PN = dyn_cast<PHINode>(I)) ConstantEvolutionLoopExitValue.erase(PN); } @@ -3648,35 +3800,14 @@ void ScalarEvolution::forgetValue(Value *V) { I = Worklist.pop_back_val(); if (!Visited.insert(I)) continue; - std::map<SCEVCallbackVH, const SCEV *>::iterator It = - Scalars.find(static_cast<Value *>(I)); - if (It != Scalars.end()) { + ValueExprMapType::iterator It = ValueExprMap.find(static_cast<Value *>(I)); + if (It != ValueExprMap.end()) { ValuesAtScopes.erase(It->second); - Scalars.erase(It); + ValueExprMap.erase(It); if (PHINode *PN = dyn_cast<PHINode>(I)) ConstantEvolutionLoopExitValue.erase(PN); } - // If there's a SCEVUnknown tying this value into the SCEV - // space, remove it from the folding set map. The SCEVUnknown - // object and any other SCEV objects which reference it - // (transitively) remain allocated, effectively leaked until - // the underlying BumpPtrAllocator is freed. - // - // This permits SCEV pointers to be used as keys in maps - // such as the ValuesAtScopes map. 
- FoldingSetNodeID ID; - ID.AddInteger(scUnknown); - ID.AddPointer(I); - void *IP; - if (SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) { - UniqueSCEVs.RemoveNode(S); - - // This isn't necessary, but we might as well remove the - // value from the ValuesAtScopes map too. - ValuesAtScopes.erase(S); - } - PushDefUseChildren(I, Worklist); } } @@ -3816,14 +3947,13 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExitCond(const Loop *L, else MaxBECount = getUMinFromMismatchedTypes(BTI0.Max, BTI1.Max); } else { - // Both conditions must be true for the loop to exit. + // Both conditions must be true at the same time for the loop to exit. + // For now, be conservative. assert(L->contains(FBB) && "Loop block has no successor in loop!"); - if (BTI0.Exact != getCouldNotCompute() && - BTI1.Exact != getCouldNotCompute()) - BECount = getUMaxFromMismatchedTypes(BTI0.Exact, BTI1.Exact); - if (BTI0.Max != getCouldNotCompute() && - BTI1.Max != getCouldNotCompute()) - MaxBECount = getUMaxFromMismatchedTypes(BTI0.Max, BTI1.Max); + if (BTI0.Max == BTI1.Max) + MaxBECount = BTI0.Max; + if (BTI0.Exact == BTI1.Exact) + BECount = BTI0.Exact; } return BackedgeTakenInfo(BECount, MaxBECount); @@ -3851,14 +3981,13 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExitCond(const Loop *L, else MaxBECount = getUMinFromMismatchedTypes(BTI0.Max, BTI1.Max); } else { - // Both conditions must be false for the loop to exit. + // Both conditions must be false at the same time for the loop to exit. + // For now, be conservative. assert(L->contains(TBB) && "Loop block has no successor in loop!"); - if (BTI0.Exact != getCouldNotCompute() && - BTI1.Exact != getCouldNotCompute()) - BECount = getUMaxFromMismatchedTypes(BTI0.Exact, BTI1.Exact); - if (BTI0.Max != getCouldNotCompute() && - BTI1.Max != getCouldNotCompute()) - MaxBECount = getUMaxFromMismatchedTypes(BTI0.Max, BTI1.Max); + if (BTI0.Max == BTI1.Max) + MaxBECount = BTI0.Max; + if (BTI0.Exact == BTI1.Exact) + BECount = BTI0.Exact; } return BackedgeTakenInfo(BECount, MaxBECount); @@ -4203,7 +4332,7 @@ Constant * ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, const APInt &BEs, const Loop *L) { - std::map<PHINode*, Constant*>::iterator I = + std::map<PHINode*, Constant*>::const_iterator I = ConstantEvolutionLoopExitValue.find(PN); if (I != ConstantEvolutionLoopExitValue.end()) return I->second; @@ -5185,7 +5314,8 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L, LoopContinuePredicate->isUnconditional()) return false; - return isImpliedCond(LoopContinuePredicate->getCondition(), Pred, LHS, RHS, + return isImpliedCond(Pred, LHS, RHS, + LoopContinuePredicate->getCondition(), LoopContinuePredicate->getSuccessor(0) != L->getHeader()); } @@ -5214,7 +5344,8 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, LoopEntryPredicate->isUnconditional()) continue; - if (isImpliedCond(LoopEntryPredicate->getCondition(), Pred, LHS, RHS, + if (isImpliedCond(Pred, LHS, RHS, + LoopEntryPredicate->getCondition(), LoopEntryPredicate->getSuccessor(0) != Pair.second)) return true; } @@ -5224,24 +5355,24 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, /// isImpliedCond - Test whether the condition described by Pred, LHS, /// and RHS is true whenever the given Cond value evaluates to true. -bool ScalarEvolution::isImpliedCond(Value *CondValue, - ICmpInst::Predicate Pred, +bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, + Value *FoundCondValue, bool Inverse) { // Recursively handle And and Or conditions. 
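  // For example (illustrative): if the known condition is "a && b" taken
  // true, either operand alone may already imply the goal; with Inverse set,
  // an "a || b" known false lets us assume both operands are false.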
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CondValue)) { + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FoundCondValue)) { if (BO->getOpcode() == Instruction::And) { if (!Inverse) - return isImpliedCond(BO->getOperand(0), Pred, LHS, RHS, Inverse) || - isImpliedCond(BO->getOperand(1), Pred, LHS, RHS, Inverse); + return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) || + isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse); } else if (BO->getOpcode() == Instruction::Or) { if (Inverse) - return isImpliedCond(BO->getOperand(0), Pred, LHS, RHS, Inverse) || - isImpliedCond(BO->getOperand(1), Pred, LHS, RHS, Inverse); + return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) || + isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse); } } - ICmpInst *ICI = dyn_cast<ICmpInst>(CondValue); + ICmpInst *ICI = dyn_cast<ICmpInst>(FoundCondValue); if (!ICI) return false; // Bail if the ICmp's operands' types are wider than the needed type @@ -5658,20 +5789,19 @@ void ScalarEvolution::SCEVCallbackVH::deleted() { assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!"); if (PHINode *PN = dyn_cast<PHINode>(getValPtr())) SE->ConstantEvolutionLoopExitValue.erase(PN); - SE->Scalars.erase(getValPtr()); + SE->ValueExprMap.erase(getValPtr()); // this now dangles! } -void ScalarEvolution::SCEVCallbackVH::allUsesReplacedWith(Value *) { +void ScalarEvolution::SCEVCallbackVH::allUsesReplacedWith(Value *V) { assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!"); // Forget all the expressions associated with users of the old value, // so that future queries will recompute the expressions using the new // value. + Value *Old = getValPtr(); SmallVector<User *, 16> Worklist; SmallPtrSet<User *, 8> Visited; - Value *Old = getValPtr(); - bool DeleteOld = false; for (Value::use_iterator UI = Old->use_begin(), UE = Old->use_end(); UI != UE; ++UI) Worklist.push_back(*UI); @@ -5679,27 +5809,22 @@ void ScalarEvolution::SCEVCallbackVH::allUsesReplacedWith(Value *) { User *U = Worklist.pop_back_val(); // Deleting the Old value will cause this to dangle. Postpone // that until everything else is done. - if (U == Old) { - DeleteOld = true; + if (U == Old) continue; - } if (!Visited.insert(U)) continue; if (PHINode *PN = dyn_cast<PHINode>(U)) SE->ConstantEvolutionLoopExitValue.erase(PN); - SE->Scalars.erase(U); + SE->ValueExprMap.erase(U); for (Value::use_iterator UI = U->use_begin(), UE = U->use_end(); UI != UE; ++UI) Worklist.push_back(*UI); } - // Delete the Old value if it (indirectly) references itself. - if (DeleteOld) { - if (PHINode *PN = dyn_cast<PHINode>(Old)) - SE->ConstantEvolutionLoopExitValue.erase(PN); - SE->Scalars.erase(Old); - // this now dangles! - } - // this may dangle! + // Delete the Old value. + if (PHINode *PN = dyn_cast<PHINode>(Old)) + SE->ConstantEvolutionLoopExitValue.erase(PN); + SE->ValueExprMap.erase(Old); + // this now dangles! 
} ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se) @@ -5710,7 +5835,7 @@ ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se) //===----------------------------------------------------------------------===// ScalarEvolution::ScalarEvolution() - : FunctionPass(&ID) { + : FunctionPass(ID), FirstUnknown(0) { } bool ScalarEvolution::runOnFunction(Function &F) { @@ -5722,7 +5847,13 @@ bool ScalarEvolution::runOnFunction(Function &F) { } void ScalarEvolution::releaseMemory() { - Scalars.clear(); + // Iterate through all the SCEVUnknown instances and call their + // destructors, so that they release their references to their values. + for (SCEVUnknown *U = FirstUnknown; U; U = U->Next) + U->~SCEVUnknown(); + FirstUnknown = 0; + + ValueExprMap.clear(); BackedgeTakenCounts.clear(); ConstantEvolutionLoopExitValue.clear(); ValuesAtScopes.clear(); diff --git a/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp b/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp index 58711b8be59e8..93b2a8b06fbe9 100644 --- a/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp +++ b/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp @@ -34,14 +34,14 @@ namespace { public: static char ID; // Class identification, replacement for typeinfo - ScalarEvolutionAliasAnalysis() : FunctionPass(&ID), SE(0) {} + ScalarEvolutionAliasAnalysis() : FunctionPass(ID), SE(0) {} /// getAdjustedAnalysisPointer - This method is used when a pass implements /// an analysis interface through multiple inheritance. If needed, it /// should override this to adjust the this pointer as needed for the /// specified pass info. - virtual void *getAdjustedAnalysisPointer(const PassInfo *PI) { - if (PI->isPassID(&AliasAnalysis::ID)) + virtual void *getAdjustedAnalysisPointer(AnalysisID PI) { + if (PI == &AliasAnalysis::ID) return (AliasAnalysis*)this; return this; } @@ -58,11 +58,8 @@ namespace { // Register this pass... char ScalarEvolutionAliasAnalysis::ID = 0; -static RegisterPass<ScalarEvolutionAliasAnalysis> -X("scev-aa", "ScalarEvolution-based Alias Analysis", false, true); - -// Declare that we implement the AliasAnalysis interface -static RegisterAnalysisGroup<AliasAnalysis> Y(X); +INITIALIZE_AG_PASS(ScalarEvolutionAliasAnalysis, AliasAnalysis, "scev-aa", + "ScalarEvolution-based Alias Analysis", false, true, false); FunctionPass *llvm::createScalarEvolutionAliasAnalysisPass() { return new ScalarEvolutionAliasAnalysis(); @@ -158,8 +155,8 @@ ScalarEvolutionAliasAnalysis::alias(const Value *A, unsigned ASize, Value *AO = GetBaseValue(AS); Value *BO = GetBaseValue(BS); if ((AO && AO != A) || (BO && BO != B)) - if (alias(AO ? AO : A, AO ? ~0u : ASize, - BO ? BO : B, BO ? ~0u : BSize) == NoAlias) + if (alias(AO ? AO : A, AO ? UnknownSize : ASize, + BO ? BO : B, BO ? UnknownSize : BSize) == NoAlias) return NoAlias; // Forward the query to the next analysis. diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index d4a4b26e25ec4..66a06aeac43ca 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -647,6 +647,11 @@ public: bool operator()(std::pair<const Loop *, const SCEV *> LHS, std::pair<const Loop *, const SCEV *> RHS) const { + // Keep pointer operands sorted at the end. + if (LHS.second->getType()->isPointerTy() != + RHS.second->getType()->isPointerTy()) + return LHS.second->getType()->isPointerTy(); + // Compare loops with PickMostRelevantLoop. 
      if (LHS.first != RHS.first)
        return PickMostRelevantLoop(LHS.first, RHS.first, DT) != LHS.first;
@@ -699,8 +704,15 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
       // The running sum expression is a pointer. Try to form a getelementptr
       // at this level with that as the base.
       SmallVector<const SCEV *, 4> NewOps;
-      for (; I != E && I->first == CurLoop; ++I)
-        NewOps.push_back(I->second);
+      for (; I != E && I->first == CurLoop; ++I) {
+        // If the operand is a SCEVUnknown and not an instruction, peek
+        // through it, to enable more of it to be folded into the GEP.
+        const SCEV *X = I->second;
+        if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(X))
+          if (!isa<Instruction>(U->getValue()))
+            X = SE.getSCEV(U->getValue());
+        NewOps.push_back(X);
+      }
       Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, Sum);
     } else if (const PointerType *PTy = dyn_cast<PointerType>(Op->getType())) {
       // The running sum is an integer, and there's a pointer at this level.
@@ -1047,9 +1059,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
   // First check for an existing canonical IV in a suitable type.
   PHINode *CanonicalIV = 0;
   if (PHINode *PN = L->getCanonicalInductionVariable())
-    if (SE.isSCEVable(PN->getType()) &&
-        SE.getEffectiveSCEVType(PN->getType())->isIntegerTy() &&
-        SE.getTypeSizeInBits(PN->getType()) >= SE.getTypeSizeInBits(Ty))
+    if (SE.getTypeSizeInBits(PN->getType()) >= SE.getTypeSizeInBits(Ty))
       CanonicalIV = PN;
 
   // Rewrite an AddRec in terms of the canonical induction variable, if
@@ -1102,21 +1112,13 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
                                       SE.getUnknown(expand(Rest))));
   }
 
-  // {0,+,1} --> Insert a canonical induction variable into the loop!
-  if (S->isAffine() && S->getOperand(1)->isOne()) {
-    // If there's a canonical IV, just use it.
-    if (CanonicalIV) {
-      assert(Ty == SE.getEffectiveSCEVType(CanonicalIV->getType()) &&
-             "IVs with types different from the canonical IV should "
-             "already have been handled!");
-      return CanonicalIV;
-    }
-
+  // If we don't yet have a canonical IV, create one.
+  if (!CanonicalIV) {
     // Create and insert the PHI node for the induction variable in the
     // specified loop.
     BasicBlock *Header = L->getHeader();
-    PHINode *PN = PHINode::Create(Ty, "indvar", Header->begin());
-    rememberInstruction(PN);
+    CanonicalIV = PHINode::Create(Ty, "indvar", Header->begin());
+    rememberInstruction(CanonicalIV);
 
     Constant *One = ConstantInt::get(Ty, 1);
     for (pred_iterator HPI = pred_begin(Header), HPE = pred_end(Header);
@@ -1125,40 +1127,45 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
       if (L->contains(HP)) {
        // Insert a unit add instruction right before the terminator
        // corresponding to the back-edge.
-        Instruction *Add = BinaryOperator::CreateAdd(PN, One, "indvar.next",
-                                                     HP->getTerminator());
+        Instruction *Add = BinaryOperator::CreateAdd(CanonicalIV, One,
+                                                     "indvar.next",
+                                                     HP->getTerminator());
        rememberInstruction(Add);
-        PN->addIncoming(Add, HP);
+        CanonicalIV->addIncoming(Add, HP);
      } else {
-        PN->addIncoming(Constant::getNullValue(Ty), HP);
+        CanonicalIV->addIncoming(Constant::getNullValue(Ty), HP);
      }
    }
  }
 
+  // {0,+,1} --> Insert a canonical induction variable into the loop!
+  if (S->isAffine() && S->getOperand(1)->isOne()) {
+    assert(Ty == SE.getEffectiveSCEVType(CanonicalIV->getType()) &&
+           "IVs with types different from the canonical IV should "
+           "already have been handled!");
+    return CanonicalIV;
+  }
+
   // {0,+,F} --> {0,+,1} * F
-  // Get the canonical induction variable I for this loop.
- Value *I = CanonicalIV ? - CanonicalIV : - getOrInsertCanonicalInductionVariable(L, Ty); // If this is a simple linear addrec, emit it now as a special case. if (S->isAffine()) // {0,+,F} --> i*F return expand(SE.getTruncateOrNoop( - SE.getMulExpr(SE.getUnknown(I), + SE.getMulExpr(SE.getUnknown(CanonicalIV), SE.getNoopOrAnyExtend(S->getOperand(1), - I->getType())), + CanonicalIV->getType())), Ty)); // If this is a chain of recurrences, turn it into a closed form, using the // folders, then expandCodeFor the closed form. This allows the folders to // simplify the expression without having to build a bunch of special code // into this folder. - const SCEV *IH = SE.getUnknown(I); // Get I as a "symbolic" SCEV. + const SCEV *IH = SE.getUnknown(CanonicalIV); // Get I as a "symbolic" SCEV. // Promote S up to the canonical IV type, if the cast is foldable. const SCEV *NewS = S; - const SCEV *Ext = SE.getNoopOrAnyExtend(S, I->getType()); + const SCEV *Ext = SE.getNoopOrAnyExtend(S, CanonicalIV->getType()); if (isa<SCEVAddRecExpr>(Ext)) NewS = Ext; @@ -1337,16 +1344,21 @@ void SCEVExpander::restoreInsertPoint(BasicBlock *BB, BasicBlock::iterator I) { /// canonical induction variable of the specified type for the specified /// loop (inserting one if there is none). A canonical induction variable /// starts at zero and steps by one on each iteration. -Value * +PHINode * SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L, const Type *Ty) { assert(Ty->isIntegerTy() && "Can only insert integer induction variables!"); + + // Build a SCEV for {0,+,1}<L>. const SCEV *H = SE.getAddRecExpr(SE.getConstant(Ty, 0), SE.getConstant(Ty, 1), L); + + // Emit code for it. BasicBlock *SaveInsertBB = Builder.GetInsertBlock(); BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint(); - Value *V = expandCodeFor(H, 0, L->getHeader()->begin()); + PHINode *V = cast<PHINode>(expandCodeFor(H, 0, L->getHeader()->begin())); if (SaveInsertBB) restoreInsertPoint(SaveInsertBB, SaveInsertPt); + return V; } diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp index 563fd2fa96e2e..ac36cef89ebb5 100644 --- a/lib/Analysis/ScalarEvolutionNormalization.cpp +++ b/lib/Analysis/ScalarEvolutionNormalization.cpp @@ -26,7 +26,7 @@ using namespace llvm; /// post-inc value when we cannot) or it can end up adding extra live-ranges to /// the loop, resulting in reg-reg copies (if we use the pre-inc value when we /// should use the post-inc value). -static bool IVUseShouldUsePostIncValue(Instruction *User, Instruction *IV, +static bool IVUseShouldUsePostIncValue(Instruction *User, Value *Operand, const Loop *L, DominatorTree *DT) { // If the user is in the loop, use the preinc value. if (L->contains(User)) return false; @@ -45,20 +45,17 @@ static bool IVUseShouldUsePostIncValue(Instruction *User, Instruction *IV, // their uses occur in the predecessor block, not the block the PHI lives in) // should still use the post-inc value. Check for this case now. PHINode *PN = dyn_cast<PHINode>(User); - if (!PN) return false; // not a phi, not dominated by latch block. + if (!PN || !Operand) return false; // not a phi, not dominated by latch block. - // Look at all of the uses of IV by the PHI node. If any use corresponds to - // a block that is not dominated by the latch block, give up and use the + // Look at all of the uses of Operand by the PHI node. If any use corresponds + // to a block that is not dominated by the latch block, give up and use the // preincremented value. 
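// For example (illustrative IR), in:
//   loop:
//     %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
//     %iv.next = add i64 %iv, 1
//     %done = icmp eq i64 %iv.next, %n
//     br i1 %done, label %exit, label %loop
//   exit:
//     %last = phi i64 [ %iv, %loop ]
// the use of %iv by %last occurs, for this purpose, in the predecessor
// block %loop, which is (trivially) dominated by the latch, so it is
// safe to rewrite %last in terms of the post-incremented %iv.next.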
- unsigned NumUses = 0; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (PN->getIncomingValue(i) == IV) { - ++NumUses; - if (!DT->dominates(LatchBlock, PN->getIncomingBlock(i))) - return false; - } + if (PN->getIncomingValue(i) == Operand && + !DT->dominates(LatchBlock, PN->getIncomingBlock(i))) + return false; - // Okay, all uses of IV by PN are in predecessor blocks that really are + // Okay, all uses of Operand by PN are in predecessor blocks that really are // dominated by the latch block. Use the post-incremented value. return true; } @@ -72,6 +69,7 @@ const SCEV *llvm::TransformForPostIncUse(TransformKind Kind, DominatorTree &DT) { if (isa<SCEVConstant>(S) || isa<SCEVUnknown>(S)) return S; + if (const SCEVCastExpr *X = dyn_cast<SCEVCastExpr>(S)) { const SCEV *O = X->getOperand(); const SCEV *N = TransformForPostIncUse(Kind, O, User, OperandValToReplace, @@ -85,9 +83,69 @@ const SCEV *llvm::TransformForPostIncUse(TransformKind Kind, } return S; } + + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + // An addrec. This is the interesting part. + SmallVector<const SCEV *, 8> Operands; + const Loop *L = AR->getLoop(); + // The addrec conceptually uses its operands at loop entry. + Instruction *LUser = L->getHeader()->begin(); + // Transform each operand. + for (SCEVNAryExpr::op_iterator I = AR->op_begin(), E = AR->op_end(); + I != E; ++I) { + const SCEV *O = *I; + const SCEV *N = TransformForPostIncUse(Kind, O, LUser, 0, Loops, SE, DT); + Operands.push_back(N); + } + const SCEV *Result = SE.getAddRecExpr(Operands, L); + switch (Kind) { + default: llvm_unreachable("Unexpected transform name!"); + case NormalizeAutodetect: + if (IVUseShouldUsePostIncValue(User, OperandValToReplace, L, &DT)) { + const SCEV *TransformedStep = + TransformForPostIncUse(Kind, AR->getStepRecurrence(SE), + User, OperandValToReplace, Loops, SE, DT); + Result = SE.getMinusSCEV(Result, TransformedStep); + Loops.insert(L); + } +#if 0 + // This assert is conceptually correct, but ScalarEvolution currently + // sometimes fails to canonicalize two equal SCEVs to exactly the same + // form. It's possibly a pessimization when this happens, but it isn't a + // correctness problem, so disable this assert for now. + assert(S == TransformForPostIncUse(Denormalize, Result, + User, OperandValToReplace, + Loops, SE, DT) && + "SCEV normalization is not invertible!"); +#endif + break; + case Normalize: + if (Loops.count(L)) { + const SCEV *TransformedStep = + TransformForPostIncUse(Kind, AR->getStepRecurrence(SE), + User, OperandValToReplace, Loops, SE, DT); + Result = SE.getMinusSCEV(Result, TransformedStep); + } +#if 0 + // See the comment on the assert above. + assert(S == TransformForPostIncUse(Denormalize, Result, + User, OperandValToReplace, + Loops, SE, DT) && + "SCEV normalization is not invertible!"); +#endif + break; + case Denormalize: + if (Loops.count(L)) + Result = cast<SCEVAddRecExpr>(Result)->getPostIncExpr(SE); + break; + } + return Result; + } + if (const SCEVNAryExpr *X = dyn_cast<SCEVNAryExpr>(S)) { SmallVector<const SCEV *, 8> Operands; bool Changed = false; + // Transform each operand. for (SCEVNAryExpr::op_iterator I = X->op_begin(), E = X->op_end(); I != E; ++I) { const SCEV *O = *I; @@ -96,37 +154,7 @@ const SCEV *llvm::TransformForPostIncUse(TransformKind Kind, Changed |= N != O; Operands.push_back(N); } - if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { - // An addrec. This is the interesting part. 
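// For reference, with a unit step: Normalize maps a post-increment
// user's {1,+,1}<L> to {0,+,1}<L> by subtracting the (transformed) step,
// and Denormalize maps {0,+,1}<L> back to {1,+,1}<L>; the two transforms
// are intended to be exact inverses (see the disabled asserts above).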
- const Loop *L = AR->getLoop(); - const SCEV *Result = SE.getAddRecExpr(Operands, L); - switch (Kind) { - default: llvm_unreachable("Unexpected transform name!"); - case NormalizeAutodetect: - if (Instruction *OI = dyn_cast<Instruction>(OperandValToReplace)) - if (IVUseShouldUsePostIncValue(User, OI, L, &DT)) { - const SCEV *TransformedStep = - TransformForPostIncUse(Kind, AR->getStepRecurrence(SE), - User, OperandValToReplace, Loops, SE, DT); - Result = SE.getMinusSCEV(Result, TransformedStep); - Loops.insert(L); - } - break; - case Normalize: - if (Loops.count(L)) { - const SCEV *TransformedStep = - TransformForPostIncUse(Kind, AR->getStepRecurrence(SE), - User, OperandValToReplace, Loops, SE, DT); - Result = SE.getMinusSCEV(Result, TransformedStep); - } - break; - case Denormalize: - if (Loops.count(L)) - Result = SE.getAddExpr(Result, AR->getStepRecurrence(SE)); - break; - } - return Result; - } + // If any operand actually changed, return a transformed result. if (Changed) switch (S->getSCEVType()) { case scAddExpr: return SE.getAddExpr(Operands); @@ -137,6 +165,7 @@ const SCEV *llvm::TransformForPostIncUse(TransformKind Kind, } return S; } + if (const SCEVUDivExpr *X = dyn_cast<SCEVUDivExpr>(S)) { const SCEV *LO = X->getLHS(); const SCEV *RO = X->getRHS(); @@ -148,6 +177,7 @@ const SCEV *llvm::TransformForPostIncUse(TransformKind Kind, return SE.getUDivExpr(LN, RN); return S; } + llvm_unreachable("Unexpected SCEV kind!"); return 0; } diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp new file mode 100644 index 0000000000000..bbfdcec3f9b4b --- /dev/null +++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -0,0 +1,191 @@ +//===- TypeBasedAliasAnalysis.cpp - Type-Based Alias Analysis -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the TypeBasedAliasAnalysis pass, which implements +// metadata-based TBAA. +// +// In LLVM IR, memory does not have types, so LLVM's own type system is not +// suitable for doing TBAA. Instead, metadata is added to the IR to describe +// a type system of a higher level language. +// +// This pass is language-independent. The type system is encoded in +// metadata. This allows this pass to support typical C and C++ TBAA, but +// it can also support custom aliasing behavior for other languages. +// +// This is a work-in-progress. It doesn't work yet, and the metadata +// format isn't stable. +// +// TODO: getModRefBehavior. The AliasAnalysis infrastructure will need to +// be extended. +// TODO: AA chaining +// TODO: struct fields +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Module.h" +#include "llvm/Metadata.h" +#include "llvm/Pass.h" +using namespace llvm; + +namespace { + /// TBAANode - This is a simple wrapper around an MDNode which provides a + /// higher-level interface by hiding the details of how alias analysis + /// information is encoded in its operands. + class TBAANode { + const MDNode *Node; + + public: + TBAANode() : Node(0) {} + explicit TBAANode(MDNode *N) : Node(N) {} + + /// getNode - Get the MDNode for this TBAANode. + const MDNode *getNode() const { return Node; } + + /// getParent - Get this TBAANode's Alias DAG parent. 
+ TBAANode getParent() const { + if (Node->getNumOperands() < 2) + return TBAANode(); + MDNode *P = dyn_cast<MDNode>(Node->getOperand(1)); + if (!P) + return TBAANode(); + // Ok, this node has a valid parent. Return it. + return TBAANode(P); + } + + /// TypeIsImmutable - Test if this TBAANode represents a type for objects + /// which are not modified (by any means) in the context where this + /// AliasAnalysis is relevant. + bool TypeIsImmutable() const { + if (Node->getNumOperands() < 3) + return false; + ConstantInt *CI = dyn_cast<ConstantInt>(Node->getOperand(2)); + if (!CI) + return false; + // TODO: Think about the encoding. + return CI->isOne(); + } + }; +} + +namespace { + /// TypeBasedAliasAnalysis - This is a simple alias analysis + /// implementation that uses type-based alias metadata to answer queries. + class TypeBasedAliasAnalysis : public ImmutablePass, + public AliasAnalysis { + public: + static char ID; // Class identification, replacement for typeinfo + TypeBasedAliasAnalysis() : ImmutablePass(ID) {} + + /// getAdjustedAnalysisPointer - This method is used when a pass implements + /// an analysis interface through multiple inheritance. If needed, it + /// should override this to adjust the this pointer as needed for the + /// specified pass info. + virtual void *getAdjustedAnalysisPointer(const void *PI) { + if (PI == &AliasAnalysis::ID) + return (AliasAnalysis*)this; + return this; + } + + private: + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual AliasResult alias(const Value *V1, unsigned V1Size, + const Value *V2, unsigned V2Size); + virtual bool pointsToConstantMemory(const Value *P); + }; +} // End of anonymous namespace + +// Register this pass... +char TypeBasedAliasAnalysis::ID = 0; +INITIALIZE_AG_PASS(TypeBasedAliasAnalysis, AliasAnalysis, "tbaa", + "Type-Based Alias Analysis", false, true, false); + +ImmutablePass *llvm::createTypeBasedAliasAnalysisPass() { + return new TypeBasedAliasAnalysis(); +} + +void +TypeBasedAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AliasAnalysis::getAnalysisUsage(AU); +} + +AliasAnalysis::AliasResult +TypeBasedAliasAnalysis::alias(const Value *A, unsigned ASize, + const Value *B, unsigned BSize) { + // Currently, metadata can only be attached to Instructions. + const Instruction *AI = dyn_cast<Instruction>(A); + if (!AI) return MayAlias; + const Instruction *BI = dyn_cast<Instruction>(B); + if (!BI) return MayAlias; + + // Get the attached MDNodes. If either value lacks a tbaa MDNode, we must + // be conservative. + MDNode *AM = + AI->getMetadata(AI->getParent()->getParent()->getParent() + ->getMDKindID("tbaa")); + if (!AM) return MayAlias; + MDNode *BM = + BI->getMetadata(BI->getParent()->getParent()->getParent() + ->getMDKindID("tbaa")); + if (!BM) return MayAlias; + + // Keep track of the root node for A and B. + TBAANode RootA, RootB; + + // Climb the DAG from A to see if we reach B. + for (TBAANode T(AM); ; ) { + if (T.getNode() == BM) + // B is an ancestor of A. + return MayAlias; + + RootA = T; + T = T.getParent(); + if (!T.getNode()) + break; + } + + // Climb the DAG from B to see if we reach A. + for (TBAANode T(BM); ; ) { + if (T.getNode() == AM) + // A is an ancestor of B. + return MayAlias; + + RootB = T; + T = T.getParent(); + if (!T.getNode()) + break; + } + + // Neither node is an ancestor of the other. + + // If they have the same root, then we've proved there's no alias.
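// For example (the metadata format is not yet stable), with nodes like:
//   !0 = metadata !{metadata !"omnipotent char"}        ; a root
//   !1 = metadata !{metadata !"int",   metadata !0}
//   !2 = metadata !{metadata !"float", metadata !0}
// neither climb from !1 nor from !2 reaches the other node, and both
// stop at the common root !0, so "int" and "float" accesses are NoAlias.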
+ if (RootA.getNode() == RootB.getNode()) + return NoAlias; + + // If they have different roots, they're part of different potentially + // unrelated type systems, so we must be conservative. + return MayAlias; +} + +bool TypeBasedAliasAnalysis::pointsToConstantMemory(const Value *P) { + // Currently, metadata can only be attached to Instructions. + const Instruction *I = dyn_cast<Instruction>(P); + if (!I) return false; + + MDNode *M = + I->getMetadata(I->getParent()->getParent()->getParent() + ->getMDKindID("tbaa")); + if (!M) return false; + + // If this is an "immutable" type, we can assume the pointer is pointing + // to constant memory. + return TBAANode(M).TypeIsImmutable(); +} diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index b4c9884a20ed8..181c9b01980c3 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -880,19 +880,20 @@ bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple, } Value *Mul0 = NULL; - Value *Mul1 = NULL; - bool M0 = ComputeMultiple(Op0, Base, Mul0, - LookThroughSExt, Depth+1); - bool M1 = ComputeMultiple(Op1, Base, Mul1, - LookThroughSExt, Depth+1); - - if (M0) { - if (isa<Constant>(Op1) && isa<Constant>(Mul0)) { - // V == Base * (Mul0 * Op1), so return (Mul0 * Op1) - Multiple = ConstantExpr::getMul(cast<Constant>(Mul0), - cast<Constant>(Op1)); - return true; - } + if (ComputeMultiple(Op0, Base, Mul0, LookThroughSExt, Depth+1)) { + if (Constant *Op1C = dyn_cast<Constant>(Op1)) + if (Constant *MulC = dyn_cast<Constant>(Mul0)) { + if (Op1C->getType()->getPrimitiveSizeInBits() < + MulC->getType()->getPrimitiveSizeInBits()) + Op1C = ConstantExpr::getZExt(Op1C, MulC->getType()); + if (Op1C->getType()->getPrimitiveSizeInBits() > + MulC->getType()->getPrimitiveSizeInBits()) + MulC = ConstantExpr::getZExt(MulC, Op1C->getType()); + + // V == Base * (Mul0 * Op1), so return (Mul0 * Op1) + Multiple = ConstantExpr::getMul(MulC, Op1C); + return true; + } if (ConstantInt *Mul0CI = dyn_cast<ConstantInt>(Mul0)) if (Mul0CI->getValue() == 1) { @@ -902,13 +903,21 @@ bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple, } } - if (M1) { - if (isa<Constant>(Op0) && isa<Constant>(Mul1)) { - // V == Base * (Mul1 * Op0), so return (Mul1 * Op0) - Multiple = ConstantExpr::getMul(cast<Constant>(Mul1), - cast<Constant>(Op0)); - return true; - } + Value *Mul1 = NULL; + if (ComputeMultiple(Op1, Base, Mul1, LookThroughSExt, Depth+1)) { + if (Constant *Op0C = dyn_cast<Constant>(Op0)) + if (Constant *MulC = dyn_cast<Constant>(Mul1)) { + if (Op0C->getType()->getPrimitiveSizeInBits() < + MulC->getType()->getPrimitiveSizeInBits()) + Op0C = ConstantExpr::getZExt(Op0C, MulC->getType()); + if (Op0C->getType()->getPrimitiveSizeInBits() > + MulC->getType()->getPrimitiveSizeInBits()) + MulC = ConstantExpr::getZExt(MulC, Op0C->getType()); + + // V == Base * (Mul1 * Op0), so return (Mul1 * Op0) + Multiple = ConstantExpr::getMul(MulC, Op0C); + return true; + } if (ConstantInt *Mul1CI = dyn_cast<ConstantInt>(Mul1)) if (Mul1CI->getValue() == 1) { @@ -973,195 +982,6 @@ bool llvm::CannotBeNegativeZero(const Value *V, unsigned Depth) { return false; } - -/// GetLinearExpression - Analyze the specified value as a linear expression: -/// "A*V + B", where A and B are constant integers. Return the scale and offset -/// values as APInts and return V as a Value*. The incoming Value is known to -/// have IntegerType. Note that this looks through extends, so the high bits -/// may not be represented in the result. 
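// For example, analyzing "sext i16 (4*%x + 12) to i32" (pseudo-IR) gives
// back %x with Scale = 4 and Offset = 12; the sext has been looked
// through, so only the low 16 bits of "Scale*%x + Offset" are meaningful.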
-static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset, - const TargetData *TD, unsigned Depth) { - assert(V->getType()->isIntegerTy() && "Not an integer value"); - - // Limit our recursion depth. - if (Depth == 6) { - Scale = 1; - Offset = 0; - return V; - } - - if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) { - if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) { - switch (BOp->getOpcode()) { - default: break; - case Instruction::Or: - // X|C == X+C if all the bits in C are unset in X. Otherwise we can't - // analyze it. - if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), TD)) - break; - // FALL THROUGH. - case Instruction::Add: - V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, TD, Depth+1); - Offset += RHSC->getValue(); - return V; - case Instruction::Mul: - V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, TD, Depth+1); - Offset *= RHSC->getValue(); - Scale *= RHSC->getValue(); - return V; - case Instruction::Shl: - V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, TD, Depth+1); - Offset <<= RHSC->getValue().getLimitedValue(); - Scale <<= RHSC->getValue().getLimitedValue(); - return V; - } - } - } - - // Since clients don't care about the high bits of the value, just scales and - // offsets, we can look through extensions. - if (isa<SExtInst>(V) || isa<ZExtInst>(V)) { - Value *CastOp = cast<CastInst>(V)->getOperand(0); - unsigned OldWidth = Scale.getBitWidth(); - unsigned SmallWidth = CastOp->getType()->getPrimitiveSizeInBits(); - Scale.trunc(SmallWidth); - Offset.trunc(SmallWidth); - Value *Result = GetLinearExpression(CastOp, Scale, Offset, TD, Depth+1); - Scale.zext(OldWidth); - Offset.zext(OldWidth); - return Result; - } - - Scale = 1; - Offset = 0; - return V; -} - -/// DecomposeGEPExpression - If V is a symbolic pointer expression, decompose it -/// into a base pointer with a constant offset and a number of scaled symbolic -/// offsets. -/// -/// The scaled symbolic offsets (represented by pairs of a Value* and a scale in -/// the VarIndices vector) are Value*'s that are known to be scaled by the -/// specified amount, but which may have other unrepresented high bits. As such, -/// the gep cannot necessarily be reconstructed from its decomposed form. -/// -/// When TargetData is around, this function is capable of analyzing everything -/// that Value::getUnderlyingObject() can look through. When not, it just looks -/// through pointer casts. -/// -const Value *llvm::DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, - SmallVectorImpl<std::pair<const Value*, int64_t> > &VarIndices, - const TargetData *TD) { - // Limit recursion depth to limit compile time in crazy cases. - unsigned MaxLookup = 6; - - BaseOffs = 0; - do { - // See if this is a bitcast or GEP. - const Operator *Op = dyn_cast<Operator>(V); - if (Op == 0) { - // The only non-operator case we can handle are GlobalAliases. - if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { - if (!GA->mayBeOverridden()) { - V = GA->getAliasee(); - continue; - } - } - return V; - } - - if (Op->getOpcode() == Instruction::BitCast) { - V = Op->getOperand(0); - continue; - } - - const GEPOperator *GEPOp = dyn_cast<GEPOperator>(Op); - if (GEPOp == 0) - return V; - - // Don't attempt to analyze GEPs over unsized objects. - if (!cast<PointerType>(GEPOp->getOperand(0)->getType()) - ->getElementType()->isSized()) - return V; - - // If we are lacking TargetData information, we can't compute the offets of - // elements computed by GEPs. 
However, we can handle bitcast equivalent - // GEPs. - if (!TD) { - if (!GEPOp->hasAllZeroIndices()) - return V; - V = GEPOp->getOperand(0); - continue; - } - - // Walk the indices of the GEP, accumulating them into BaseOff/VarIndices. - gep_type_iterator GTI = gep_type_begin(GEPOp); - for (User::const_op_iterator I = GEPOp->op_begin()+1, - E = GEPOp->op_end(); I != E; ++I) { - Value *Index = *I; - // Compute the (potentially symbolic) offset in bytes for this index. - if (const StructType *STy = dyn_cast<StructType>(*GTI++)) { - // For a struct, add the member offset. - unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue(); - if (FieldNo == 0) continue; - - BaseOffs += TD->getStructLayout(STy)->getElementOffset(FieldNo); - continue; - } - - // For an array/pointer, add the element offset, explicitly scaled. - if (ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) { - if (CIdx->isZero()) continue; - BaseOffs += TD->getTypeAllocSize(*GTI)*CIdx->getSExtValue(); - continue; - } - - uint64_t Scale = TD->getTypeAllocSize(*GTI); - - // Use GetLinearExpression to decompose the index into a C1*V+C2 form. - unsigned Width = cast<IntegerType>(Index->getType())->getBitWidth(); - APInt IndexScale(Width, 0), IndexOffset(Width, 0); - Index = GetLinearExpression(Index, IndexScale, IndexOffset, TD, 0); - - // The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale. - // This gives us an aggregate computation of (C1*Scale)*V + C2*Scale. - BaseOffs += IndexOffset.getZExtValue()*Scale; - Scale *= IndexScale.getZExtValue(); - - - // If we already had an occurrance of this index variable, merge this - // scale into it. For example, we want to handle: - // A[x][x] -> x*16 + x*4 -> x*20 - // This also ensures that 'x' only appears in the index list once. - for (unsigned i = 0, e = VarIndices.size(); i != e; ++i) { - if (VarIndices[i].first == Index) { - Scale += VarIndices[i].second; - VarIndices.erase(VarIndices.begin()+i); - break; - } - } - - // Make sure that we have a scale that makes sense for this target's - // pointer size. - if (unsigned ShiftBits = 64-TD->getPointerSizeInBits()) { - Scale <<= ShiftBits; - Scale >>= ShiftBits; - } - - if (Scale) - VarIndices.push_back(std::make_pair(Index, Scale)); - } - - // Analyze the base pointer next. - V = GEPOp->getOperand(0); - } while (--MaxLookup); - - // If the chain of expressions is too deep, just return early. - return V; -} - - // This is the recursive version of BuildSubAggregate. It takes a few different // arguments. Idxs is the index within the nested struct From that we are // looking at now (which is of type IndexedType). 
IdxSkip is the number of diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index f4c0e50fd94df..032753a3b2c61 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -493,6 +493,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(private); KEYWORD(linker_private); KEYWORD(linker_private_weak); + KEYWORD(linker_private_weak_def_auto); KEYWORD(internal); KEYWORD(available_externally); KEYWORD(linkonce); @@ -572,7 +573,6 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(type); KEYWORD(opaque); - KEYWORD(union); KEYWORD(eq); KEYWORD(ne); KEYWORD(slt); KEYWORD(sgt); KEYWORD(sle); KEYWORD(sge); KEYWORD(ult); KEYWORD(ugt); KEYWORD(ule); KEYWORD(uge); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 221b994db55fa..f21a065473b62 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -199,6 +199,7 @@ bool LLParser::ParseTopLevelEntities() { case lltok::kw_private: // OptionalLinkage case lltok::kw_linker_private: // OptionalLinkage case lltok::kw_linker_private_weak: // OptionalLinkage + case lltok::kw_linker_private_weak_def_auto: // OptionalLinkage case lltok::kw_internal: // OptionalLinkage case lltok::kw_weak: // OptionalLinkage case lltok::kw_weak_odr: // OptionalLinkage @@ -517,11 +518,7 @@ bool LLParser::ParseMDNodeID(MDNode *&Result) { if (Result) return false; // Otherwise, create MDNode forward reference. - - // FIXME: This is not unique enough! - std::string FwdRefName = "llvm.mdnode.fwdref." + utostr(MID); - Value *V = MDString::get(Context, FwdRefName); - MDNode *FwdNode = MDNode::get(Context, &V, 1); + MDNode *FwdNode = MDNode::getTemporary(Context, 0, 0); ForwardRefMDNodes[MID] = std::make_pair(FwdNode, Lex.getLoc()); if (NumberedMetadata.size() <= MID) @@ -543,27 +540,20 @@ bool LLParser::ParseNamedMetadata() { ParseToken(lltok::lbrace, "Expected '{' here")) return true; - SmallVector<MDNode *, 8> Elts; + NamedMDNode *NMD = M->getOrInsertNamedMetadata(Name); if (Lex.getKind() != lltok::rbrace) do { - // Null is a special case since it is typeless. - if (EatIfPresent(lltok::kw_null)) { - Elts.push_back(0); - continue; - } - if (ParseToken(lltok::exclaim, "Expected '!' 
here")) return true; MDNode *N = 0; if (ParseMDNodeID(N)) return true; - Elts.push_back(N); + NMD->addOperand(N); } while (EatIfPresent(lltok::comma)); if (ParseToken(lltok::rbrace, "expected end of metadata node")) return true; - NamedMDNode::Create(Context, Name, Elts.data(), Elts.size(), M); return false; } @@ -592,7 +582,9 @@ bool LLParser::ParseStandaloneMetadata() { std::map<unsigned, std::pair<TrackingVH<MDNode>, LocTy> >::iterator FI = ForwardRefMDNodes.find(MetadataID); if (FI != ForwardRefMDNodes.end()) { - FI->second.first->replaceAllUsesWith(Init); + MDNode *Temp = FI->second.first; + Temp->replaceAllUsesWith(Init); + MDNode::deleteTemporary(Temp); ForwardRefMDNodes.erase(FI); assert(NumberedMetadata[MetadataID] == Init && "Tracking VH didn't work"); @@ -632,7 +624,8 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, Linkage != GlobalValue::InternalLinkage && Linkage != GlobalValue::PrivateLinkage && Linkage != GlobalValue::LinkerPrivateLinkage && - Linkage != GlobalValue::LinkerPrivateWeakLinkage) + Linkage != GlobalValue::LinkerPrivateWeakLinkage && + Linkage != GlobalValue::LinkerPrivateWeakDefAutoLinkage) return Error(LinkageLoc, "invalid linkage type for alias"); Constant *Aliasee; @@ -1017,6 +1010,7 @@ bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) { /// ::= 'private' /// ::= 'linker_private' /// ::= 'linker_private_weak' +/// ::= 'linker_private_weak_def_auto' /// ::= 'internal' /// ::= 'weak' /// ::= 'weak_odr' @@ -1038,6 +1032,9 @@ bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) { case lltok::kw_linker_private_weak: Res = GlobalValue::LinkerPrivateWeakLinkage; break; + case lltok::kw_linker_private_weak_def_auto: + Res = GlobalValue::LinkerPrivateWeakDefAutoLinkage; + break; case lltok::kw_internal: Res = GlobalValue::InternalLinkage; break; case lltok::kw_weak: Res = GlobalValue::WeakAnyLinkage; break; case lltok::kw_weak_odr: Res = GlobalValue::WeakODRLinkage; break; @@ -1120,29 +1117,44 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) { /// ParseInstructionMetadata /// ::= !dbg !42 (',' !dbg !57)* -bool LLParser::ParseInstructionMetadata(Instruction *Inst) { +bool LLParser::ParseInstructionMetadata(Instruction *Inst, + PerFunctionState *PFS) { do { if (Lex.getKind() != lltok::MetadataVar) return TokError("expected metadata after comma"); std::string Name = Lex.getStrVal(); + unsigned MDK = M->getMDKindID(Name.c_str()); Lex.Lex(); MDNode *Node; unsigned NodeID; SMLoc Loc = Lex.getLoc(); - if (ParseToken(lltok::exclaim, "expected '!' here") || - ParseMDNodeID(Node, NodeID)) + + if (ParseToken(lltok::exclaim, "expected '!' here")) return true; - unsigned MDK = M->getMDKindID(Name.c_str()); - if (Node) { - // If we got the node, add it to the instruction. - Inst->setMetadata(MDK, Node); + // This code is similar to that of ParseMetadataValue, however it needs to + // have special-case code for a forward reference; see the comments on + // ForwardRefInstMetadata for details. Also, MDStrings are not supported + // at the top level here. + if (Lex.getKind() == lltok::lbrace) { + ValID ID; + if (ParseMetadataListValue(ID, PFS)) + return true; + assert(ID.Kind == ValID::t_MDNode); + Inst->setMetadata(MDK, ID.MDNodeVal); } else { - MDRef R = { Loc, MDK, NodeID }; - // Otherwise, remember that this should be resolved later. - ForwardRefInstMetadata[Inst].push_back(R); + if (ParseMDNodeID(Node, NodeID)) + return true; + if (Node) { + // If we got the node, add it to the instruction. 
+ Inst->setMetadata(MDK, Node); + } else { + MDRef R = { Loc, MDK, NodeID }; + // Otherwise, remember that this should be resolved later. + ForwardRefInstMetadata[Inst].push_back(R); + } } // If this is the end of the list, we're done. @@ -1161,6 +1173,8 @@ bool LLParser::ParseOptionalAlignment(unsigned &Alignment) { if (ParseUInt32(Alignment)) return true; if (!isPowerOf2_32(Alignment)) return Error(AlignLoc, "alignment is not a power of two"); + if (Alignment > Value::MaximumAlignment) + return Error(AlignLoc, "huge alignments are not supported yet"); return false; } @@ -1183,6 +1197,7 @@ bool LLParser::ParseOptionalCommaAlign(unsigned &Alignment, if (Lex.getKind() != lltok::kw_align) return Error(Lex.getLoc(), "expected metadata or 'align'"); + LocTy AlignLoc = Lex.getLoc(); if (ParseOptionalAlignment(Alignment)) return true; } @@ -1344,11 +1359,6 @@ bool LLParser::ParseTypeRec(PATypeHolder &Result) { if (ParseStructType(Result, false)) return true; break; - case lltok::kw_union: - // TypeRec ::= 'union' '{' ... '}' - if (ParseUnionType(Result)) - return true; - break; case lltok::lsquare: // TypeRec ::= '[' ... ']' Lex.Lex(); // eat the lsquare. @@ -1658,38 +1668,6 @@ bool LLParser::ParseStructType(PATypeHolder &Result, bool Packed) { return false; } -/// ParseUnionType -/// TypeRec -/// ::= 'union' '{' TypeRec (',' TypeRec)* '}' -bool LLParser::ParseUnionType(PATypeHolder &Result) { - assert(Lex.getKind() == lltok::kw_union); - Lex.Lex(); // Consume the 'union' - - if (ParseToken(lltok::lbrace, "'{' expected after 'union'")) return true; - - SmallVector<PATypeHolder, 8> ParamsList; - do { - LocTy EltTyLoc = Lex.getLoc(); - if (ParseTypeRec(Result)) return true; - ParamsList.push_back(Result); - - if (Result->isVoidTy()) - return Error(EltTyLoc, "union element can not have void type"); - if (!UnionType::isValidElementType(Result)) - return Error(EltTyLoc, "invalid element type for union"); - - } while (EatIfPresent(lltok::comma)) ; - - if (ParseToken(lltok::rbrace, "expected '}' at end of union")) - return true; - - SmallVector<const Type*, 8> ParamsListTy; - for (unsigned i = 0, e = ParamsList.size(); i != e; ++i) - ParamsListTy.push_back(ParamsList[i].get()); - Result = HandleUpRefs(UnionType::get(&ParamsListTy[0], ParamsListTy.size())); - return false; -} - /// ParseArrayVectorType - Parse an array or vector type, assuming the first /// token has already been consumed. /// TypeRec @@ -2504,6 +2482,20 @@ bool LLParser::ParseGlobalValueVector(SmallVectorImpl<Constant*> &Elts) { return false; } +bool LLParser::ParseMetadataListValue(ValID &ID, PerFunctionState *PFS) { + assert(Lex.getKind() == lltok::lbrace); + Lex.Lex(); + + SmallVector<Value*, 16> Elts; + if (ParseMDNodeVector(Elts, PFS) || + ParseToken(lltok::rbrace, "expected end of metadata node")) + return true; + + ID.MDNodeVal = MDNode::get(Context, Elts.data(), Elts.size()); + ID.Kind = ValID::t_MDNode; + return false; +} + /// ParseMetadataValue /// ::= !42 /// ::= !{...} @@ -2514,16 +2506,8 @@ bool LLParser::ParseMetadataValue(ValID &ID, PerFunctionState *PFS) { // MDNode: // !{ ... 
} - if (EatIfPresent(lltok::lbrace)) { - SmallVector<Value*, 16> Elts; - if (ParseMDNodeVector(Elts, PFS) || - ParseToken(lltok::rbrace, "expected end of metadata node")) - return true; - - ID.MDNodeVal = MDNode::get(Context, Elts.data(), Elts.size()); - ID.Kind = ValID::t_MDNode; - return false; - } + if (Lex.getKind() == lltok::lbrace) + return ParseMetadataListValue(ID, PFS); // Standalone metadata reference // !42 @@ -2635,16 +2619,8 @@ bool LLParser::ConvertValIDToValue(const Type *Ty, ValID &ID, Value *&V, V = Constant::getNullValue(Ty); return false; case ValID::t_Constant: - if (ID.ConstantVal->getType() != Ty) { - // Allow a constant struct with a single member to be converted - // to a union, if the union has a member which is the same type - // as the struct member. - if (const UnionType* utype = dyn_cast<UnionType>(Ty)) { - return ParseUnionValue(utype, ID, V); - } - + if (ID.ConstantVal->getType() != Ty) return Error(ID.Loc, "constant expression type mismatch"); - } V = ID.ConstantVal; return false; @@ -2675,22 +2651,6 @@ bool LLParser::ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc, return false; } -bool LLParser::ParseUnionValue(const UnionType* utype, ValID &ID, Value *&V) { - if (const StructType* stype = dyn_cast<StructType>(ID.ConstantVal->getType())) { - if (stype->getNumContainedTypes() != 1) - return Error(ID.Loc, "constant expression type mismatch"); - int index = utype->getElementTypeIndex(stype->getContainedType(0)); - if (index < 0) - return Error(ID.Loc, "initializer type is not a member of the union"); - - V = ConstantUnion::get( - utype, cast<Constant>(ID.ConstantVal->getOperand(0))); - return false; - } - - return Error(ID.Loc, "constant expression type mismatch"); -} - /// FunctionHeader /// ::= OptionalLinkage OptionalVisibility OptionalCallingConv OptRetAttrs @@ -2724,6 +2684,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { case GlobalValue::PrivateLinkage: case GlobalValue::LinkerPrivateLinkage: case GlobalValue::LinkerPrivateWeakLinkage: + case GlobalValue::LinkerPrivateWeakDefAutoLinkage: case GlobalValue::InternalLinkage: case GlobalValue::AvailableExternallyLinkage: case GlobalValue::LinkOnceAnyLinkage: @@ -2980,7 +2941,7 @@ bool LLParser::ParseBasicBlock(PerFunctionState &PFS) { // With a normal result, we check to see if the instruction is followed by // a comma and metadata. if (EatIfPresent(lltok::comma)) - if (ParseInstructionMetadata(Inst)) + if (ParseInstructionMetadata(Inst, &PFS)) return true; break; case InstExtraComma: @@ -2988,7 +2949,7 @@ bool LLParser::ParseBasicBlock(PerFunctionState &PFS) { // If the instruction parser ate an extra comma at the end of it, it // *must* be followed by metadata. - if (ParseInstructionMetadata(Inst)) + if (ParseInstructionMetadata(Inst, &PFS)) return true; break; } diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h index f765a2ae4e6ce..404cec3ed7c74 100644 --- a/lib/AsmParser/LLParser.h +++ b/lib/AsmParser/LLParser.h @@ -32,7 +32,6 @@ namespace llvm { class GlobalValue; class MDString; class MDNode; - class UnionType; /// ValID - Represents a reference of a definition of some sort with no type. /// There are several cases where we have to parse the value but where the @@ -80,6 +79,14 @@ namespace llvm { // Instruction metadata resolution. Each instruction can have a list of // MDRef info associated with them. 
+ // + // The simpler approach of just creating temporary MDNodes and then calling + // RAUW on them when the definition is processed doesn't work because some + // instruction metadata kinds, such as dbg, get stored in the IR in an + // "optimized" format which doesn't participate in the normal value use + // lists. This means that RAUW doesn't work, even on temporary MDNodes + // which otherwise support RAUW. Instead, we defer resolving MDNode + // references until the definitions have been processed. struct MDRef { SMLoc Loc; unsigned MDKind, MDSlot; @@ -180,7 +187,6 @@ namespace llvm { bool ParseOptionalCallingConv(CallingConv::ID &CC); bool ParseOptionalAlignment(unsigned &Alignment); bool ParseOptionalStackAlignment(unsigned &Alignment); - bool ParseInstructionMetadata(Instruction *Inst); bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma); bool ParseIndexList(SmallVectorImpl<unsigned> &Indices,bool &AteExtraComma); bool ParseIndexList(SmallVectorImpl<unsigned> &Indices) { @@ -222,7 +228,6 @@ namespace llvm { } bool ParseTypeRec(PATypeHolder &H); bool ParseStructType(PATypeHolder &H, bool Packed); - bool ParseUnionType(PATypeHolder &H); bool ParseArrayVectorType(PATypeHolder &H, bool isVector); bool ParseFunctionType(PATypeHolder &Result); PATypeHolder HandleUpRefs(const Type *Ty); @@ -291,7 +296,6 @@ namespace llvm { return ParseTypeAndBasicBlock(BB, Loc, PFS); } - bool ParseUnionValue(const UnionType* utype, ValID &ID, Value *&V); struct ParamInfo { LocTy Loc; @@ -308,8 +312,10 @@ namespace llvm { bool ParseGlobalValue(const Type *Ty, Constant *&V); bool ParseGlobalTypeAndValue(Constant *&V); bool ParseGlobalValueVector(SmallVectorImpl<Constant*> &Elts); + bool ParseMetadataListValue(ValID &ID, PerFunctionState *PFS); bool ParseMetadataValue(ValID &ID, PerFunctionState *PFS); bool ParseMDNodeVector(SmallVectorImpl<Value*> &, PerFunctionState *PFS); + bool ParseInstructionMetadata(Instruction *Inst, PerFunctionState *PFS); // Function Parsing. 
struct ArgInfo { diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index 2703134ec1a96..61f93a4274981 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -37,7 +37,8 @@ namespace lltok { kw_declare, kw_define, kw_global, kw_constant, - kw_private, kw_linker_private, kw_linker_private_weak, kw_internal, + kw_private, kw_linker_private, kw_linker_private_weak, + kw_linker_private_weak_def_auto, kw_internal, kw_linkonce, kw_linkonce_odr, kw_weak, kw_weak_odr, kw_appending, kw_dllimport, kw_dllexport, kw_common, kw_available_externally, kw_default, kw_hidden, kw_protected, @@ -97,7 +98,6 @@ namespace lltok { kw_type, kw_opaque, - kw_union, kw_eq, kw_ne, kw_slt, kw_sgt, kw_sle, kw_sge, kw_ult, kw_ugt, kw_ule, kw_uge, kw_oeq, kw_one, kw_olt, kw_ogt, kw_ole, kw_oge, kw_ord, kw_uno, diff --git a/lib/AsmParser/Parser.cpp b/lib/AsmParser/Parser.cpp index e511cbe29c756..e7cef9b5c3c58 100644 --- a/lib/AsmParser/Parser.cpp +++ b/lib/AsmParser/Parser.cpp @@ -45,8 +45,7 @@ Module *llvm::ParseAssemblyFile(const std::string &Filename, SMDiagnostic &Err, MemoryBuffer *F = MemoryBuffer::getFileOrSTDIN(Filename.c_str(), &ErrorStr); if (F == 0) { Err = SMDiagnostic(Filename, - "Could not open input file '" + Filename + "': " + - ErrorStr); + "Could not open input file: " + ErrorStr); return 0; } diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index b3f0776d29d54..830c79aa3b54e 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -39,6 +39,7 @@ void BitcodeReader::FreeState() { std::vector<BasicBlock*>().swap(FunctionBBs); std::vector<Function*>().swap(FunctionsWithBodies); DeferredFunctionInfo.clear(); + MDKindMap.clear(); } //===----------------------------------------------------------------------===// @@ -76,6 +77,7 @@ static GlobalValue::LinkageTypes GetDecodedLinkage(unsigned Val) { case 12: return GlobalValue::AvailableExternallyLinkage; case 13: return GlobalValue::LinkerPrivateLinkage; case 14: return GlobalValue::LinkerPrivateWeakLinkage; + case 15: return GlobalValue::LinkerPrivateWeakDefAutoLinkage; } } @@ -295,8 +297,6 @@ void BitcodeReaderValueList::ResolveConstantForwardRefs() { } else if (ConstantStruct *UserCS = dyn_cast<ConstantStruct>(UserC)) { NewC = ConstantStruct::get(Context, &NewOps[0], NewOps.size(), UserCS->getType()->isPacked()); - } else if (ConstantUnion *UserCU = dyn_cast<ConstantUnion>(UserC)) { - NewC = ConstantUnion::get(UserCU->getType(), NewOps[0]); } else if (isa<ConstantVector>(UserC)) { NewC = ConstantVector::get(&NewOps[0], NewOps.size()); } else { @@ -332,9 +332,9 @@ void BitcodeReaderMDValueList::AssignValue(Value *V, unsigned Idx) { } // If there was a forward reference to this value, replace it. - Value *PrevVal = OldV; + MDNode *PrevVal = cast<MDNode>(OldV); OldV->replaceAllUsesWith(V); - delete PrevVal; + MDNode::deleteTemporary(PrevVal); // Deleting PrevVal sets Idx value in MDValuePtrs to null. Set new // value for Idx. MDValuePtrs[Idx] = V; @@ -350,7 +350,7 @@ Value *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) { } // Create and return a placeholder, which will later be RAUW'd. 
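// The temporary-MDNode placeholder pattern used below (names illustrative):
//   MDNode *Placeholder = MDNode::getTemporary(Context, 0, 0);
//   ...
//   Placeholder->replaceAllUsesWith(RealNode); // once the node is parsed
//   MDNode::deleteTemporary(Placeholder);
// Temporary nodes are not uniqued, so, unlike ordinary MDNodes, they can
// safely be RAUW'd and destroyed.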
- Value *V = new Argument(Type::getMetadataTy(Context)); + Value *V = MDNode::getTemporary(Context, 0, 0); MDValuePtrs[Idx] = V; return V; } @@ -589,13 +589,6 @@ bool BitcodeReader::ParseTypeTable() { ResultTy = StructType::get(Context, EltTys, Record[0]); break; } - case bitc::TYPE_CODE_UNION: { // UNION: [eltty x N] - SmallVector<const Type*, 8> EltTys; - for (unsigned i = 0, e = Record.size(); i != e; ++i) - EltTys.push_back(getTypeByID(Record[i], true)); - ResultTy = UnionType::get(&EltTys[0], EltTys.size()); - break; - } case bitc::TYPE_CODE_ARRAY: // ARRAY: [numelts, eltty] if (Record.size() < 2) return Error("Invalid ARRAY type record"); @@ -781,7 +774,8 @@ bool BitcodeReader::ParseMetadata() { bool IsFunctionLocal = false; // Read a record. Record.clear(); - switch (Stream.ReadRecord(Code, Record)) { + Code = Stream.ReadRecord(Code, Record); + switch (Code) { default: // Default behavior: ignore. break; case bitc::METADATA_NAME: { @@ -794,34 +788,46 @@ bool BitcodeReader::ParseMetadata() { Record.clear(); Code = Stream.ReadCode(); - // METADATA_NAME is always followed by METADATA_NAMED_NODE. - if (Stream.ReadRecord(Code, Record) != bitc::METADATA_NAMED_NODE) + // METADATA_NAME is always followed by METADATA_NAMED_NODE2. + // Or METADATA_NAMED_NODE in LLVM 2.7. FIXME: Remove this in LLVM 3.0. + unsigned NextBitCode = Stream.ReadRecord(Code, Record); + if (NextBitCode == bitc::METADATA_NAMED_NODE) { + LLVM2_7MetadataDetected = true; + } else if (NextBitCode != bitc::METADATA_NAMED_NODE2) assert ( 0 && "Invalid Named Metadata record"); // Read named metadata elements. unsigned Size = Record.size(); - SmallVector<MDNode *, 8> Elts; + NamedMDNode *NMD = TheModule->getOrInsertNamedMetadata(Name); for (unsigned i = 0; i != Size; ++i) { - if (Record[i] == ~0U) { - Elts.push_back(NULL); - continue; - } MDNode *MD = dyn_cast<MDNode>(MDValueList.getValueFwdRef(Record[i])); if (MD == 0) return Error("Malformed metadata record"); - Elts.push_back(MD); + NMD->addOperand(MD); } - Value *V = NamedMDNode::Create(Context, Name.str(), Elts.data(), - Elts.size(), TheModule); - MDValueList.AssignValue(V, NextMDValueNo++); + // Backwards compatibility hack: NamedMDValues used to be Values, + // and they got their own slots in the value numbering. They are no + // longer Values, however we still need to account for them in the + // numbering in order to be able to read old bitcode files. + // FIXME: Remove this in LLVM 3.0. + if (LLVM2_7MetadataDetected) + MDValueList.AssignValue(0, NextMDValueNo++); break; } - case bitc::METADATA_FN_NODE: + case bitc::METADATA_FN_NODE: // FIXME: Remove in LLVM 3.0. + case bitc::METADATA_FN_NODE2: IsFunctionLocal = true; // fall-through - case bitc::METADATA_NODE: { + case bitc::METADATA_NODE: // FIXME: Remove in LLVM 3.0. + case bitc::METADATA_NODE2: { + + // Detect 2.7-era metadata. + // FIXME: Remove in LLVM 3.0.
+ if (Code == bitc::METADATA_FN_NODE || Code == bitc::METADATA_NODE) + LLVM2_7MetadataDetected = true; + if (Record.size() % 2 == 1) - return Error("Invalid METADATA_NODE record"); + return Error("Invalid METADATA_NODE2 record"); unsigned Size = Record.size(); SmallVector<Value*, 8> Elts; @@ -859,13 +865,12 @@ bool BitcodeReader::ParseMetadata() { SmallString<8> Name; Name.resize(RecordLength-1); unsigned Kind = Record[0]; - (void) Kind; for (unsigned i = 1; i != RecordLength; ++i) Name[i-1] = Record[i]; unsigned NewKind = TheModule->getMDKindID(Name.str()); - assert(Kind == NewKind && - "FIXME: Unable to handle custom metadata mismatch!");(void)NewKind; + if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second) + return Error("Conflicting METADATA_KIND records"); break; } } @@ -1020,11 +1025,6 @@ bool BitcodeReader::ParseConstants() { Elts.push_back(ValueList.getConstantFwdRef(Record[i], STy->getElementType(i))); V = ConstantStruct::get(STy, Elts); - } else if (const UnionType *UnTy = dyn_cast<UnionType>(CurTy)) { - uint64_t Index = Record[0]; - Constant *Val = ValueList.getConstantFwdRef(Record[1], - UnTy->getElementType(Index)); - V = ConstantUnion::get(UnTy, Val); } else if (const ArrayType *ATy = dyn_cast<ArrayType>(CurTy)) { const Type *EltTy = ATy->getElementType(); for (unsigned i = 0; i != Size; ++i) @@ -1297,6 +1297,12 @@ bool BitcodeReader::ParseModule() { UpgradedIntrinsics.push_back(std::make_pair(FI, NewFn)); } + // Look for global variables which need to be renamed. + for (Module::global_iterator + GI = TheModule->global_begin(), GE = TheModule->global_end(); + GI != GE; ++GI) + UpgradeGlobalVariable(GI); + // Force deallocation of memory for these vectors to favor the client that // want lazy deserialization. std::vector<std::pair<GlobalVariable*, unsigned> >().swap(GlobalInits); @@ -1614,15 +1620,22 @@ bool BitcodeReader::ParseMetadataAttachment() { switch (Stream.ReadRecord(Code, Record)) { default: // Default behavior: ignore. break; - case bitc::METADATA_ATTACHMENT: { + // FIXME: Remove in LLVM 3.0. + case bitc::METADATA_ATTACHMENT: + LLVM2_7MetadataDetected = true; + case bitc::METADATA_ATTACHMENT2: { unsigned RecordLength = Record.size(); if (Record.empty() || (RecordLength - 1) % 2 == 1) return Error ("Invalid METADATA_ATTACHMENT reader!"); Instruction *Inst = InstructionList[Record[0]]; for (unsigned i = 1; i != RecordLength; i = i+2) { unsigned Kind = Record[i]; + DenseMap<unsigned, unsigned>::iterator I = + MDKindMap.find(Kind); + if (I == MDKindMap.end()) + return Error("Invalid metadata kind ID"); Value *Node = MDValueList.getValueFwdRef(Record[i+1]); - Inst->setMetadata(Kind, cast<MDNode>(Node)); + Inst->setMetadata(I->second, cast<MDNode>(Node)); } break; } @@ -1638,6 +1651,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { InstructionList.clear(); unsigned ModuleValueListSize = ValueList.size(); + unsigned ModuleMDValueListSize = MDValueList.size(); // Add all the function arguments to the value table. for(Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) @@ -1722,7 +1736,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { I = 0; continue; - case bitc::FUNC_CODE_DEBUG_LOC: { // DEBUG_LOC: [line, col, scope, ia] + // FIXME: Remove this in LLVM 3.0. + case bitc::FUNC_CODE_DEBUG_LOC: + LLVM2_7MetadataDetected = true; + case bitc::FUNC_CODE_DEBUG_LOC2: { // DEBUG_LOC: [line, col, scope, ia] I = 0; // Get the last instruction emitted. 
if (CurBB && !CurBB->empty()) I = &CurBB->back(); @@ -1988,6 +2005,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { } while(OpNum != Record.size()); const Type *ReturnType = F->getReturnType(); + // Handle multiple return values. FIXME: Remove in LLVM 3.0. if (Vs.size() > 1 || (ReturnType->isStructTy() && (Vs.empty() || Vs[0]->getType() != ReturnType))) { @@ -2183,7 +2201,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { } case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align] // For backward compatibility, tolerate a lack of an opty, and use i32. - // LLVM 3.0: Remove this. + // Remove this in LLVM 3.0. if (Record.size() < 3 || Record.size() > 4) return Error("Invalid ALLOCA record"); unsigned OpNum = 0; @@ -2236,7 +2254,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { InstructionList.push_back(I); break; } - case bitc::FUNC_CODE_INST_CALL: { + // FIXME: Remove this in LLVM 3.0. + case bitc::FUNC_CODE_INST_CALL: + LLVM2_7MetadataDetected = true; + case bitc::FUNC_CODE_INST_CALL2: { // CALL: [paramattrs, cc, fnty, fnid, arg0, arg1...] if (Record.size() < 3) return Error("Invalid CALL record"); @@ -2324,7 +2345,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { if (A->getParent() == 0) { // We found at least one unresolved value. Nuke them all to avoid leaks. for (unsigned i = ModuleValueListSize, e = ValueList.size(); i != e; ++i){ - if ((A = dyn_cast<Argument>(ValueList.back())) && A->getParent() == 0) { + if ((A = dyn_cast<Argument>(ValueList[i])) && A->getParent() == 0) { A->replaceAllUsesWith(UndefValue::get(A->getType())); delete A; } @@ -2333,6 +2354,9 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { } } + // FIXME: Check for unresolved forward-declared metadata references + // and clean up leaks. + // See if anything took the address of blocks in this function. If so, // resolve them now. DenseMap<Function*, std::vector<BlockAddrRefTy> >::iterator BAFRI = @@ -2352,8 +2376,21 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { BlockAddrFwdRefs.erase(BAFRI); } + // FIXME: Remove this in LLVM 3.0. + unsigned NewMDValueListSize = MDValueList.size(); + // Trim the value list down to the size it was before we parsed this function. ValueList.shrinkTo(ModuleValueListSize); + MDValueList.shrinkTo(ModuleMDValueListSize); + + // Backwards compatibility hack: Function-local metadata numbers + // were previously not reset between functions. This is now fixed, + // however we still need to understand the old numbering in order + // to be able to read old bitcode files. + // FIXME: Remove this in LLVM 3.0. + if (LLVM2_7MetadataDetected) + MDValueList.resize(NewMDValueListSize); + std::vector<BasicBlock*>().swap(FunctionBBs); return false; diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h index 55c71f7c886fe..053121bdad6e5 100644 --- a/lib/Bitcode/Reader/BitcodeReader.h +++ b/lib/Bitcode/Reader/BitcodeReader.h @@ -156,6 +156,9 @@ class BitcodeReader : public GVMaterializer { // stored here with their replacement function. typedef std::vector<std::pair<Function*, Function*> > UpgradedIntrinsicMap; UpgradedIntrinsicMap UpgradedIntrinsics; + + // Map the bitcode's custom MDKind ID to the Module's MDKind ID. + DenseMap<unsigned, unsigned> MDKindMap; // After the module header has been read, the FunctionsWithBodies list is // reversed. This keeps track of whether we've done this yet. @@ -170,11 +173,18 @@ class BitcodeReader : public GVMaterializer { /// are resolved lazily when functions are loaded. 
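/// For example (illustrative IR), an initializer such as:
///   @gv = global i8* blockaddress(@func, %bb)
/// may be parsed before the body of @func has been materialized; the
/// reference is recorded here and patched once @func's blocks exist.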
typedef std::pair<unsigned, GlobalVariable*> BlockAddrRefTy; DenseMap<Function*, std::vector<BlockAddrRefTy> > BlockAddrFwdRefs; + + /// LLVM2_7MetadataDetected - True if metadata produced by LLVM 2.7 or + /// earlier was detected, in which case we behave slightly differently, + /// for compatibility. + /// FIXME: Remove in LLVM 3.0. + bool LLVM2_7MetadataDetected; public: explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C) : Context(C), TheModule(0), Buffer(buffer), BufferOwned(false), - ErrorString(0), ValueList(C), MDValueList(C) { + ErrorString(0), ValueList(C), MDValueList(C), + LLVM2_7MetadataDetected(false) { HasReversedFunctionsWithBodies = false; } ~BitcodeReader() { diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index fa1b2c4bee2b2..7b6fc6cd928df 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -181,14 +181,6 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) { Log2_32_Ceil(VE.getTypes().size()+1))); unsigned StructAbbrev = Stream.EmitAbbrev(Abbv); - // Abbrev for TYPE_CODE_UNION. - Abbv = new BitCodeAbbrev(); - Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_UNION)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - Log2_32_Ceil(VE.getTypes().size()+1))); - unsigned UnionAbbrev = Stream.EmitAbbrev(Abbv); - // Abbrev for TYPE_CODE_ARRAY. Abbv = new BitCodeAbbrev(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_ARRAY)); @@ -258,17 +250,6 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) { AbbrevToUse = StructAbbrev; break; } - case Type::UnionTyID: { - const UnionType *UT = cast<UnionType>(T); - // UNION: [eltty x N] - Code = bitc::TYPE_CODE_UNION; - // Output all of the element types. 
- for (UnionType::element_iterator I = UT->element_begin(), - E = UT->element_end(); I != E; ++I) - TypeVals.push_back(VE.getTypeID(*I)); - AbbrevToUse = UnionAbbrev; - break; - } case Type::ArrayTyID: { const ArrayType *AT = cast<ArrayType>(T); // ARRAY: [numelts, eltty] @@ -299,21 +280,22 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) { static unsigned getEncodedLinkage(const GlobalValue *GV) { switch (GV->getLinkage()) { default: llvm_unreachable("Invalid linkage!"); - case GlobalValue::ExternalLinkage: return 0; - case GlobalValue::WeakAnyLinkage: return 1; - case GlobalValue::AppendingLinkage: return 2; - case GlobalValue::InternalLinkage: return 3; - case GlobalValue::LinkOnceAnyLinkage: return 4; - case GlobalValue::DLLImportLinkage: return 5; - case GlobalValue::DLLExportLinkage: return 6; - case GlobalValue::ExternalWeakLinkage: return 7; - case GlobalValue::CommonLinkage: return 8; - case GlobalValue::PrivateLinkage: return 9; - case GlobalValue::WeakODRLinkage: return 10; - case GlobalValue::LinkOnceODRLinkage: return 11; - case GlobalValue::AvailableExternallyLinkage: return 12; - case GlobalValue::LinkerPrivateLinkage: return 13; - case GlobalValue::LinkerPrivateWeakLinkage: return 14; + case GlobalValue::ExternalLinkage: return 0; + case GlobalValue::WeakAnyLinkage: return 1; + case GlobalValue::AppendingLinkage: return 2; + case GlobalValue::InternalLinkage: return 3; + case GlobalValue::LinkOnceAnyLinkage: return 4; + case GlobalValue::DLLImportLinkage: return 5; + case GlobalValue::DLLExportLinkage: return 6; + case GlobalValue::ExternalWeakLinkage: return 7; + case GlobalValue::CommonLinkage: return 8; + case GlobalValue::PrivateLinkage: return 9; + case GlobalValue::WeakODRLinkage: return 10; + case GlobalValue::LinkOnceODRLinkage: return 11; + case GlobalValue::AvailableExternallyLinkage: return 12; + case GlobalValue::LinkerPrivateLinkage: return 13; + case GlobalValue::LinkerPrivateWeakLinkage: return 14; + case GlobalValue::LinkerPrivateWeakDefAutoLinkage: return 15; } } @@ -503,13 +485,14 @@ static void WriteMDNode(const MDNode *N, Record.push_back(0); } } - unsigned MDCode = N->isFunctionLocal() ? bitc::METADATA_FN_NODE : - bitc::METADATA_NODE; + unsigned MDCode = N->isFunctionLocal() ? bitc::METADATA_FN_NODE2 : + bitc::METADATA_NODE2; Stream.EmitRecord(MDCode, Record, 0); Record.clear(); } -static void WriteModuleMetadata(const ValueEnumerator &VE, +static void WriteModuleMetadata(const Module *M, + const ValueEnumerator &VE, BitstreamWriter &Stream) { const ValueEnumerator::ValueList &Vals = VE.getMDValues(); bool StartedMetadataBlock = false; @@ -544,29 +527,30 @@ static void WriteModuleMetadata(const ValueEnumerator &VE, // Emit the finished record. Stream.EmitRecord(bitc::METADATA_STRING, Record, MDSAbbrev); Record.clear(); - } else if (const NamedMDNode *NMD = dyn_cast<NamedMDNode>(Vals[i].first)) { - if (!StartedMetadataBlock) { - Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3); - StartedMetadataBlock = true; - } - - // Write name. - StringRef Str = NMD->getName(); - for (unsigned i = 0, e = Str.size(); i != e; ++i) - Record.push_back(Str[i]); - Stream.EmitRecord(bitc::METADATA_NAME, Record, 0/*TODO*/); - Record.clear(); + } + } - // Write named metadata operands. 
- for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) { - if (NMD->getOperand(i)) - Record.push_back(VE.getValueID(NMD->getOperand(i))); - else - Record.push_back(~0U); - } - Stream.EmitRecord(bitc::METADATA_NAMED_NODE, Record, 0); - Record.clear(); + // Write named metadata. + for (Module::const_named_metadata_iterator I = M->named_metadata_begin(), + E = M->named_metadata_end(); I != E; ++I) { + const NamedMDNode *NMD = I; + if (!StartedMetadataBlock) { + Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3); + StartedMetadataBlock = true; } + + // Write name. + StringRef Str = NMD->getName(); + for (unsigned i = 0, e = Str.size(); i != e; ++i) + Record.push_back(Str[i]); + Stream.EmitRecord(bitc::METADATA_NAME, Record, 0/*TODO*/); + Record.clear(); + + // Write named metadata operands. + for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) + Record.push_back(VE.getValueID(NMD->getOperand(i))); + Stream.EmitRecord(bitc::METADATA_NAMED_NODE2, Record, 0); + Record.clear(); } if (StartedMetadataBlock) @@ -601,7 +585,7 @@ static void WriteMetadataAttachment(const Function &F, SmallVector<uint64_t, 64> Record; // Write metadata attachments - // METADATA_ATTACHMENT - [m x [value, [n x [id, mdnode]]] + // METADATA_ATTACHMENT2 - [m x [value, [n x [id, mdnode]]] SmallVector<std::pair<unsigned, MDNode*>, 4> MDs; for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) @@ -619,7 +603,7 @@ static void WriteMetadataAttachment(const Function &F, Record.push_back(MDs[i].first); Record.push_back(VE.getValueID(MDs[i].second)); } - Stream.EmitRecord(bitc::METADATA_ATTACHMENT, Record, 0); + Stream.EmitRecord(bitc::METADATA_ATTACHMENT2, Record, 0); Record.clear(); } @@ -634,12 +618,11 @@ static void WriteModuleMetadataStore(const Module *M, BitstreamWriter &Stream) { SmallVector<StringRef, 4> Names; M->getMDKindNames(Names); - assert(Names[0] == "" && "MDKind #0 is invalid"); - if (Names.size() == 1) return; + if (Names.empty()) return; Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3); - for (unsigned MDKindID = 1, e = Names.size(); MDKindID != e; ++MDKindID) { + for (unsigned MDKindID = 0, e = Names.size(); MDKindID != e; ++MDKindID) { Record.push_back(MDKindID); StringRef KName = Names[MDKindID]; Record.append(KName.begin(), KName.end()); @@ -734,8 +717,8 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal, Code = bitc::CST_CODE_UNDEF; } else if (const ConstantInt *IV = dyn_cast<ConstantInt>(C)) { if (IV->getBitWidth() <= 64) { - int64_t V = IV->getSExtValue(); - if (V >= 0) + uint64_t V = IV->getSExtValue(); + if ((int64_t)V >= 0) Record.push_back(V << 1); else Record.push_back((-V << 1) | 1); @@ -809,20 +792,6 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal, for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) Record.push_back(VE.getValueID(C->getOperand(i))); AbbrevToUse = AggregateAbbrev; - } else if (isa<ConstantUnion>(C)) { - Code = bitc::CST_CODE_AGGREGATE; - - // Unions only have one entry but we must send type along with it. 
- const Type *EntryKind = C->getOperand(0)->getType(); - - const UnionType *UnTy = cast<UnionType>(C->getType()); - int UnionIndex = UnTy->getElementTypeIndex(EntryKind); - assert(UnionIndex != -1 && "Constant union contains invalid entry"); - - Record.push_back(UnionIndex); - Record.push_back(VE.getValueID(C->getOperand(0))); - - AbbrevToUse = AggregateAbbrev; } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) { switch (CE->getOpcode()) { default: @@ -902,6 +871,9 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal, Record.push_back(VE.getValueID(BA->getFunction())); Record.push_back(VE.getGlobalBasicBlockID(BA->getBasicBlock())); } else { +#ifndef NDEBUG + C->dump(); +#endif llvm_unreachable("Unknown constant!"); } Stream.EmitRecord(Code, Record, AbbrevToUse); @@ -1139,7 +1111,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, const PointerType *PTy = cast<PointerType>(CI.getCalledValue()->getType()); const FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); - Code = bitc::FUNC_CODE_INST_CALL; + Code = bitc::FUNC_CODE_INST_CALL2; Vals.push_back(VE.getAttributeID(CI.getAttributes())); Vals.push_back((CI.getCallingConv() << 1) | unsigned(CI.isTailCall())); @@ -1283,7 +1255,7 @@ static void WriteFunction(const Function &F, ValueEnumerator &VE, Vals.push_back(DL.getCol()); Vals.push_back(Scope ? VE.getValueID(Scope)+1 : 0); Vals.push_back(IA ? VE.getValueID(IA)+1 : 0); - Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC, Vals); + Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC2, Vals); Vals.clear(); LastDL = DL; @@ -1532,7 +1504,7 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream) { WriteModuleConstants(VE, Stream); // Emit metadata. - WriteModuleMetadata(VE, Stream); + WriteModuleMetadata(M, VE, Stream); // Emit function bodies. for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) diff --git a/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/lib/Bitcode/Writer/BitcodeWriterPass.cpp index 3a0d3ce0be994..91e115cba6cc4 100644 --- a/lib/Bitcode/Writer/BitcodeWriterPass.cpp +++ b/lib/Bitcode/Writer/BitcodeWriterPass.cpp @@ -21,7 +21,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit WriteBitcodePass(raw_ostream &o) - : ModulePass(&ID), OS(o) {} + : ModulePass(ID), OS(o) {} const char *getPassName() const { return "Bitcode Writer"; } diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp index 7fa425a7d871f..2f02262c36aff 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -75,7 +75,7 @@ ValueEnumerator::ValueEnumerator(const Module *M) { // Insert constants and metadata that are named at module level into the slot // pool so that the module symbol table can refer to them... 
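The BitcodeWriterPass hunk above reflects an LLVM-wide change where pass constructors take their static ID char by reference rather than by address; a pass's identity is still the address of that char. A standalone sketch of the idiom under that assumption (PassBase and WriteBitcodePassLike are stand-ins, not LLVM's classes):

#include <cstdio>

struct PassBase {
  const void *PassID;
  explicit PassBase(const char &ID) : PassID(&ID) {} // new ModulePass(ID) style
};

struct WriteBitcodePassLike : PassBase {
  static char ID; // only the address matters, never the value
  WriteBitcodePassLike() : PassBase(ID) {}
};
char WriteBitcodePassLike::ID = 0;

int main() {
  WriteBitcodePassLike A, B;
  std::printf("same identity: %d\n", A.PassID == B.PassID); // prints 1
}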
EnumerateValueSymbolTable(M->getValueSymbolTable()); - EnumerateMDSymbolTable(M->getMDSymbolTable()); + EnumerateNamedMetadata(M); SmallVector<std::pair<unsigned, MDNode*>, 8> MDs; @@ -137,7 +137,7 @@ ValueEnumerator::ValueEnumerator(const Module *M) { unsigned ValueEnumerator::getInstructionID(const Instruction *Inst) const { InstructionMapType::const_iterator I = InstructionMap.find(Inst); assert (I != InstructionMap.end() && "Instruction is not mapped!"); - return I->second; + return I->second; } void ValueEnumerator::setInstructionID(const Instruction *I) { @@ -207,35 +207,48 @@ void ValueEnumerator::EnumerateValueSymbolTable(const ValueSymbolTable &VST) { EnumerateValue(VI->getValue()); } -/// EnumerateMDSymbolTable - Insert all of the values in the specified metadata -/// table. -void ValueEnumerator::EnumerateMDSymbolTable(const MDSymbolTable &MST) { - for (MDSymbolTable::const_iterator MI = MST.begin(), ME = MST.end(); - MI != ME; ++MI) - EnumerateValue(MI->getValue()); +/// EnumerateNamedMetadata - Insert all of the values referenced by +/// named metadata in the specified module. +void ValueEnumerator::EnumerateNamedMetadata(const Module *M) { + for (Module::const_named_metadata_iterator I = M->named_metadata_begin(), + E = M->named_metadata_end(); I != E; ++I) + EnumerateNamedMDNode(I); } void ValueEnumerator::EnumerateNamedMDNode(const NamedMDNode *MD) { - // Check to see if it's already in! - unsigned &MDValueID = MDValueMap[MD]; - if (MDValueID) { - // Increment use count. - MDValues[MDValueID-1].second++; - return; + for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) + EnumerateMetadata(MD->getOperand(i)); +} + +/// EnumerateMDNodeOperands - Enumerate all non-function-local values +/// and types referenced by the given MDNode. +void ValueEnumerator::EnumerateMDNodeOperands(const MDNode *N) { + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + if (Value *V = N->getOperand(i)) { + if (isa<MDNode>(V) || isa<MDString>(V)) + EnumerateMetadata(V); + else if (!isa<Instruction>(V) && !isa<Argument>(V)) + EnumerateValue(V); + } else + EnumerateType(Type::getVoidTy(N->getContext())); } +} + +void ValueEnumerator::EnumerateMetadata(const Value *MD) { + assert((isa<MDNode>(MD) || isa<MDString>(MD)) && "Invalid metadata kind"); // Enumerate the type of this value. EnumerateType(MD->getType()); - for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) - if (MDNode *E = MD->getOperand(i)) - EnumerateValue(E); - MDValues.push_back(std::make_pair(MD, 1U)); - MDValueMap[MD] = Values.size(); -} + const MDNode *N = dyn_cast<MDNode>(MD); + + // In the module-level pass, skip function-local nodes themselves, but + // do walk their operands. + if (N && N->isFunctionLocal() && N->getFunction()) { + EnumerateMDNodeOperands(N); + return; + } -void ValueEnumerator::EnumerateMetadata(const Value *MD) { - assert((isa<MDNode>(MD) || isa<MDString>(MD)) && "Invalid metadata kind"); // Check to see if it's already in! unsigned &MDValueID = MDValueMap[MD]; if (MDValueID) { @@ -243,37 +256,52 @@ void ValueEnumerator::EnumerateMetadata(const Value *MD) { MDValues[MDValueID-1].second++; return; } + MDValues.push_back(std::make_pair(MD, 1U)); + MDValueID = MDValues.size(); + + // Enumerate all non-function-local operands. + if (N) + EnumerateMDNodeOperands(N); +} + +/// EnumerateFunctionLocalMetadata - Incorporate function-local metadata +/// information reachable from the given MDNode.
+void ValueEnumerator::EnumerateFunctionLocalMetadata(const MDNode *N) { + assert(N->isFunctionLocal() && N->getFunction() && + "EnumerateFunctionLocalMetadata called on non-function-local mdnode!"); // Enumerate the type of this value. - EnumerateType(MD->getType()); + EnumerateType(N->getType()); - if (const MDNode *N = dyn_cast<MDNode>(MD)) { - MDValues.push_back(std::make_pair(MD, 1U)); - MDValueMap[MD] = MDValues.size(); - MDValueID = MDValues.size(); - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { - if (Value *V = N->getOperand(i)) - EnumerateValue(V); - else - EnumerateType(Type::getVoidTy(MD->getContext())); - } - if (N->isFunctionLocal() && N->getFunction()) - FunctionLocalMDs.push_back(N) + // Check to see if it's already in! + unsigned &MDValueID = MDValueMap[N]; + if (MDValueID) { + // Increment use count. + MDValues[MDValueID-1].second++; return; } - - // Add the value. - assert(isa<MDString>(MD) && "Unknown metadata kind"); - MDValues.push_back(std::make_pair(MD, 1U)); + MDValues.push_back(std::make_pair(N, 1U)); MDValueID = MDValues.size(); + + // To incorporate function-local information visit all function-local + // MDNodes and all function-local values they reference. + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) + if (Value *V = N->getOperand(i)) { + if (MDNode *O = dyn_cast<MDNode>(V)) { + if (O->isFunctionLocal() && O->getFunction()) + EnumerateFunctionLocalMetadata(O); + } else if (isa<Instruction>(V) || isa<Argument>(V)) + EnumerateValue(V); + } + + // Also, collect all function-local MDNodes for easy access. + FunctionLocalMDs.push_back(N); } void ValueEnumerator::EnumerateValue(const Value *V) { assert(!V->getType()->isVoidTy() && "Can't insert void values!"); - if (isa<MDNode>(V) || isa<MDString>(V)) - return EnumerateMetadata(V); - else if (const NamedMDNode *NMD = dyn_cast<NamedMDNode>(V)) - return EnumerateNamedMDNode(NMD); + assert(!isa<MDNode>(V) && !isa<MDString>(V) && + "EnumerateValue doesn't handle Metadata!"); // Check to see if it's already in! unsigned &ValueID = ValueMap[V]; @@ -359,7 +387,7 @@ void ValueEnumerator::EnumerateOperandType(const Value *V) { // blockaddress. if (isa<BasicBlock>(Op)) continue; - EnumerateOperandType(cast<Constant>(Op)); + EnumerateOperandType(Op); } if (const MDNode *N = dyn_cast<MDNode>(V)) { @@ -368,7 +396,7 @@ void ValueEnumerator::EnumerateOperandType(const Value *V) { EnumerateOperandType(Elem); } } else if (isa<MDString>(V) || isa<MDNode>(V)) - EnumerateValue(V); + EnumerateMetadata(V); } void ValueEnumerator::EnumerateAttributes(const AttrListPtr &PAL) { @@ -386,10 +414,11 @@ void ValueEnumerator::EnumerateAttributes(const AttrListPtr &PAL) { void ValueEnumerator::incorporateFunction(const Function &F) { InstructionCount = 0; NumModuleValues = Values.size(); + NumModuleMDValues = MDValues.size(); // Adding function arguments to the value table. - for(Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); - I != E; ++I) + for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); + I != E; ++I) EnumerateValue(I); FirstFuncConstantID = Values.size(); @@ -416,7 +445,6 @@ void ValueEnumerator::incorporateFunction(const Function &F) { FirstInstID = Values.size(); - FunctionLocalMDs.clear(); SmallVector<MDNode *, 8> FnLocalMDVector; // Add all of the instructions.
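The enumeration strategy above is two-phase: at module level a function-local MDNode is never given a number itself, but its operands are still walked so that any module-level values it references get enumerated; the node itself is numbered later, per function. A simplified standalone model of the module-level pass, with Node standing in for MDNode (an assumption, not LLVM's type):

#include <map>
#include <vector>

struct Node {
  bool FunctionLocal = false;
  std::vector<const Node*> Ops;
};

struct Enumerator {
  std::map<const Node*, unsigned> IDs;   // 1-based, like MDValueMap
  std::vector<const Node*> Order;        // like MDValues

  void enumerateOperands(const Node *N) {
    for (const Node *Op : N->Ops)
      if (Op) enumerate(Op);
  }

  void enumerate(const Node *N) {
    if (N->FunctionLocal) {  // module pass: walk operands, skip the node
      enumerateOperands(N);
      return;
    }
    unsigned &ID = IDs[N];
    if (ID) return;          // already numbered; real code bumps a use count
    Order.push_back(N);
    ID = Order.size();
    enumerateOperands(N);
  }
};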
for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { @@ -428,6 +456,15 @@ void ValueEnumerator::incorporateFunction(const Function &F) { // Enumerate metadata after the instructions they might refer to. FnLocalMDVector.push_back(MD); } + + SmallVector<std::pair<unsigned, MDNode*>, 8> MDs; + I->getAllMetadataOtherThanDebugLoc(MDs); + for (unsigned i = 0, e = MDs.size(); i != e; ++i) { + MDNode *N = MDs[i].second; + if (N->isFunctionLocal() && N->getFunction()) + FnLocalMDVector.push_back(N); + } + if (!I->getType()->isVoidTy()) EnumerateValue(I); } @@ -435,18 +472,22 @@ void ValueEnumerator::incorporateFunction(const Function &F) { // Add all of the function-local metadata. for (unsigned i = 0, e = FnLocalMDVector.size(); i != e; ++i) - EnumerateOperandType(FnLocalMDVector[i]); + EnumerateFunctionLocalMetadata(FnLocalMDVector[i]); } void ValueEnumerator::purgeFunction() { /// Remove purged values from the ValueMap. for (unsigned i = NumModuleValues, e = Values.size(); i != e; ++i) ValueMap.erase(Values[i].first); + for (unsigned i = NumModuleMDValues, e = MDValues.size(); i != e; ++i) + MDValueMap.erase(MDValues[i].first); for (unsigned i = 0, e = BasicBlocks.size(); i != e; ++i) ValueMap.erase(BasicBlocks[i]); Values.resize(NumModuleValues); + MDValues.resize(NumModuleMDValues); BasicBlocks.clear(); + FunctionLocalMDs.clear(); } static void IncorporateFunctionInfoGlobalBBIDs(const Function *F, diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h index 2b9b15fa5a77a..cd1d2371b701a 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.h +++ b/lib/Bitcode/Writer/ValueEnumerator.h @@ -72,6 +72,11 @@ private: /// When a function is incorporated, this is the size of the Values list /// before incorporation. unsigned NumModuleValues; + + /// When a function is incorporated, this is the size of the MDValues list + /// before incorporation. + unsigned NumModuleMDValues; + unsigned FirstFuncConstantID; unsigned FirstInstID; @@ -132,7 +137,9 @@ public: private: void OptimizeConstants(unsigned CstStart, unsigned CstEnd); + void EnumerateMDNodeOperands(const MDNode *N); void EnumerateMetadata(const Value *MD); + void EnumerateFunctionLocalMetadata(const MDNode *N); void EnumerateNamedMDNode(const NamedMDNode *NMD); void EnumerateValue(const Value *V); void EnumerateType(const Type *T); @@ -141,7 +148,7 @@ private: void EnumerateTypeSymbolTable(const TypeSymbolTable &ST); void EnumerateValueSymbolTable(const ValueSymbolTable &ST); - void EnumerateMDSymbolTable(const MDSymbolTable &ST); + void EnumerateNamedMetadata(const Module *M); }; } // End llvm namespace diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index a7189acc3fecd..5a634d6ccb018 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -41,8 +41,11 @@ DebugMod("agg-antidep-debugmod", AggressiveAntiDepState::AggressiveAntiDepState(const unsigned TargetRegs, MachineBasicBlock *BB) : - NumTargetRegs(TargetRegs), GroupNodes(TargetRegs, 0) { - + NumTargetRegs(TargetRegs), GroupNodes(TargetRegs, 0), + GroupNodeIndices(TargetRegs, 0), + KillIndices(TargetRegs, 0), + DefIndices(TargetRegs, 0) +{ const unsigned BBSize = BB->size(); for (unsigned i = 0; i < NumTargetRegs; ++i) { // Initialize all registers to be in their own group. 
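purgeFunction above now applies to MDValues the same watermark discipline it already applied to Values: incorporateFunction records the module-level size, and purging erases the map entries past that mark before truncating the list. A standalone sketch of the pattern, with std::string standing in for values:

#include <cstddef>
#include <map>
#include <string>
#include <vector>

struct ScopedTable {
  std::vector<std::string> Values;
  std::map<std::string, unsigned> IDs;
  std::size_t Watermark = 0;

  void incorporate() { Watermark = Values.size(); } // like NumModuleMDValues

  void add(const std::string &V) {
    Values.push_back(V);
    IDs[V] = Values.size();
  }

  void purge() {
    for (std::size_t i = Watermark, e = Values.size(); i != e; ++i)
      IDs.erase(Values[i]);       // drop per-function map entries first...
    Values.resize(Watermark);     // ...then truncate back to module level
  }
};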
Initially we @@ -54,8 +57,7 @@ AggressiveAntiDepState::AggressiveAntiDepState(const unsigned TargetRegs, } } -unsigned AggressiveAntiDepState::GetGroup(unsigned Reg) -{ +unsigned AggressiveAntiDepState::GetGroup(unsigned Reg) { unsigned Node = GroupNodeIndices[Reg]; while (GroupNodes[Node] != Node) Node = GroupNodes[Node]; @@ -145,8 +147,8 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { State = new AggressiveAntiDepState(TRI->getNumRegs(), BB); bool IsReturnBlock = (!BB->empty() && BB->back().getDesc().isReturn()); - unsigned *KillIndices = State->GetKillIndices(); - unsigned *DefIndices = State->GetDefIndices(); + std::vector<unsigned> &KillIndices = State->GetKillIndices(); + std::vector<unsigned> &DefIndices = State->GetDefIndices(); // Determine the live-out physregs for this block. if (IsReturnBlock) { @@ -226,7 +228,7 @@ void AggressiveAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count, DEBUG(MI->dump()); DEBUG(dbgs() << "\tRegs:"); - unsigned *DefIndices = State->GetDefIndices(); + std::vector<unsigned> &DefIndices = State->GetDefIndices(); for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg) { // If Reg is current live, then mark that it can't be renamed as // we don't know the extent of its live-range anymore (now that it @@ -328,8 +330,8 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx, const char *tag, const char *header, const char *footer) { - unsigned *KillIndices = State->GetKillIndices(); - unsigned *DefIndices = State->GetDefIndices(); + std::vector<unsigned> &KillIndices = State->GetKillIndices(); + std::vector<unsigned> &DefIndices = State->GetDefIndices(); std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>& RegRefs = State->GetRegRefs(); @@ -364,7 +366,7 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx, void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, unsigned Count, std::set<unsigned>& PassthruRegs) { - unsigned *DefIndices = State->GetDefIndices(); + std::vector<unsigned> &DefIndices = State->GetDefIndices(); std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>& RegRefs = State->GetRegRefs(); @@ -560,8 +562,8 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( unsigned AntiDepGroupIndex, RenameOrderType& RenameOrder, std::map<unsigned, unsigned> &RenameMap) { - unsigned *KillIndices = State->GetKillIndices(); - unsigned *DefIndices = State->GetDefIndices(); + std::vector<unsigned> &KillIndices = State->GetKillIndices(); + std::vector<unsigned> &DefIndices = State->GetDefIndices(); std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>& RegRefs = State->GetRegRefs(); @@ -652,6 +654,8 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( if (R == RB) R = RE; --R; const unsigned NewSuperReg = *R; + // Don't consider non-allocatable registers + if (!AllocatableSet.test(NewSuperReg)) continue; // Don't replace a register with itself. 
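GetGroup above is the find half of a union-find structure: follow parent links in GroupNodes until reaching a root (a node that is its own parent). A standalone sketch, leaving out the GroupNodeIndices indirection and, like the original, not path-compressing:

#include <vector>

// Parent[i] == i marks a group root; any other value is a parent link.
unsigned findGroup(const std::vector<unsigned> &Parent, unsigned Node) {
  while (Parent[Node] != Node)
    Node = Parent[Node];
  return Node;
}

// Merging two groups is then just re-parenting one root onto the other:
void unionGroups(std::vector<unsigned> &Parent, unsigned A, unsigned B) {
  Parent[findGroup(Parent, A)] = findGroup(Parent, B);
}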
if (NewSuperReg == SuperReg) continue; @@ -733,8 +737,8 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned InsertPosIndex) { - unsigned *KillIndices = State->GetKillIndices(); - unsigned *DefIndices = State->GetDefIndices(); + std::vector<unsigned> &KillIndices = State->GetKillIndices(); + std::vector<unsigned> &DefIndices = State->GetDefIndices(); std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>& RegRefs = State->GetRegRefs(); diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.h b/lib/CodeGen/AggressiveAntiDepBreaker.h index 91ebb850d19db..9d715ccf79f8d 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.h +++ b/lib/CodeGen/AggressiveAntiDepBreaker.h @@ -59,27 +59,27 @@ namespace llvm { /// currently representing the group that the register belongs to. /// Register 0 is always represented by the 0 group, a group /// composed of registers that are not eligible for anti-aliasing. - unsigned GroupNodeIndices[TargetRegisterInfo::FirstVirtualRegister]; + std::vector<unsigned> GroupNodeIndices; /// RegRefs - Map registers to all their references within a live range. std::multimap<unsigned, RegisterReference> RegRefs; /// KillIndices - The index of the most recent kill (proceding bottom-up), /// or ~0u if the register is not live. - unsigned KillIndices[TargetRegisterInfo::FirstVirtualRegister]; + std::vector<unsigned> KillIndices; /// DefIndices - The index of the most recent complete def (proceding bottom /// up), or ~0u if the register is live. - unsigned DefIndices[TargetRegisterInfo::FirstVirtualRegister]; + std::vector<unsigned> DefIndices; public: AggressiveAntiDepState(const unsigned TargetRegs, MachineBasicBlock *BB); /// GetKillIndices - Return the kill indices. - unsigned *GetKillIndices() { return KillIndices; } + std::vector<unsigned> &GetKillIndices() { return KillIndices; } /// GetDefIndices - Return the define indices. - unsigned *GetDefIndices() { return DefIndices; } + std::vector<unsigned> &GetDefIndices() { return DefIndices; } /// GetRegRefs - Return the RegRefs map. 
std::multimap<unsigned, RegisterReference>& GetRegRefs() { return RegRefs; } diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp index f71eee5d01b89..e3dd646c952ed 100644 --- a/lib/CodeGen/Analysis.cpp +++ b/lib/CodeGen/Analysis.cpp @@ -109,7 +109,7 @@ GlobalVariable *llvm::ExtractTypeInfo(Value *V) { V = V->stripPointerCasts(); GlobalVariable *GV = dyn_cast<GlobalVariable>(V); - if (GV && GV->getName() == ".llvm.eh.catch.all.value") { + if (GV && GV->getName() == "llvm.eh.catch.all.value") { assert(GV->hasInitializer() && "The EH catch-all value must have an initializer"); Value *Init = GV->getInitializer(); @@ -171,7 +171,7 @@ ISD::CondCode llvm::getFCmpCondCode(FCmpInst::Predicate Pred) { FOC = FPC = ISD::SETFALSE; break; } - if (FiniteOnlyFPMath()) + if (NoNaNsFPMath) return FOC; else return FPC; diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index db1b37ab263fb..d358ab20ffc53 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -91,7 +91,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const TargetData &TD, AsmPrinter::AsmPrinter(TargetMachine &tm, MCStreamer &Streamer) - : MachineFunctionPass(&ID), + : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()), OutContext(Streamer.getContext()), OutStreamer(Streamer), @@ -200,11 +200,17 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const { case GlobalValue::WeakAnyLinkage: case GlobalValue::WeakODRLinkage: case GlobalValue::LinkerPrivateWeakLinkage: + case GlobalValue::LinkerPrivateWeakDefAutoLinkage: if (MAI->getWeakDefDirective() != 0) { // .globl _foo OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global); - // .weak_definition _foo - OutStreamer.EmitSymbolAttribute(GVSym, MCSA_WeakDefinition); + + if ((GlobalValue::LinkageTypes)Linkage != + GlobalValue::LinkerPrivateWeakDefAutoLinkage) + // .weak_definition _foo + OutStreamer.EmitSymbolAttribute(GVSym, MCSA_WeakDefinition); + else + OutStreamer.EmitSymbolAttribute(GVSym, MCSA_WeakDefAutoPrivate); } else if (MAI->getLinkOnceDirective() != 0) { // .globl _foo OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global); @@ -510,12 +516,8 @@ static void EmitComments(const MachineInstr &MI, raw_ostream &CommentOS) { } // Check for spill-induced copies - unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (TM.getInstrInfo()->isMoveInstr(MI, SrcReg, DstReg, - SrcSubIdx, DstSubIdx)) { - if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) - CommentOS << " Reload Reuse\n"; - } + if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) + CommentOS << " Reload Reuse\n"; } /// EmitImplicitDef - This method emits the specified machine instruction @@ -603,12 +605,15 @@ void AsmPrinter::EmitFunctionBody() { // Print out code for the function. bool HasAnyRealCode = false; + const MachineInstr *LastMI = 0; for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E; ++I) { // Print a label for the basic block. EmitBasicBlockStart(I); for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end(); II != IE; ++II) { + LastMI = II; + // Print the assembly for the instruction. 
if (!II->isLabel() && !II->isImplicitDef() && !II->isKill() && !II->isDebugValue()) { @@ -625,7 +630,7 @@ void AsmPrinter::EmitFunctionBody() { EmitComments(*II, OutStreamer.GetCommentOS()); switch (II->getOpcode()) { - case TargetOpcode::DBG_LABEL: + case TargetOpcode::PROLOG_LABEL: case TargetOpcode::EH_LABEL: case TargetOpcode::GC_LABEL: OutStreamer.EmitLabel(II->getOperand(0).getMCSymbol()); @@ -656,11 +661,18 @@ void AsmPrinter::EmitFunctionBody() { } } } - + + // If the last instruction was a prolog label, then we have a situation where + // we emitted a prolog but no function body. This results in the ending prolog + // label equaling the end of function label and an invalid "row" in the + // FDE. We need to emit a noop in this situation so that the FDE's rows are + // valid. + bool RequiresNoop = LastMI && LastMI->isPrologLabel(); + // If the function is empty and the object file uses .subsections_via_symbols, // then we need to emit *something* to the function body to prevent the // labels from collapsing together. Just emit a noop. - if (MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode) { + if ((MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode) || RequiresNoop) { MCInst Noop; TM.getInstrInfo()->getNoopForMachoTarget(Noop); if (Noop.getOpcode()) { @@ -1206,6 +1218,22 @@ void AsmPrinter::EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset, OutStreamer.EmitSymbolValue(SetLabel, 4, 0/*AddrSpace*/); } } + +/// EmitLabelPlusOffset - Emit something like ".long Label+Offset" +/// where the size in bytes of the directive is specified by Size and Label +/// specifies the label. This implicitly uses .set if it is available. +void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset, + unsigned Size) + const { + + // Emit Label+Offset + const MCExpr *Plus = + MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Label, OutContext), + MCConstantExpr::Create(Offset, OutContext), + OutContext); + + OutStreamer.EmitValue(Plus, Size, 0/*AddrSpace*/); +} //===----------------------------------------------------------------------===// @@ -1244,6 +1272,7 @@ static const MCExpr *LowerConstant(const Constant *CV, AsmPrinter &AP) { if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) return MCSymbolRefExpr::Create(AP.Mang->getSymbol(GV), Ctx); + if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV)) return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx); @@ -1262,10 +1291,17 @@ static const MCExpr *LowerConstant(const Constant *CV, AsmPrinter &AP) { ConstantFoldConstantExpression(CE, AP.TM.getTargetData())) if (C != CE) return LowerConstant(C, AP); -#ifndef NDEBUG - CE->dump(); -#endif - llvm_unreachable("FIXME: Don't support this constant expr"); + + // Otherwise report the problem to the user. + { + std::string S; + raw_string_ostream OS(S); + OS << "Unsupported expression in static initializer: "; + WriteAsOperand(OS, CE, /*PrintType=*/false, + !AP.MF ?
0 : AP.MF->getFunction()->getParent()); + report_fatal_error(OS.str()); + } + return MCConstantExpr::Create(0, Ctx); case Instruction::GetElementPtr: { const TargetData &TD = *AP.TM.getTargetData(); // Generate a symbolic expression for the byte address @@ -1413,21 +1449,6 @@ static void EmitGlobalConstantStruct(const ConstantStruct *CS, "Layout of constant struct may be incorrect!"); } -static void EmitGlobalConstantUnion(const ConstantUnion *CU, - unsigned AddrSpace, AsmPrinter &AP) { - const TargetData *TD = AP.TM.getTargetData(); - unsigned Size = TD->getTypeAllocSize(CU->getType()); - - const Constant *Contents = CU->getOperand(0); - unsigned FilledSize = TD->getTypeAllocSize(Contents->getType()); - - // Print the actually filled part - EmitGlobalConstantImpl(Contents, AddrSpace, AP); - - // And pad with enough zeroes - AP.OutStreamer.EmitZeros(Size-FilledSize, AddrSpace); -} - static void EmitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace, AsmPrinter &AP) { // FP Constants are printed as integer constants to avoid losing @@ -1530,7 +1551,7 @@ static void EmitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace, case 8: if (AP.isVerbose()) AP.OutStreamer.GetCommentOS() << format("0x%llx\n", CI->getZExtValue()); - AP.OutStreamer.EmitIntValue(CI->getZExtValue(), Size, AddrSpace); + AP.OutStreamer.EmitIntValue(CI->getZExtValue(), Size, AddrSpace); return; default: EmitGlobalConstantLargeInt(CI, AddrSpace, AP); @@ -1553,9 +1574,6 @@ static void EmitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace, return; } - if (const ConstantUnion *CVU = dyn_cast<ConstantUnion>(CV)) - return EmitGlobalConstantUnion(CVU, AddrSpace, AP); - if (const ConstantVector *V = dyn_cast<ConstantVector>(CV)) return EmitGlobalConstantVector(V, AddrSpace, AP); diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp index b310578584bc9..ce4519c541e3b 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp @@ -36,7 +36,7 @@ void AsmPrinter::EmitSLEB128(int Value, const char *Desc) const { if (isVerbose() && Desc) OutStreamer.AddComment(Desc); - if (MAI->hasLEB128()) { + if (MAI->hasLEB128() && OutStreamer.hasRawTextSupport()) { // FIXME: MCize. OutStreamer.EmitRawText("\t.sleb128\t" + Twine(Value)); return; @@ -61,7 +61,7 @@ void AsmPrinter::EmitULEB128(unsigned Value, const char *Desc, if (isVerbose() && Desc) OutStreamer.AddComment(Desc); - if (MAI->hasLEB128() && PadTo == 0) { + if (MAI->hasLEB128() && PadTo == 0 && OutStreamer.hasRawTextSupport()) { // FIXME: MCize. OutStreamer.EmitRawText("\t.uleb128\t" + Twine(Value)); return; diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 202d9b67fd157..df0316814c08b 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -22,7 +22,6 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCParser/AsmParser.h" #include "llvm/Target/TargetAsmParser.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegistry.h" @@ -72,16 +71,18 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, unsigned LocCookie) const { // Tell SrcMgr about this buffer, it takes ownership of the buffer. 
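EmitULEB128 above now takes the textual .uleb128 shortcut only when the streamer supports raw text; otherwise the value is emitted byte by byte. A standalone sketch of that fallback encoding (7 payload bits per byte, high bit set on every byte but the last):

#include <cstdint>
#include <vector>

std::vector<uint8_t> encodeULEB128(uint64_t Value) {
  std::vector<uint8_t> Out;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // more bytes follow
    Out.push_back(Byte);
  } while (Value != 0);
  return Out;
}

// Example: encodeULEB128(624485) yields { 0xE5, 0x8E, 0x26 },
// the classic worked example from the DWARF specification.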
SrcMgr.AddNewSourceBuffer(Buffer, SMLoc()); - AsmParser Parser(TM.getTarget(), SrcMgr, OutContext, OutStreamer, *MAI); - OwningPtr<TargetAsmParser> TAP(TM.getTarget().createAsmParser(Parser)); + OwningPtr<MCAsmParser> Parser(createMCAsmParser(TM.getTarget(), SrcMgr, + OutContext, OutStreamer, + *MAI)); + OwningPtr<TargetAsmParser> TAP(TM.getTarget().createAsmParser(*Parser, TM)); if (!TAP) report_fatal_error("Inline asm not supported by this streamer because" " we don't have an asm parser for this target\n"); - Parser.setTargetParser(*TAP.get()); + Parser->setTargetParser(*TAP.get()); // Don't implicitly switch to the text section before the asm. - int Res = Parser.Run(/*NoInitialTextSection*/ true, - /*NoFinalize*/ true); + int Res = Parser->Run(/*NoInitialTextSection*/ true, + /*NoFinalize*/ true); if (Res && !HasDiagHandler) report_fatal_error("Error parsing inline asm\n"); } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 65c1d190216fa..c886a5ecc6153 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -44,7 +44,7 @@ using namespace llvm; static cl::opt<bool> PrintDbgScope("print-dbgscope", cl::Hidden, cl::desc("Print DbgScope information for each machine instruction")); -static cl::opt<bool> DisableDebugInfoPrinting("disable-debug-info-print", +static cl::opt<bool> DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden, cl::desc("Disable debug info printing")); @@ -116,8 +116,8 @@ public: /// addGlobalType - Add a new global type to the compile unit. /// - void addGlobalType(StringRef Name, DIE *Die) { - GlobalTypes[Name] = Die; + void addGlobalType(StringRef Name, DIE *Die) { + GlobalTypes[Name] = Die; } /// getDIE - Returns the debug information entry map slot for the @@ -131,8 +131,9 @@ public: /// getDIEEntry - Returns the debug information entry for the speciefied /// debug variable. - DIEEntry *getDIEEntry(const MDNode *N) { - DenseMap<const MDNode *, DIEEntry *>::iterator I = MDNodeToDIEEntryMap.find(N); + DIEEntry *getDIEEntry(const MDNode *N) { + DenseMap<const MDNode *, DIEEntry *>::iterator I = + MDNodeToDIEEntryMap.find(N); if (I == MDNodeToDIEEntryMap.end()) return NULL; return I->second; @@ -179,6 +180,73 @@ public: DIE *getDIE() const { return TheDIE; } void setDotDebugLocOffset(unsigned O) { DotDebugLocOffset = O; } unsigned getDotDebugLocOffset() const { return DotDebugLocOffset; } + StringRef getName() const { return Var.getName(); } + unsigned getTag() const { return Var.getTag(); } + bool variableHasComplexAddress() const { + assert(Var.Verify() && "Invalid complex DbgVariable!"); + return Var.hasComplexAddress(); + } + bool isBlockByrefVariable() const { + assert(Var.Verify() && "Invalid complex DbgVariable!"); + return Var.isBlockByrefVariable(); + } + unsigned getNumAddrElements() const { + assert(Var.Verify() && "Invalid complex DbgVariable!"); + return Var.getNumAddrElements(); + } + uint64_t getAddrElement(unsigned i) const { + return Var.getAddrElement(i); + } + DIType getType() const { + DIType Ty = Var.getType(); + // FIXME: isBlockByrefVariable should be reformulated in terms of complex + // addresses instead. + if (Var.isBlockByrefVariable()) { + /* Byref variables, in Blocks, are declared by the programmer as + "SomeType VarName;", but the compiler creates a + __Block_byref_x_VarName struct, and gives the variable VarName + either the struct, or a pointer to the struct, as its type. 
This + is necessary for various behind-the-scenes things the compiler + needs to do with by-reference variables in blocks. + + However, as far as the original *programmer* is concerned, the + variable should still have type 'SomeType', as originally declared. + + The following function dives into the __Block_byref_x_VarName + struct to find the original type of the variable. This will be + passed back to the code generating the type for the Debug + Information Entry for the variable 'VarName'. 'VarName' will then + have the original type 'SomeType' in its debug information. + + The original type 'SomeType' will be the type of the field named + 'VarName' inside the __Block_byref_x_VarName struct. + + NOTE: In order for this to not completely fail on the debugger + side, the Debug Information Entry for the variable VarName needs to + have a DW_AT_location that tells the debugger how to unwind through + the pointers and __Block_byref_x_VarName struct to find the actual + value of the variable. The function addBlockByrefType does this. */ + DIType subType = Ty; + unsigned tag = Ty.getTag(); + + if (tag == dwarf::DW_TAG_pointer_type) { + DIDerivedType DTy = DIDerivedType(Ty); + subType = DTy.getTypeDerivedFrom(); + } + + DICompositeType blockStruct = DICompositeType(subType); + DIArray Elements = blockStruct.getTypeArray(); + + for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) { + DIDescriptor Element = Elements.getElement(i); + DIDerivedType DT = DIDerivedType(Element); + if (getName() == DT.getName()) + return (DT.getTypeDerivedFrom()); + } + return Ty; + } + return Ty; + } }; //===----------------------------------------------------------------------===// @@ -194,7 +262,7 @@ class DbgScope { DbgScope *Parent; // Parent to this scope. DIDescriptor Desc; // Debug info descriptor for scope. // Location at which this scope is inlined. - AssertingVH<const MDNode> InlinedAtLocation; + AssertingVH<const MDNode> InlinedAtLocation; bool AbstractScope; // Abstract Scope const MachineInstr *LastInsn; // Last instruction of this scope. const MachineInstr *FirstInsn; // First instruction of this scope. @@ -220,19 +288,19 @@ public: const MDNode *getInlinedAt() const { return InlinedAtLocation; } const MDNode *getScopeNode() const { return Desc; } const SmallVector<DbgScope *, 4> &getScopes() { return Scopes; } - const SmallVector<DbgVariable *, 8> &getVariables() { return Variables; } + const SmallVector<DbgVariable *, 8> &getDbgVariables() { return Variables; } const SmallVector<DbgRange, 4> &getRanges() { return Ranges; } /// openInsnRange - This scope covers instruction range starting from MI. void openInsnRange(const MachineInstr *MI) { - if (!FirstInsn) + if (!FirstInsn) FirstInsn = MI; - + if (Parent) Parent->openInsnRange(MI); } - /// extendInsnRange - Extend the current instruction range covered by + /// extendInsnRange - Extend the current instruction range covered by /// this scope. void extendInsnRange(const MachineInstr *MI) { assert (FirstInsn && "MI Range is not open!"); @@ -247,9 +315,9 @@ public: void closeInsnRange(DbgScope *NewScope = NULL) { assert (LastInsn && "Last insn missing!"); Ranges.push_back(DbgRange(FirstInsn, LastInsn)); - FirstInsn = NULL; + FirstInsn = NULL; LastInsn = NULL; - // If Parent dominates NewScope then do not close Parent's instruction + // If Parent dominates NewScope then do not close Parent's instruction // range. 
if (Parent && (!NewScope || !Parent->dominates(NewScope))) Parent->closeInsnRange(NewScope); @@ -264,7 +332,7 @@ public: unsigned getDFSIn() const { return DFSIn; } void setDFSIn(unsigned I) { DFSIn = I; } bool dominates(const DbgScope *S) { - if (S == this) + if (S == this) return true; if (DFSIn < S->getDFSIn() && DFSOut > S->getDFSOut()) return true; @@ -313,14 +381,13 @@ DbgScope::~DbgScope() { DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) : Asm(A), MMI(Asm->MMI), FirstCU(0), - AbbreviationsSet(InitAbbreviationsSetSize), + AbbreviationsSet(InitAbbreviationsSetSize), CurrentFnDbgScope(0), PrevLabel(NULL) { NextStringPoolNumber = 0; - + DwarfFrameSectionSym = DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0; DwarfStrSectionSym = TextSectionSym = 0; - DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = 0; - DwarfDebugLineSectionSym = CurrentLineSectionSym = 0; + DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = 0; FunctionBeginSym = FunctionEndSym = 0; DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1); { @@ -377,7 +444,7 @@ DIEEntry *DwarfDebug::createDIEEntry(DIE *Entry) { void DwarfDebug::addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer) { if (!Form) Form = DIEInteger::BestForm(false, Integer); - DIEValue *Value = Integer == 1 ? + DIEValue *Value = Integer == 1 ? DIEIntegerOne : new (DIEValueAllocator) DIEInteger(Integer); Die->addValue(Attribute, Form, Value); } @@ -392,7 +459,7 @@ void DwarfDebug::addSInt(DIE *Die, unsigned Attribute, } /// addString - Add a string attribute data and value. DIEString only -/// keeps string reference. +/// keeps string reference. void DwarfDebug::addString(DIE *Die, unsigned Attribute, unsigned Form, StringRef String) { DIEValue *Value = new (DIEValueAllocator) DIEString(String); @@ -434,14 +501,14 @@ void DwarfDebug::addBlock(DIE *Die, unsigned Attribute, unsigned Form, /// addSourceLine - Add location information to specified debug information /// entry. -void DwarfDebug::addSourceLine(DIE *Die, const DIVariable *V) { +void DwarfDebug::addSourceLine(DIE *Die, DIVariable V) { // Verify variable. - if (!V->Verify()) + if (!V.Verify()) return; - unsigned Line = V->getLineNumber(); - unsigned FileID = GetOrCreateSourceID(V->getContext().getDirectory(), - V->getContext().getFilename()); + unsigned Line = V.getLineNumber(); + unsigned FileID = GetOrCreateSourceID(V.getContext().getDirectory(), + V.getContext().getFilename()); assert(FileID && "Invalid file id"); addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); @@ -449,14 +516,14 @@ void DwarfDebug::addSourceLine(DIE *Die, const DIVariable *V) { /// addSourceLine - Add location information to specified debug information /// entry. -void DwarfDebug::addSourceLine(DIE *Die, const DIGlobalVariable *G) { +void DwarfDebug::addSourceLine(DIE *Die, DIGlobalVariable G) { // Verify global variable. - if (!G->Verify()) + if (!G.Verify()) return; - unsigned Line = G->getLineNumber(); - unsigned FileID = GetOrCreateSourceID(G->getContext().getDirectory(), - G->getContext().getFilename()); + unsigned Line = G.getLineNumber(); + unsigned FileID = GetOrCreateSourceID(G.getContext().getDirectory(), + G.getContext().getFilename()); assert(FileID && "Invalid file id"); addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); @@ -464,19 +531,19 @@ void DwarfDebug::addSourceLine(DIE *Die, const DIGlobalVariable *G) { /// addSourceLine - Add location information to specified debug information /// entry. 
-void DwarfDebug::addSourceLine(DIE *Die, const DISubprogram *SP) { +void DwarfDebug::addSourceLine(DIE *Die, DISubprogram SP) { // Verify subprogram. - if (!SP->Verify()) + if (!SP.Verify()) return; // If the line number is 0, don't add it. - if (SP->getLineNumber() == 0) + if (SP.getLineNumber() == 0) return; - unsigned Line = SP->getLineNumber(); - if (!SP->getContext().Verify()) + unsigned Line = SP.getLineNumber(); + if (!SP.getContext().Verify()) return; - unsigned FileID = GetOrCreateSourceID(SP->getDirectory(), - SP->getFilename()); + unsigned FileID = GetOrCreateSourceID(SP.getDirectory(), + SP.getFilename()); assert(FileID && "Invalid file id"); addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); @@ -484,16 +551,16 @@ void DwarfDebug::addSourceLine(DIE *Die, const DISubprogram *SP) { /// addSourceLine - Add location information to specified debug information /// entry. -void DwarfDebug::addSourceLine(DIE *Die, const DIType *Ty) { +void DwarfDebug::addSourceLine(DIE *Die, DIType Ty) { // Verify type. - if (!Ty->Verify()) + if (!Ty.Verify()) return; - unsigned Line = Ty->getLineNumber(); - if (!Ty->getContext().Verify()) + unsigned Line = Ty.getLineNumber(); + if (!Ty.getContext().Verify()) return; - unsigned FileID = GetOrCreateSourceID(Ty->getContext().getDirectory(), - Ty->getContext().getFilename()); + unsigned FileID = GetOrCreateSourceID(Ty.getContext().getDirectory(), + Ty.getContext().getFilename()); assert(FileID && "Invalid file id"); addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); @@ -501,14 +568,14 @@ void DwarfDebug::addSourceLine(DIE *Die, const DIType *Ty) { /// addSourceLine - Add location information to specified debug information /// entry. -void DwarfDebug::addSourceLine(DIE *Die, const DINameSpace *NS) { +void DwarfDebug::addSourceLine(DIE *Die, DINameSpace NS) { // Verify namespace. - if (!NS->Verify()) + if (!NS.Verify()) return; - unsigned Line = NS->getLineNumber(); - StringRef FN = NS->getFilename(); - StringRef Dir = NS->getDirectory(); + unsigned Line = NS.getLineNumber(); + StringRef FN = NS.getFilename(); + StringRef Dir = NS.getDirectory(); unsigned FileID = GetOrCreateSourceID(Dir, FN); assert(FileID && "Invalid file id"); @@ -516,55 +583,21 @@ void DwarfDebug::addSourceLine(DIE *Die, const DINameSpace *NS) { addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); } -/* Byref variables, in Blocks, are declared by the programmer as - "SomeType VarName;", but the compiler creates a - __Block_byref_x_VarName struct, and gives the variable VarName - either the struct, or a pointer to the struct, as its type. This - is necessary for various behind-the-scenes things the compiler - needs to do with by-reference variables in blocks. - - However, as far as the original *programmer* is concerned, the - variable should still have type 'SomeType', as originally declared. - - The following function dives into the __Block_byref_x_VarName - struct to find the original type of the variable. This will be - passed back to the code generating the type for the Debug - Information Entry for the variable 'VarName'. 'VarName' will then - have the original type 'SomeType' in its debug information. - - The original type 'SomeType' will be the type of the field named - 'VarName' inside the __Block_byref_x_VarName struct. 
- - NOTE: In order for this to not completely fail on the debugger - side, the Debug Information Entry for the variable VarName needs to - have a DW_AT_location that tells the debugger how to unwind through - the pointers and __Block_byref_x_VarName struct to find the actual - value of the variable. The function addBlockByrefType does this. */ - -/// Find the type the programmer originally declared the variable to be -/// and return that type. -/// -DIType DwarfDebug::getBlockByrefType(DIType Ty, std::string Name) { - - DIType subType = Ty; - unsigned tag = Ty.getTag(); - - if (tag == dwarf::DW_TAG_pointer_type) { - DIDerivedType DTy = DIDerivedType(Ty); - subType = DTy.getTypeDerivedFrom(); - } - - DICompositeType blockStruct = DICompositeType(subType); - DIArray Elements = blockStruct.getTypeArray(); - - for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) { - DIDescriptor Element = Elements.getElement(i); - DIDerivedType DT = DIDerivedType(Element); - if (Name == DT.getName()) - return (DT.getTypeDerivedFrom()); - } +/// addVariableAddress - Add DW_AT_location attribute for a DbgVariable based +/// on provided frame index. +void DwarfDebug::addVariableAddress(DbgVariable *&DV, DIE *Die, int64_t FI) { + MachineLocation Location; + unsigned FrameReg; + const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); + int Offset = RI->getFrameIndexReference(*Asm->MF, FI, FrameReg); + Location.set(FrameReg, Offset); - return Ty; + if (DV->variableHasComplexAddress()) + addComplexAddress(DV, Die, dwarf::DW_AT_location, Location); + else if (DV->isBlockByrefVariable()) + addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location); + else + addAddress(Die, dwarf::DW_AT_location, Location); } /// addComplexAddress - Start with the address based on the location provided, @@ -575,8 +608,7 @@ DIType DwarfDebug::getBlockByrefType(DIType Ty, std::string Name) { void DwarfDebug::addComplexAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute, const MachineLocation &Location) { - const DIVariable &VD = DV->getVariable(); - DIType Ty = VD.getType(); + DIType Ty = DV->getType(); // Decode the original location, and use that as the start of the byref // variable's location. 
@@ -603,12 +635,12 @@ void DwarfDebug::addComplexAddress(DbgVariable *&DV, DIE *Die, addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset()); } - for (unsigned i = 0, N = VD.getNumAddrElements(); i < N; ++i) { - uint64_t Element = VD.getAddrElement(i); + for (unsigned i = 0, N = DV->getNumAddrElements(); i < N; ++i) { + uint64_t Element = DV->getAddrElement(i); if (Element == DIFactory::OpPlus) { addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); - addUInt(Block, 0, dwarf::DW_FORM_udata, VD.getAddrElement(++i)); + addUInt(Block, 0, dwarf::DW_FORM_udata, DV->getAddrElement(++i)); } else if (Element == DIFactory::OpDeref) { addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); } else llvm_unreachable("unknown DIFactory Opcode"); @@ -681,13 +713,12 @@ void DwarfDebug::addComplexAddress(DbgVariable *&DV, DIE *Die, void DwarfDebug::addBlockByrefAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute, const MachineLocation &Location) { - const DIVariable &VD = DV->getVariable(); - DIType Ty = VD.getType(); + DIType Ty = DV->getType(); DIType TmpTy = Ty; unsigned Tag = Ty.getTag(); bool isPointer = false; - StringRef varName = VD.getName(); + StringRef varName = DV->getName(); if (Tag == dwarf::DW_TAG_pointer_type) { DIDerivedType DTy = DIDerivedType(Ty); @@ -835,26 +866,26 @@ bool DwarfDebug::addConstantFPValue(DIE *Die, const MCSymbol *VS, assert (MO.isFPImm() && "Invalid machine operand!"); DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); APFloat FPImm = MO.getFPImm()->getValueAPF(); - + // Get the raw data form of the floating point. const APInt FltVal = FPImm.bitcastToAPInt(); const char *FltPtr = (const char*)FltVal.getRawData(); - + int NumBytes = FltVal.getBitWidth() / 8; // 8 bits per byte. bool LittleEndian = Asm->getTargetData().isLittleEndian(); int Incr = (LittleEndian ? 1 : -1); int Start = (LittleEndian ? 0 : NumBytes - 1); int Stop = (LittleEndian ? NumBytes : -1); - + // Output the constant to DWARF one byte at a time. for (; Start != Stop; Start += Incr) addUInt(Block, 0, dwarf::DW_FORM_data1, (unsigned char)0xFF & FltPtr[Start]); - + addBlock(Die, dwarf::DW_AT_const_value, 0, Block); if (VS) addLabel(Die, dwarf::DW_AT_start_scope, dwarf::DW_FORM_addr, VS); - return true; + return true; } @@ -872,7 +903,7 @@ void DwarfDebug::addToContextOwner(DIE *Die, DIDescriptor Context) { ContextDIE->addChild(Die); } else if (DIE *ContextDIE = getCompileUnit(Context)->getDIE(Context)) ContextDIE->addChild(Die); - else + else getCompileUnit(Context)->addDie(Die); } @@ -965,7 +996,7 @@ void DwarfDebug::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) { // Add source line info if available and TyDesc is not a forward declaration. if (!DTy.isForwardDecl()) - addSourceLine(&Buffer, &DTy); + addSourceLine(&Buffer, DTy); } /// constructTypeDIE - Construct type DIE from DICompositeType. 
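addConstantFPValue above emits the raw words of an APFloat one DW_FORM_data1 byte at a time, walking forward on little-endian targets and backward on big-endian ones. A standalone sketch of that Incr/Start/Stop walk:

#include <cstdint>
#include <cstdio>

void emitBytes(const uint8_t *Data, int NumBytes, bool LittleEndian) {
  int Incr  = LittleEndian ? 1 : -1;
  int Start = LittleEndian ? 0 : NumBytes - 1;
  int Stop  = LittleEndian ? NumBytes : -1;
  for (; Start != Stop; Start += Incr)
    std::printf("0x%02x ", Data[Start]); // stands in for the data1 form
  std::printf("\n");
}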
@@ -1039,7 +1070,7 @@ void DwarfDebug::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { addType(ElemDie, DV.getType()); addUInt(ElemDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); addUInt(ElemDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1); - addSourceLine(ElemDie, &DV); + addSourceLine(ElemDie, DV); } else if (Element.isDerivedType()) ElemDie = createMemberDIE(DIDerivedType(Element)); else @@ -1057,7 +1088,7 @@ void DwarfDebug::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { DICompositeType ContainingType = CTy.getContainingType(); if (DIDescriptor(ContainingType).isCompositeType()) - addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, + addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, getOrCreateTypeDIE(DIType(ContainingType))); else { DIDescriptor Context = CTy.getContext(); @@ -1073,7 +1104,7 @@ void DwarfDebug::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { if (!Name.empty()) addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); - if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type + if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type || Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) { // Add size if non-zero (derived types might be zero-sized.) @@ -1089,7 +1120,7 @@ void DwarfDebug::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { // Add source line info if available. if (!CTy.isForwardDecl()) - addSourceLine(&Buffer, &CTy); + addSourceLine(&Buffer, CTy); } } @@ -1149,7 +1180,7 @@ DIE *DwarfDebug::constructEnumTypeDIE(DIEnumerator ETy) { return Enumerator; } -/// getRealLinkageName - If special LLVM prefix that is used to inform the asm +/// getRealLinkageName - If special LLVM prefix that is used to inform the asm /// printer to not emit usual symbol prefix before the symbol name is used then /// return linkage name after skipping this special LLVM prefix. static StringRef getRealLinkageName(StringRef LinkageName) { @@ -1159,40 +1190,16 @@ static StringRef getRealLinkageName(StringRef LinkageName) { return LinkageName; } -/// createGlobalVariableDIE - Create new DIE using GV. -DIE *DwarfDebug::createGlobalVariableDIE(const DIGlobalVariable &GV) { - // If the global variable was optmized out then no need to create debug info - // entry. - if (!GV.getGlobal()) return NULL; - if (GV.getDisplayName().empty()) return NULL; - - DIE *GVDie = new DIE(dwarf::DW_TAG_variable); - addString(GVDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, - GV.getDisplayName()); - - StringRef LinkageName = GV.getLinkageName(); - if (!LinkageName.empty()) - addString(GVDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string, - getRealLinkageName(LinkageName)); - - addType(GVDie, GV.getType()); - if (!GV.isLocalToUnit()) - addUInt(GVDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1); - addSourceLine(GVDie, &GV); - - return GVDie; -} - /// createMemberDIE - Create new member DIE. 
-DIE *DwarfDebug::createMemberDIE(const DIDerivedType &DT) { +DIE *DwarfDebug::createMemberDIE(DIDerivedType DT) { DIE *MemberDie = new DIE(DT.getTag()); StringRef Name = DT.getName(); if (!Name.empty()) addString(MemberDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); - + addType(MemberDie, DT.getTypeDerivedFrom()); - addSourceLine(MemberDie, &DT); + addSourceLine(MemberDie, DT); DIEBlock *MemLocationDie = new (DIEValueAllocator) DIEBlock(); addUInt(MemLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); @@ -1240,7 +1247,7 @@ DIE *DwarfDebug::createMemberDIE(const DIDerivedType &DT) { addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus); - addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, + addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, VBaseLocationDie); } else addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, MemLocationDie); @@ -1261,7 +1268,7 @@ DIE *DwarfDebug::createMemberDIE(const DIDerivedType &DT) { } /// createSubprogramDIE - Create new DIE using SP. -DIE *DwarfDebug::createSubprogramDIE(const DISubprogram &SP, bool MakeDecl) { +DIE *DwarfDebug::createSubprogramDIE(DISubprogram SP, bool MakeDecl) { CompileUnit *SPCU = getCompileUnit(SP); DIE *SPDie = SPCU->getDIE(SP); if (SPDie) @@ -1277,7 +1284,7 @@ DIE *DwarfDebug::createSubprogramDIE(const DISubprogram &SP, bool MakeDecl) { addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string, getRealLinkageName(LinkageName)); - addSourceLine(SPDie, &SP); + addSourceLine(SPDie, SP); // Add prototyped tag, if C or ObjC. unsigned Lang = SP.getCompileUnit().getLanguage(); @@ -1302,7 +1309,7 @@ DIE *DwarfDebug::createSubprogramDIE(const DISubprogram &SP, bool MakeDecl) { addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu); addUInt(Block, 0, dwarf::DW_FORM_data1, SP.getVirtualIndex()); addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block); - ContainingTypeMap.insert(std::make_pair(SPDie, + ContainingTypeMap.insert(std::make_pair(SPDie, SP.getContainingType())); } @@ -1331,10 +1338,14 @@ DIE *DwarfDebug::createSubprogramDIE(const DISubprogram &SP, bool MakeDecl) { if (!SP.isLocalToUnit()) addUInt(SPDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1); - + if (SP.isOptimized()) addUInt(SPDie, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1); + if (unsigned isa = Asm->getISAEncoding()) { + addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa); + } + // DW_TAG_inlined_subroutine may refer to this DIE. SPCU->insertDIE(SP, SPDie); @@ -1394,18 +1405,18 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(const MDNode *SPNode) { assert(SPDie && "Unable to find subprogram DIE!"); DISubprogram SP(SPNode); - + // There is not any need to generate specification DIE for a function // defined at compile unit level. If a function is defined inside another // function then gdb prefers the definition at top level and but does not - // expect specification DIE in parent function. So avoid creating + // expect specification DIE in parent function. So avoid creating // specification DIE for a function defined inside a function. if (SP.isDefinition() && !SP.getContext().isCompileUnit() && - !SP.getContext().isFile() && + !SP.getContext().isFile() && !isSubprogramContext(SP.getContext())) { addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); - - // Add arguments. + + // Add arguments. 
DICompositeType SPTy = SP.getType(); DIArray Args = SPTy.getTypeArray(); unsigned SPTag = SPTy.getTag(); @@ -1420,11 +1431,11 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(const MDNode *SPNode) { } DIE *SPDeclDie = SPDie; SPDie = new DIE(dwarf::DW_TAG_subprogram); - addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4, + addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4, SPDeclDie); SPCU->addDie(SPDie); } - + // Pick up abstract subprogram DIE. if (DIE *AbsSPDIE = AbstractSPDies.lookup(SPNode)) { SPDie = new DIE(dwarf::DW_TAG_subprogram); @@ -1459,7 +1470,7 @@ DIE *DwarfDebug::constructLexicalScopeDIE(DbgScope *Scope) { SmallVector<DbgRange, 4>::const_iterator RI = Ranges.begin(); if (Ranges.size() > 1) { // .debug_range section has not been laid out yet. Emit offset in - // .debug_range as a uint, size 4, for now. emitDIE will handle + // .debug_range as a uint, size 4, for now. emitDIE will handle // DW_AT_ranges appropriately. addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4, DebugRangeSymbols.size() * Asm->getTargetData().getPointerSize()); @@ -1480,7 +1491,7 @@ DIE *DwarfDebug::constructLexicalScopeDIE(DbgScope *Scope) { assert(Start->isDefined() && "Invalid starting label for an inlined scope!"); assert(End->isDefined() && "Invalid end label for an inlined scope!"); - + addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, Start); addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, End); @@ -1493,7 +1504,7 @@ DIE *DwarfDebug::constructLexicalScopeDIE(DbgScope *Scope) { DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) { const SmallVector<DbgRange, 4> &Ranges = Scope->getRanges(); - assert (Ranges.empty() == false + assert (Ranges.empty() == false && "DbgScope does not have instruction markers!"); // FIXME : .debug_inlined section specification does not clearly state how @@ -1551,16 +1562,14 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) { /// constructVariableDIE - Construct a DIE for the given DbgVariable. DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { - // Get the descriptor. - const DIVariable &VD = DV->getVariable(); - StringRef Name = VD.getName(); + StringRef Name = DV->getName(); if (Name.empty()) return NULL; // Translate tag to proper Dwarf tag. The result variable is dropped for // now. unsigned Tag; - switch (VD.getTag()) { + switch (DV->getTag()) { case dwarf::DW_TAG_return_variable: return NULL; case dwarf::DW_TAG_arg_variable: @@ -1586,18 +1595,13 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { dwarf::DW_FORM_ref4, AbsDIE); else { addString(VariableDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); - addSourceLine(VariableDie, &VD); + addSourceLine(VariableDie, DV->getVariable()); // Add variable type. - // FIXME: isBlockByrefVariable should be reformulated in terms of complex - // addresses instead. 
- if (VD.isBlockByrefVariable()) - addType(VariableDie, getBlockByrefType(VD.getType(), Name)); - else - addType(VariableDie, VD.getType()); + addType(VariableDie, DV->getType()); } - if (Tag == dwarf::DW_TAG_formal_parameter && VD.getType().isArtificial()) + if (Tag == dwarf::DW_TAG_formal_parameter && DV->getType().isArtificial()) addUInt(VariableDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); if (Scope->isAbstractScope()) { @@ -1623,15 +1627,22 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { const MachineInstr *DVInsn = DVI->second; const MCSymbol *DVLabel = findVariableLabel(DV); bool updated = false; - // FIXME : Handle getNumOperands != 3 + // FIXME : Handle getNumOperands != 3 if (DVInsn->getNumOperands() == 3) { - if (DVInsn->getOperand(0).isReg()) - updated = - addRegisterAddress(VariableDie, DVLabel, DVInsn->getOperand(0)); + if (DVInsn->getOperand(0).isReg()) { + const MachineOperand RegOp = DVInsn->getOperand(0); + const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo(); + if (DVInsn->getOperand(1).isImm() && + TRI->getFrameRegister(*Asm->MF) == RegOp.getReg()) { + addVariableAddress(DV, VariableDie, DVInsn->getOperand(1).getImm()); + updated = true; + } else + updated = addRegisterAddress(VariableDie, DVLabel, RegOp); + } else if (DVInsn->getOperand(0).isImm()) updated = addConstantValue(VariableDie, DVLabel, DVInsn->getOperand(0)); - else if (DVInsn->getOperand(0).isFPImm()) - updated = + else if (DVInsn->getOperand(0).isFPImm()) + updated = addConstantFPValue(VariableDie, DVLabel, DVInsn->getOperand(0)); } else { MachineLocation Location = Asm->getDebugValueLocation(DVInsn); @@ -1651,24 +1662,13 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { } DV->setDIE(VariableDie); return VariableDie; - } + } // .. else use frame index, if available. - MachineLocation Location; - unsigned FrameReg; - const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); int FI = 0; - if (findVariableFrameIndex(DV, &FI)) { - int Offset = RI->getFrameIndexReference(*Asm->MF, FI, FrameReg); - Location.set(FrameReg, Offset); - - if (VD.hasComplexAddress()) - addComplexAddress(DV, VariableDie, dwarf::DW_AT_location, Location); - else if (VD.isBlockByrefVariable()) - addBlockByrefAddress(DV, VariableDie, dwarf::DW_AT_location, Location); - else - addAddress(VariableDie, dwarf::DW_AT_location, Location); - } + if (findVariableFrameIndex(DV, &FI)) + addVariableAddress(DV, VariableDie, FI); + DV->setDIE(VariableDie); return VariableDie; @@ -1677,7 +1677,7 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { void DwarfDebug::addPubTypes(DISubprogram SP) { DICompositeType SPTy = SP.getType(); unsigned SPTag = SPTy.getTag(); - if (SPTag != dwarf::DW_TAG_subroutine_type) + if (SPTag != dwarf::DW_TAG_subroutine_type) return; DIArray Args = SPTy.getTypeArray(); @@ -1699,7 +1699,7 @@ void DwarfDebug::addPubTypes(DISubprogram SP) { DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) { if (!Scope || !Scope->getScopeNode()) return NULL; - + DIScope DS(Scope->getScopeNode()); DIE *ScopeDIE = NULL; if (Scope->getInlinedAt()) @@ -1718,9 +1718,9 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) { else ScopeDIE = constructLexicalScopeDIE(Scope); if (!ScopeDIE) return NULL; - + // Add variables to scope. 
@@ -1677,7 +1677,7 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) {
 void DwarfDebug::addPubTypes(DISubprogram SP) {
   DICompositeType SPTy = SP.getType();
   unsigned SPTag = SPTy.getTag();
-  if (SPTag != dwarf::DW_TAG_subroutine_type) 
+  if (SPTag != dwarf::DW_TAG_subroutine_type)
     return;

   DIArray Args = SPTy.getTypeArray();
@@ -1699,7 +1699,7 @@ void DwarfDebug::addPubTypes(DISubprogram SP) {
 DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) {
   if (!Scope || !Scope->getScopeNode())
     return NULL;
-  
+
   DIScope DS(Scope->getScopeNode());
   DIE *ScopeDIE = NULL;
   if (Scope->getInlinedAt())
@@ -1718,9 +1718,9 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) {
   else
     ScopeDIE = constructLexicalScopeDIE(Scope);
   if (!ScopeDIE) return NULL;
-  
+
   // Add variables to scope.
-  const SmallVector<DbgVariable *, 8> &Variables = Scope->getVariables();
+  const SmallVector<DbgVariable *, 8> &Variables = Scope->getDbgVariables();
   for (unsigned i = 0, N = Variables.size(); i < N; ++i) {
     DIE *VariableDIE = constructVariableDIE(Variables[i], Scope);
     if (VariableDIE)
@@ -1736,9 +1736,9 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) {
       ScopeDIE->addChild(NestedDIE);
   }

-  if (DS.isSubprogram()) 
+  if (DS.isSubprogram())
     addPubTypes(DISubprogram(DS));
-  
+
   return ScopeDIE;
 }

@@ -1748,6 +1748,8 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) {
 /// maps as well.
 unsigned DwarfDebug::GetOrCreateSourceID(StringRef DirName, StringRef FileName){
   unsigned DId;
+  assert (DirName.empty() == false && "Invalid directory name!");
+
   StringMap<unsigned>::iterator DI = DirectoryIdMap.find(DirName);
   if (DI != DirectoryIdMap.end()) {
     DId = DI->getValue();
@@ -1789,12 +1791,12 @@ DIE *DwarfDebug::getOrCreateNameSpace(DINameSpace NS) {
   TheCU->insertDIE(NS, NDie);
   if (!NS.getName().empty())
     addString(NDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, NS.getName());
-  addSourceLine(NDie, &NS);
+  addSourceLine(NDie, NS);
   addToContextOwner(NDie, NS.getContext());
   return NDie;
 }

-/// constructCompileUnit - Create new CompileUnit for the given 
+/// constructCompileUnit - Create new CompileUnit for the given
 /// metadata node with tag DW_TAG_compile_unit.
 void DwarfDebug::constructCompileUnit(const MDNode *N) {
   DICompileUnit DIUnit(N);
@@ -1812,9 +1814,12 @@ void DwarfDebug::constructCompileUnit(const MDNode *N) {
   // simplifies debug range entries.
   addUInt(Die, dwarf::DW_AT_entry_pc, dwarf::DW_FORM_addr, 0);
   // DW_AT_stmt_list is a offset of line number information for this
-  // compile unit in debug_line section. This offset is calculated
-  // during endMoudle().
-  addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);
+  // compile unit in debug_line section.
+  if (Asm->MAI->doesDwarfUsesAbsoluteLabelForStmtList())
+    addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_addr,
+             Asm->GetTempSymbol("section_line"));
+  else
+    addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);

   if (!Dir.empty())
     addString(Die, dwarf::DW_AT_comp_dir, dwarf::DW_FORM_string, Dir);
@@ -1865,64 +1870,98 @@ CompileUnit *DwarfDebug::getCompileUnit(const MDNode *N) const {
   return I->second;
 }

+/// isUnsignedDIType - Return true if type encoding is unsigned.
+static bool isUnsignedDIType(DIType Ty) {
+  DIDerivedType DTy(Ty);
+  if (DTy.Verify())
+    return isUnsignedDIType(DTy.getTypeDerivedFrom());
+
+  DIBasicType BTy(Ty);
+  if (BTy.Verify()) {
+    unsigned Encoding = BTy.getEncoding();
+    if (Encoding == dwarf::DW_ATE_unsigned ||
+        Encoding == dwarf::DW_ATE_unsigned_char)
+      return true;
+  }
+  return false;
+}

 /// constructGlobalVariableDIE - Construct global variable DIE.
 void DwarfDebug::constructGlobalVariableDIE(const MDNode *N) {
-  DIGlobalVariable DI_GV(N);
+  DIGlobalVariable GV(N);

   // If debug information is malformed then ignore it.
-  if (DI_GV.Verify() == false)
+  if (GV.Verify() == false)
     return;

   // Check for pre-existence.
   CompileUnit *TheCU = getCompileUnit(N);
-  if (TheCU->getDIE(DI_GV))
+  if (TheCU->getDIE(GV))
     return;

-  DIE *VariableDie = createGlobalVariableDIE(DI_GV);
-  if (!VariableDie)
-    return;
-
-  // Add to map.
-  TheCU->insertDIE(N, VariableDie);
+  DIType GTy = GV.getType();
+  DIE *VariableDIE = new DIE(GV.getTag());

-  // Add to context owner.
-  DIDescriptor GVContext = DI_GV.getContext();
-  // Do not create specification DIE if context is either compile unit
-  // or a subprogram.
+  bool isGlobalVariable = GV.getGlobal() != NULL;

+  // Add name.
+  addString(VariableDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string,
+            GV.getDisplayName());
+  StringRef LinkageName = GV.getLinkageName();
+  if (!LinkageName.empty() && isGlobalVariable)
+    addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string,
+              getRealLinkageName(LinkageName));
+  // Add type.
+  addType(VariableDIE, GTy);
   if (GTy.isCompositeType() && !GTy.getName().empty()
       && !GTy.isForwardDecl()) {
     DIEEntry *Entry = TheCU->getDIEEntry(GTy);
     assert(Entry && "Missing global type!");
     TheCU->addGlobalType(GTy.getName(), Entry->getEntry());
   }
+  // Add scoping info.
+  if (!GV.isLocalToUnit()) {
+    addUInt(VariableDIE, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+    // Expose as global.
+    TheCU->addGlobal(GV.getName(), VariableDIE);
+  }
+  // Add line number info.
+  addSourceLine(VariableDIE, GV);
+  // Add to map.
+  TheCU->insertDIE(N, VariableDIE);
+  // Add to context owner.
+  DIDescriptor GVContext = GV.getContext();
+  addToContextOwner(VariableDIE, GVContext);
+  // Add location.
+  if (isGlobalVariable) {
+    DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+    addLabel(Block, 0, dwarf::DW_FORM_udata,
+             Asm->Mang->getSymbol(GV.getGlobal()));
+    // Do not create specification DIE if context is either compile unit
+    // or a subprogram.
+    if (GV.isDefinition() && !GVContext.isCompileUnit() &&
+        !GVContext.isFile() && !isSubprogramContext(GVContext)) {
+      // Create specification DIE.
+      DIE *VariableSpecDIE = new DIE(dwarf::DW_TAG_variable);
+      addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
+                  dwarf::DW_FORM_ref4, VariableDIE);
+      addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
+      addUInt(VariableDIE, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+      TheCU->addDie(VariableSpecDIE);
+    } else {
+      addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+    }
+  } else if (Constant *C = GV.getConstant()) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+      if (isUnsignedDIType(GTy))
+        addUInt(VariableDIE, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata,
+                CI->getZExtValue());
+      else
+        addSInt(VariableDIE, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata,
                CI->getSExtValue());
+    }
+  }
   return;
 }
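[Editor's note] The rewritten constructGlobalVariableDIE now describes storage-less constants by value instead of by address. A hedged illustration of the intended encoding (the source declarations here are hypothetical, not from the patch):

    // static const int answer = -42;     => DW_AT_const_value, DW_FORM_sdata, -42
    // static const unsigned mask = 255;  => DW_AT_const_value, DW_FORM_udata, 255

isUnsignedDIType() recurses through derived types (typedefs, qualifiers) down to the underlying DIBasicType and keys off DW_ATE_unsigned / DW_ATE_unsigned_char, so a typedef of an unsigned type still selects the udata form.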
@@ -1965,7 +2004,7 @@ void DwarfDebug::beginModule(Module *M) {
   DbgFinder.processModule(*M);

   bool HasDebugInfo = false;
-  
+
   // Scan all the compile-units to see if there are any marked as the main unit.
   // if not, we do not generate debug info.
   for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
@@ -1975,15 +2014,15 @@ void DwarfDebug::beginModule(Module *M) {
       break;
     }
   }
-  
+
   if (!HasDebugInfo) return;

   // Tell MMI that we have debug info.
   MMI->setDebugInfoAvailability(true);
-  
+
   // Emit initial sections.
   EmitSectionLabels();
-  
+
   // Create all the compile unit DIEs.
   for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
          E = DbgFinder.compile_unit_end(); I != E; ++I)
@@ -1999,6 +2038,11 @@ void DwarfDebug::beginModule(Module *M) {
          E = DbgFinder.global_variable_end(); I != E; ++I)
     constructGlobalVariableDIE(*I);

+  //getOrCreateTypeDIE
+  if (NamedMDNode *NMD = M->getNamedMetadata("llvm.dbg.enum"))
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
+      getOrCreateTypeDIE(DIType(NMD->getOperand(i)));
+
   // Prime section data.
   SectionMap.insert(Asm->getObjFileLowering().getTextSection());

@@ -2025,6 +2069,7 @@ void DwarfDebug::beginModule(Module *M) {
 void DwarfDebug::endModule() {
   if (!FirstCU) return;
   const Module *M = MMI->getModule();
+  DenseMap<const MDNode *, DbgScope *> DeadFnScopeMap;
   if (NamedMDNode *AllSPs = M->getNamedMetadata("llvm.dbg.sp")) {
     for (unsigned SI = 0, SE = AllSPs->getNumOperands(); SI != SE; ++SI) {
       if (ProcessedSPNodes.count(AllSPs->getOperand(SI)) != 0) continue;
@@ -2032,25 +2077,27 @@ void DwarfDebug::endModule() {
       if (!SP.Verify()) continue;

       // Collect info for variables that were optimized out.
+      if (!SP.isDefinition()) continue;
       StringRef FName = SP.getLinkageName();
       if (FName.empty())
         FName = SP.getName();
-      NamedMDNode *NMD = 
+      NamedMDNode *NMD =
         M->getNamedMetadata(Twine("llvm.dbg.lv.", getRealLinkageName(FName)));
       if (!NMD) continue;
       unsigned E = NMD->getNumOperands();
       if (!E) continue;
       DbgScope *Scope = new DbgScope(NULL, DIDescriptor(SP), NULL);
+      DeadFnScopeMap[SP] = Scope;
       for (unsigned I = 0; I != E; ++I) {
         DIVariable DV(NMD->getOperand(I));
         if (!DV.Verify()) continue;
         Scope->addVariable(new DbgVariable(DV));
       }
-      
+
       // Construct subprogram DIE and add variables DIEs.
       constructSubprogramDIE(SP);
       DIE *ScopeDIE = getCompileUnit(SP)->getDIE(SP);
-      const SmallVector<DbgVariable *, 8> &Variables = Scope->getVariables();
+      const SmallVector<DbgVariable *, 8> &Variables = Scope->getDbgVariables();
       for (unsigned i = 0, N = Variables.size(); i < N; ++i) {
         DIE *VariableDIE = constructVariableDIE(Variables[i], Scope);
         if (VariableDIE)
@@ -2099,15 +2146,15 @@ void DwarfDebug::endModule() {
   // Compute DIE offsets and sizes.
   computeSizeAndOffsets();

-  // Emit source line correspondence into a debug line section.
-  emitDebugLines();
-
   // Emit all the DIEs into a debug info section
   emitDebugInfo();

   // Corresponding abbreviations into a abbrev section.
   emitAbbreviations();

+  // Emit source line correspondence into a debug line section.
+  emitDebugLines();
+
   // Emit info into a debug pubnames section.
   emitDebugPubNames();

@@ -2131,7 +2178,9 @@ void DwarfDebug::endModule() {

   // Emit info into a debug str section.
   emitDebugStr();
-  
+
+  // clean up.
+  DeleteContainerSeconds(DeadFnScopeMap);
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
          E = CUMap.end(); I != E; ++I)
     delete I->second;
@@ -2139,7 +2188,7 @@ void DwarfDebug::endModule() {
 }

 /// findAbstractVariable - Find abstract variable, if any, associated with Var.
-DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &Var, 
+DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &Var,
                                               DebugLoc ScopeLoc) {

   DbgVariable *AbsDbgVariable = AbstractVariables.lookup(Var);
@@ -2159,7 +2208,7 @@ DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &Var,

 /// collectVariableInfoFromMMITable - Collect variable information from
 /// side table maintained by MMI.
-void 
+void
 DwarfDebug::collectVariableInfoFromMMITable(const MachineFunction * MF,
                                    SmallPtrSet<const MDNode *, 16> &Processed) {
   const LLVMContext &Ctx = Asm->MF->getFunction()->getContext();
@@ -2177,7 +2226,7 @@ DwarfDebug::collectVariableInfoFromMMITable(const MachineFunction * MF,
       Scope = ConcreteScopes.lookup(IA);
     if (Scope == 0)
       Scope = DbgScopeMap.lookup(VP.second.getScope(Ctx));
-    
+
     // If variable scope is not found then skip this variable.
     if (Scope == 0)
       continue;
@@ -2193,7 +2242,7 @@ DwarfDebug::collectVariableInfoFromMMITable(const MachineFunction * MF,
   }
 }

-/// isDbgValueInUndefinedReg - Return true if debug value, encoded by 
+/// isDbgValueInUndefinedReg - Return true if debug value, encoded by
 /// DBG_VALUE instruction, is in undefined reg.
 static bool isDbgValueInUndefinedReg(const MachineInstr *MI) {
   assert (MI->isDebugValue() && "Invalid DBG_VALUE machine instruction!");
@@ -2202,7 +2251,7 @@ static bool isDbgValueInUndefinedReg(const MachineInstr *MI) {
   return false;
 }

-/// isDbgValueInDefinedReg - Return true if debug value, encoded by 
+/// isDbgValueInDefinedReg - Return true if debug value, encoded by
 /// DBG_VALUE instruction, is in a defined reg.
 static bool isDbgValueInDefinedReg(const MachineInstr *MI) {
   assert (MI->isDebugValue() && "Invalid DBG_VALUE machine instruction!");
@@ -2212,10 +2261,10 @@ static bool isDbgValueInDefinedReg(const MachineInstr *MI) {
 }

 /// collectVariableInfo - Populate DbgScope entries with variables' info.
-void 
+void
 DwarfDebug::collectVariableInfo(const MachineFunction *MF,
                                 SmallPtrSet<const MDNode *, 16> &Processed) {
-  
+
   /// collection info from MMI table.
   collectVariableInfoFromMMITable(MF, Processed);

@@ -2244,11 +2293,11 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
       continue;

     const MachineInstr *PrevMI = MInsn;
-    for (SmallVector<const MachineInstr *, 8>::iterator MI = I+1, 
+    for (SmallVector<const MachineInstr *, 8>::iterator MI = I+1,
            ME = DbgValues.end(); MI != ME; ++MI) {
-      const MDNode *Var = 
+      const MDNode *Var =
         (*MI)->getOperand((*MI)->getNumOperands()-1).getMetadata();
-      if (Var == DV && isDbgValueInDefinedReg(*MI) && 
+      if (Var == DV && isDbgValueInDefinedReg(*MI) &&
           !PrevMI->isIdenticalTo(*MI))
         MultipleValues.push_back(*MI);
       PrevMI = *MI;
@@ -2269,7 +2318,7 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
     DbgVariable *RegVar = new DbgVariable(DV);
     Scope->addVariable(RegVar);
     if (!CurFnArg)
-      DbgVariableLabelsMap[RegVar] = getLabelBeforeInsn(MInsn); 
+      DbgVariableLabelsMap[RegVar] = getLabelBeforeInsn(MInsn);
     if (DbgVariable *AbsVar = findAbstractVariable(DV, MInsn->getDebugLoc())) {
       DbgVariableToDbgInstMap[AbsVar] = MInsn;
       VarToAbstractVarMap[RegVar] = AbsVar;
@@ -2286,26 +2335,39 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
       RegVar->setDotDebugLocOffset(DotDebugLocEntries.size());
       const MachineInstr *Begin = NULL;
       const MachineInstr *End = NULL;
-      for (SmallVector<const MachineInstr *, 4>::iterator
-             MVI = MultipleValues.begin(), MVE = MultipleValues.end();
+      for (SmallVector<const MachineInstr *, 4>::iterator
+             MVI = MultipleValues.begin(), MVE = MultipleValues.end();
            MVI != MVE; ++MVI) {
         if (!Begin) {
           Begin = *MVI;
           continue;
-        } 
+        }
         End = *MVI;
         MachineLocation MLoc;
-        MLoc.set(Begin->getOperand(0).getReg(), 0);
+        if (Begin->getNumOperands() == 3) {
+          if (Begin->getOperand(0).isReg() && Begin->getOperand(1).isImm())
+            MLoc.set(Begin->getOperand(0).getReg(), Begin->getOperand(1).getImm());
+        } else
+          MLoc = Asm->getDebugValueLocation(Begin);
+
         const MCSymbol *FLabel = getLabelBeforeInsn(Begin);
         const MCSymbol *SLabel = getLabelBeforeInsn(End);
-        DotDebugLocEntries.push_back(DotDebugLocEntry(FLabel, SLabel, MLoc));
+        if (MLoc.getReg())
+          DotDebugLocEntries.push_back(DotDebugLocEntry(FLabel, SLabel, MLoc));
+
         Begin = End;
         if (MVI + 1 == MVE) {
           // If End is the last instruction then its value is valid
           // until the end of the funtion.
-          MLoc.set(End->getOperand(0).getReg(), 0);
-          DotDebugLocEntries.
-            push_back(DotDebugLocEntry(SLabel, FunctionEndSym, MLoc));
+          MachineLocation EMLoc;
+          if (End->getNumOperands() == 3) {
+            if (End->getOperand(0).isReg() && Begin->getOperand(1).isImm())
+              EMLoc.set(Begin->getOperand(0).getReg(), Begin->getOperand(1).getImm());
+          } else
+            EMLoc = Asm->getDebugValueLocation(End);
+          if (EMLoc.getReg())
+            DotDebugLocEntries.
+              push_back(DotDebugLocEntry(SLabel, FunctionEndSym, EMLoc));
         }
       }
       DotDebugLocEntries.push_back(DotDebugLocEntry());
@@ -2314,11 +2376,11 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,

   // Collect info for variables that were optimized out.
   const Function *F = MF->getFunction();
   const Module *M = F->getParent();
-  if (NamedMDNode *NMD = 
-      M->getNamedMetadata(Twine("llvm.dbg.lv.", 
+  if (NamedMDNode *NMD =
+      M->getNamedMetadata(Twine("llvm.dbg.lv.",
                                 getRealLinkageName(F->getName())))) {
     for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
-      DIVariable DV(cast_or_null<MDNode>(NMD->getOperand(i)));
+      DIVariable DV(cast<MDNode>(NMD->getOperand(i)));
       if (!DV || !Processed.insert(DV))
         continue;
       DbgScope *Scope = DbgScopeMap.lookup(DV.getContext());
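[Editor's note] Both new branches in the hunk above decode the three-operand DBG_VALUE shape (register, immediate offset, variable metadata) and delegate anything else to the target hook Asm->getDebugValueLocation(). A minimal sketch of that decode factored into one place (a hypothetical helper, not part of the patch):

    static MachineLocation decodeDbgValue(const AsmPrinter *Asm,
                                          const MachineInstr *MI) {
      MachineLocation Loc;  // Loc.getReg() == 0 marks an unusable entry.
      if (MI->getNumOperands() == 3) {
        if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
          Loc.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
        return Loc;
      }
      return Asm->getDebugValueLocation(MI);  // Target-specific forms.
    }

One caution for readers: the last-entry branch tests End->getOperand(0).isReg() but then builds EMLoc from Begin's operands; that reads like a copy-paste slip in the patch rather than intent, so double-check against a later revision before reusing the logic.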
@@ -2364,7 +2426,7 @@ void DwarfDebug::beginScope(const MachineInstr *MI) {
     return;
   }

-  // If location is unknown then use temp label for this DBG_VALUE 
+  // If location is unknown then use temp label for this DBG_VALUE
   // instruction.
   if (MI->isDebugValue()) {
     PrevLabel = MMI->getContext().CreateTempSymbol();
@@ -2393,7 +2455,7 @@ void DwarfDebug::endScope(const MachineInstr *MI) {
 }

 /// getOrCreateDbgScope - Create DbgScope for the scope.
-DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope, 
+DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope,
                                           const MDNode *InlinedAt) {
   if (!InlinedAt) {
     DbgScope *WScope = DbgScopeMap.lookup(Scope);
@@ -2402,7 +2464,7 @@ DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope,
     WScope = new DbgScope(NULL, DIDescriptor(Scope), NULL);
     DbgScopeMap.insert(std::make_pair(Scope, WScope));
     if (DIDescriptor(Scope).isLexicalBlock()) {
-      DbgScope *Parent = 
+      DbgScope *Parent =
         getOrCreateDbgScope(DILexicalBlock(Scope).getContext(), NULL);
       WScope->setParent(Parent);
       Parent->addScope(WScope);
@@ -2419,7 +2481,7 @@ DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope,
         DISubprogram(Scope).getFunction() == Asm->MF->getFunction())
       CurrentFnDbgScope = WScope;
   }
-  
+
   return WScope;
 }

@@ -2448,14 +2510,14 @@ static bool hasValidLocation(LLVMContext &Ctx,
                              const MDNode *&Scope, const MDNode *&InlinedAt) {
   DebugLoc DL = MInsn->getDebugLoc();
   if (DL.isUnknown()) return false;
-  
+
   const MDNode *S = DL.getScope(Ctx);
-  
+
   // There is no need to create another DIE for compile unit. For all
   // other scopes, create one DbgScope now. This will be translated
   // into a scope DIE at the end.
   if (DIScope(S).isCompileUnit()) return false;
-  
+
   Scope = S;
   InlinedAt = DL.getInlinedAt(Ctx);
   return true;
@@ -2490,7 +2552,7 @@ static void calculateDominanceGraph(DbgScope *Scope) {
 }

 /// printDbgScopeInfo - Print DbgScope info for each machine instruction.
-static 
+static
 void printDbgScopeInfo(LLVMContext &Ctx, const MachineFunction *MF,
                        DenseMap<const MachineInstr *, DbgScope *> &MI2ScopeMap) {
@@ -2507,9 +2569,9 @@ void printDbgScopeInfo(LLVMContext &Ctx, const MachineFunction *MF,
       // Check if instruction has valid location information.
       if (hasValidLocation(Ctx, MInsn, Scope, InlinedAt)) {
         dbgs() << " [ ";
-        if (InlinedAt) 
+        if (InlinedAt)
           dbgs() << "*";
-        DenseMap<const MachineInstr *, DbgScope *>::iterator DI = 
+        DenseMap<const MachineInstr *, DbgScope *>::iterator DI =
           MI2ScopeMap.find(MInsn);
         if (DI != MI2ScopeMap.end()) {
           DbgScope *S = DI->second;
@@ -2517,7 +2579,7 @@ void printDbgScopeInfo(LLVMContext &Ctx, const MachineFunction *MF,
           PrevDFSIn = S->getDFSIn();
         } else
           dbgs() << PrevDFSIn;
-      } else 
+      } else
         dbgs() << " [ x" << PrevDFSIn;
       dbgs() << " ]";
       MInsn->dump();
@@ -2555,26 +2617,26 @@ bool DwarfDebug::extractScopeInformation() {
         PrevMI = MInsn;
         continue;
       }
-      
+
       // If scope has not changed then skip this instruction.
       if (Scope == PrevScope && PrevInlinedAt == InlinedAt) {
         PrevMI = MInsn;
         continue;
       }

-      if (RangeBeginMI) {      
-        // If we have alread seen a beginning of a instruction range and 
+      if (RangeBeginMI) {
+        // If we have alread seen a beginning of a instruction range and
         // current instruction scope does not match scope of first instruction
         // in this range then create a new instruction range.
         DbgRange R(RangeBeginMI, PrevMI);
-        MI2ScopeMap[RangeBeginMI] = getOrCreateDbgScope(PrevScope, 
+        MI2ScopeMap[RangeBeginMI] = getOrCreateDbgScope(PrevScope,
                                                         PrevInlinedAt);
         MIRanges.push_back(R);
-      } 
+      }

       // This is a beginning of a new instruction range.
       RangeBeginMI = MInsn;
-      
+
       // Reset previous markers.
       PrevMI = MInsn;
       PrevScope = Scope;
@@ -2588,7 +2650,7 @@ bool DwarfDebug::extractScopeInformation() {
     MIRanges.push_back(R);
     MI2ScopeMap[RangeBeginMI] = getOrCreateDbgScope(PrevScope, PrevInlinedAt);
   }
-  
+
   if (!CurrentFnDbgScope)
     return false;
@@ -2618,7 +2680,7 @@ bool DwarfDebug::extractScopeInformation() {
   return !DbgScopeMap.empty();
 }

-/// identifyScopeMarkers() - 
+/// identifyScopeMarkers() -
 /// Each DbgScope has first instruction and last instruction to mark beginning
 /// and end of a scope respectively. Create an inverse map that list scopes
 /// starts (and ends) with an instruction. One instruction may start (or end)
@@ -2628,23 +2690,23 @@ void DwarfDebug::identifyScopeMarkers() {
   WorkList.push_back(CurrentFnDbgScope);
   while (!WorkList.empty()) {
     DbgScope *S = WorkList.pop_back_val();
-    
+
     const SmallVector<DbgScope *, 4> &Children = S->getScopes();
-    if (!Children.empty()) 
+    if (!Children.empty())
       for (SmallVector<DbgScope *, 4>::const_iterator SI = Children.begin(),
              SE = Children.end(); SI != SE; ++SI)
         WorkList.push_back(*SI);

     if (S->isAbstractScope())
       continue;
-    
+
     const SmallVector<DbgRange, 4> &Ranges = S->getRanges();
     if (Ranges.empty())
       continue;

     for (SmallVector<DbgRange, 4>::const_iterator RI = Ranges.begin(),
            RE = Ranges.end(); RI != RE; ++RI) {
-      assert(RI->first && "DbgRange does not have first instruction!");      
-      assert(RI->second && "DbgRange does not have second instruction!");      
+      assert(RI->first && "DbgRange does not have first instruction!");
+      assert(RI->second && "DbgRange does not have second instruction!");
       InsnsEndScopeSet.insert(RI->second);
     }
   }
@@ -2680,20 +2742,23 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
   // function.
   DebugLoc FDL = FindFirstDebugLoc(MF);
   if (FDL.isUnknown()) return;
-  
+
   const MDNode *Scope = FDL.getScope(MF->getFunction()->getContext());
-  
+  const MDNode *TheScope = 0;
+
   DISubprogram SP = getDISubprogram(Scope);
   unsigned Line, Col;
   if (SP.Verify()) {
     Line = SP.getLineNumber();
     Col = 0;
+    TheScope = SP;
   } else {
     Line = FDL.getLine();
     Col = FDL.getCol();
+    TheScope = Scope;
   }
-  
-  recordSourceLine(Line, Col, Scope);
+
+  recordSourceLine(Line, Col, TheScope);

   /// ProcessedArgs - Collection of arguments already processed.
   SmallPtrSet<const MDNode *, 8> ProcessedArgs;
@@ -2710,7 +2775,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
         DIVariable DV(MI->getOperand(MI->getNumOperands() - 1).getMetadata());
         if (!DV.Verify()) continue;
         // If DBG_VALUE is for a local variable then it needs a label.
-        if (DV.getTag() != dwarf::DW_TAG_arg_variable 
+        if (DV.getTag() != dwarf::DW_TAG_arg_variable
             && isDbgValueInUndefinedReg(MI) == false)
           InsnNeedsLabel.insert(MI);
         // DBG_VALUE for inlined functions argument needs a label.
@@ -2718,10 +2783,11 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
                  describes(MF->getFunction()))
           InsnNeedsLabel.insert(MI);
         // DBG_VALUE indicating argument location change needs a label.
-        else if (isDbgValueInUndefinedReg(MI) == false && !ProcessedArgs.insert(DV))
+        else if (isDbgValueInUndefinedReg(MI) == false
+                 && !ProcessedArgs.insert(DV))
           InsnNeedsLabel.insert(MI);
       } else {
-        // If location is unknown then instruction needs a location only if 
+        // If location is unknown then instruction needs a location only if
         // UnknownLocations flag is set.
         if (DL.isUnknown()) {
           if (UnknownLocations && !PrevLoc.isUnknown())
@@ -2730,7 +2796,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
           // Otherwise, instruction needs a location only if it is new location.
           InsnNeedsLabel.insert(MI);
       }
-      
+
       if (!DL.isUnknown() || UnknownLocations)
         PrevLoc = DL;
     }
@@ -2750,7 +2816,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
                                       Asm->getFunctionNumber());
   // Assumes in correct section after the entry point.
   Asm->OutStreamer.EmitLabel(FunctionEndSym);
-  
+
   SmallPtrSet<const MDNode *, 16> ProcessedVars;
   collectVariableInfo(MF, ProcessedVars);
@@ -2764,7 +2830,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
       SectionLineInfos.insert(SectionLineInfos.end(),
                               Lines.begin(), Lines.end());
     }
-    
+
     // Construct abstract scopes.
     for (SmallVector<DbgScope *, 4>::iterator AI = AbstractScopesList.begin(),
            AE = AbstractScopesList.end(); AI != AE; ++AI) {
@@ -2775,11 +2841,11 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
         if (FName.empty())
           FName = SP.getName();
         const Module *M = MF->getFunction()->getParent();
-        if (NamedMDNode *NMD = 
-            M->getNamedMetadata(Twine("llvm.dbg.lv.", 
+        if (NamedMDNode *NMD =
+            M->getNamedMetadata(Twine("llvm.dbg.lv.",
                                       getRealLinkageName(FName)))) {
           for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
-            DIVariable DV(cast_or_null<MDNode>(NMD->getOperand(i)));
+            DIVariable DV(cast<MDNode>(NMD->getOperand(i)));
             if (!DV || !ProcessedVars.insert(DV))
               continue;
             DbgScope *Scope = AbstractScopes.lookup(DV.getContext());
@@ -2793,9 +2859,9 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
     }

     DIE *CurFnDIE = constructScopeDIE(CurrentFnDbgScope);
-    
+
     if (!DisableFramePointerElim(*MF))
-      addUInt(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr, 
+      addUInt(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr,
               dwarf::DW_FORM_flag, 1);

@@ -2849,22 +2915,22 @@ const MCSymbol *DwarfDebug::findVariableLabel(const DbgVariable *V) {
   else return I->second;
 }

-/// findDbgScope - Find DbgScope for the debug loc attached with an 
+/// findDbgScope - Find DbgScope for the debug loc attached with an
 /// instruction.
 DbgScope *DwarfDebug::findDbgScope(const MachineInstr *MInsn) {
   DbgScope *Scope = NULL;
-  LLVMContext &Ctx = 
+  LLVMContext &Ctx =
     MInsn->getParent()->getParent()->getFunction()->getContext();
   DebugLoc DL = MInsn->getDebugLoc();

-  if (DL.isUnknown()) 
+  if (DL.isUnknown())
     return Scope;

   if (const MDNode *IA = DL.getInlinedAt(Ctx))
     Scope = ConcreteScopes.lookup(IA);
   if (Scope == 0)
     Scope = DbgScopeMap.lookup(DL.getScope(Ctx));
-    
+
   return Scope;
 }

@@ -2872,7 +2938,7 @@ DbgScope *DwarfDebug::findDbgScope(const MachineInstr *MInsn) {
 /// recordSourceLine - Register a source line with debug info. Returns the
 /// unique label that was emitted and which provides correspondence to
 /// the source line list.
-MCSymbol *DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, 
+MCSymbol *DwarfDebug::recordSourceLine(unsigned Line, unsigned Col,
                                        const MDNode *S) {
   StringRef Dir;
   StringRef Fn;
@@ -2899,16 +2965,6 @@ MCSymbol *DwarfDebug::recordSourceLine(unsigned Line, unsigned Col,
     Src = GetOrCreateSourceID(Dir, Fn);
   }

-#if 0
-  if (!Lines.empty()) {
-    SrcLineInfo lastSrcLineInfo = Lines.back();
-    // Emitting sequential line records with the same line number (but
-    // different addresses) seems to confuse GDB. Avoid this.
-    if (lastSrcLineInfo.getLine() == Line)
-      return NULL;
-  }
-#endif
-
   MCSymbol *Label = MMI->getContext().CreateTempSymbol();
   Lines.push_back(SrcLineInfo(Line, Col, Src, Label));

@@ -2991,7 +3047,7 @@ static MCSymbol *EmitSectionSym(AsmPrinter *Asm, const MCSection *Section,
                                 const char *SymbolStem = 0) {
   Asm->OutStreamer.SwitchSection(Section);
   if (!SymbolStem) return 0;
-  
+
   MCSymbol *TmpSym = Asm->GetTempSymbol(SymbolStem);
   Asm->OutStreamer.EmitLabel(TmpSym);
   return TmpSym;
@@ -3008,21 +3064,20 @@ void DwarfDebug::EmitSectionLabels() {
     EmitSectionSym(Asm, TLOF.getDwarfFrameSection(), "section_debug_frame");
   }

-  DwarfInfoSectionSym = 
+  DwarfInfoSectionSym =
     EmitSectionSym(Asm, TLOF.getDwarfInfoSection(), "section_info");
-  DwarfAbbrevSectionSym = 
+  DwarfAbbrevSectionSym =
     EmitSectionSym(Asm, TLOF.getDwarfAbbrevSection(), "section_abbrev");
   EmitSectionSym(Asm, TLOF.getDwarfARangesSection());
-  
+
   if (const MCSection *MacroInfo = TLOF.getDwarfMacroInfoSection())
     EmitSectionSym(Asm, MacroInfo);

-  DwarfDebugLineSectionSym =
-    EmitSectionSym(Asm, TLOF.getDwarfLineSection(), "section_line");
+  EmitSectionSym(Asm, TLOF.getDwarfLineSection(), "section_line");
   EmitSectionSym(Asm, TLOF.getDwarfLocSection());
   EmitSectionSym(Asm, TLOF.getDwarfPubNamesSection());
   EmitSectionSym(Asm, TLOF.getDwarfPubTypesSection());
-  DwarfStrSectionSym = 
+  DwarfStrSectionSym =
     EmitSectionSym(Asm, TLOF.getDwarfStrSection(), "section_str");
   DwarfDebugRangeSectionSym = EmitSectionSym(Asm, TLOF.getDwarfRangesSection(),
                                              "debug_range");
@@ -3060,7 +3115,7 @@ void DwarfDebug::emitDIE(DIE *Die) {

     if (Asm->isVerbose())
       Asm->OutStreamer.AddComment(dwarf::AttributeString(Attr));
-    
+
     switch (Attr) {
     case dwarf::DW_AT_sibling:
       Asm->EmitInt32(Die->getSiblingOffset());
@@ -3075,15 +3130,17 @@ void DwarfDebug::emitDIE(DIE *Die) {
     case dwarf::DW_AT_ranges: {
       // DW_AT_range Value encodes offset in debug_range section.
       DIEInteger *V = cast<DIEInteger>(Values[i]);
-      Asm->EmitLabelOffsetDifference(DwarfDebugRangeSectionSym,
-                                     V->getValue(),
-                                     DwarfDebugRangeSectionSym,
-                                     4);
-      break;
-    }
-    case dwarf::DW_AT_stmt_list: {
-      Asm->EmitLabelDifference(CurrentLineSectionSym,
-                               DwarfDebugLineSectionSym, 4);
+
+      if (Asm->MAI->doesDwarfUsesLabelOffsetForRanges()) {
+        Asm->EmitLabelPlusOffset(DwarfDebugRangeSectionSym,
+                                 V->getValue(),
+                                 4);
+      } else {
+        Asm->EmitLabelOffsetDifference(DwarfDebugRangeSectionSym,
+                                       V->getValue(),
+                                       DwarfDebugRangeSectionSym,
+                                       4);
+      }
       break;
     }
     case dwarf::DW_AT_location: {
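[Editor's note] The DW_AT_ranges case above now picks between two spellings of the same section-relative offset, keyed on the new MCAsmInfo hook doesDwarfUsesLabelOffsetForRanges(). Informally, for an entry Off bytes into .debug_ranges:

    // label-plus-offset form:  emit the value of (debug_range + Off)
    // label-difference form:   emit (debug_range + Off) - debug_range

Both reduce to Off relative to the section start; the split presumably exists because not every object format can relocate both shapes. The old DW_AT_stmt_list case disappears here because constructCompileUnit (earlier in this diff) now emits the attribute directly, which is also why CurrentLineSectionSym and DwarfDebugLineSectionSym are deleted from DwarfDebug.h below.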
@@ -3124,18 +3181,18 @@ void DwarfDebug::emitDebugInfo() {
          E = CUMap.end(); I != E; ++I) {
     CompileUnit *TheCU = I->second;
     DIE *Die = TheCU->getCUDie();
-    
+
     // Emit the compile units header.
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("info_begin",
                                                   TheCU->getID()));
-    
+
     // Emit size of content not including length itself
     unsigned ContentSize = Die->getSize() +
       sizeof(int16_t) + // DWARF version number
       sizeof(int32_t) + // Offset Into Abbrev. Section
       sizeof(int8_t) +  // Pointer Size (in bytes)
       sizeof(int32_t);  // FIXME - extra pad for gdb bug.
-    
+
     Asm->OutStreamer.AddComment("Length of Compilation Unit Info");
     Asm->EmitInt32(ContentSize);
     Asm->OutStreamer.AddComment("DWARF version number");
@@ -3145,7 +3202,7 @@ void DwarfDebug::emitDebugInfo() {
                            DwarfAbbrevSectionSym);
     Asm->OutStreamer.AddComment("Address Size (in bytes)");
     Asm->EmitInt8(Asm->getTargetData().getPointerSize());
-    
+
     emitDIE(Die);
     // FIXME - extra padding for gdb bug.
     Asm->OutStreamer.AddComment("4 extra padding bytes for GDB");
@@ -3194,7 +3251,7 @@ void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) {
   // Define last address of section.
   Asm->OutStreamer.AddComment("Extended Op");
   Asm->EmitInt8(0);
-  
+
   Asm->OutStreamer.AddComment("Op size");
   Asm->EmitInt8(Asm->getTargetData().getPointerSize() + 1);
   Asm->OutStreamer.AddComment("DW_LNE_set_address");
@@ -3231,15 +3288,13 @@ void DwarfDebug::emitDebugLines() {
                             Asm->getObjFileLowering().getDwarfLineSection());

   // Construct the section header.
-  CurrentLineSectionSym = Asm->GetTempSymbol("section_line_begin");
-  Asm->OutStreamer.EmitLabel(CurrentLineSectionSym);
   Asm->OutStreamer.AddComment("Length of Source Line Info");
   Asm->EmitLabelDifference(Asm->GetTempSymbol("line_end"),
                            Asm->GetTempSymbol("line_begin"), 4);
   Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("line_begin"));

   Asm->OutStreamer.AddComment("DWARF version number");
-  Asm->EmitInt16(dwarf::DWARF_VERSION); 
+  Asm->EmitInt16(dwarf::DWARF_VERSION);

   Asm->OutStreamer.AddComment("Prolog Length");
   Asm->EmitLabelDifference(Asm->GetTempSymbol("line_prolog_end"),
@@ -3294,7 +3349,7 @@ void DwarfDebug::emitDebugLines() {
     const std::string &FN = getSourceFileName(Id.second);
     if (Asm->isVerbose()) Asm->OutStreamer.AddComment("Source");
     Asm->OutStreamer.EmitBytes(StringRef(FN.c_str(), FN.size()+1), 0);
-    
+
     Asm->EmitULEB128(Id.first, "Directory #");
     Asm->EmitULEB128(0, "Mod date");
     Asm->EmitULEB128(0, "File size");
@@ -3338,18 +3393,18 @@ void DwarfDebug::emitDebugLines() {
       Asm->EmitInt8(Asm->getTargetData().getPointerSize() + 1);

       Asm->OutStreamer.AddComment("DW_LNE_set_address");
-      Asm->EmitInt8(dwarf::DW_LNE_set_address); 
+      Asm->EmitInt8(dwarf::DW_LNE_set_address);

       Asm->OutStreamer.AddComment("Location label");
       Asm->OutStreamer.EmitSymbolValue(Label,
                                        Asm->getTargetData().getPointerSize(),
                                        0/*AddrSpace*/);
-      
+
       // If change of source, then switch to the new source.
       if (Source != LineInfo.getSourceID()) {
         Source = LineInfo.getSourceID();
         Asm->OutStreamer.AddComment("DW_LNS_set_file");
-        Asm->EmitInt8(dwarf::DW_LNS_set_file); 
+        Asm->EmitInt8(dwarf::DW_LNS_set_file);
         Asm->EmitULEB128(Source, "New Source");
       }

@@ -3457,7 +3512,7 @@ emitFunctionDebugFrame(const FunctionDebugFrameInfo &DebugFrameInfo) {
   Asm->OutStreamer.EmitLabel(DebugFrameBegin);

   Asm->OutStreamer.AddComment("FDE CIE offset");
-  Asm->EmitSectionOffset(Asm->GetTempSymbol("debug_frame_common"), 
+  Asm->EmitSectionOffset(Asm->GetTempSymbol("debug_frame_common"),
                          DwarfFrameSectionSym);

   Asm->OutStreamer.AddComment("FDE initial location");
@@ -3466,8 +3521,8 @@ emitFunctionDebugFrame(const FunctionDebugFrameInfo &DebugFrameInfo) {
   Asm->OutStreamer.EmitSymbolValue(FuncBeginSym,
                                    Asm->getTargetData().getPointerSize(),
                                    0/*AddrSpace*/);
-  
-  
+
+
   Asm->OutStreamer.AddComment("FDE address range");
   Asm->EmitLabelDifference(Asm->GetTempSymbol("func_end",DebugFrameInfo.Number),
                            FuncBeginSym, Asm->getTargetData().getPointerSize());
@@ -3487,41 +3542,41 @@ void DwarfDebug::emitDebugPubNames() {
   // Start the dwarf pubnames section.
   Asm->OutStreamer.SwitchSection(
     Asm->getObjFileLowering().getDwarfPubNamesSection());
-  
+
   Asm->OutStreamer.AddComment("Length of Public Names Info");
   Asm->EmitLabelDifference(
     Asm->GetTempSymbol("pubnames_end", TheCU->getID()),
     Asm->GetTempSymbol("pubnames_begin", TheCU->getID()), 4);
-  
+
   Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubnames_begin",
                                                 TheCU->getID()));
-  
+
   Asm->OutStreamer.AddComment("DWARF Version");
-  Asm->EmitInt16(dwarf::DWARF_VERSION); 
-  
+  Asm->EmitInt16(dwarf::DWARF_VERSION);
+
   Asm->OutStreamer.AddComment("Offset of Compilation Unit Info");
-  Asm->EmitSectionOffset(Asm->GetTempSymbol("info_begin", TheCU->getID()), 
+  Asm->EmitSectionOffset(Asm->GetTempSymbol("info_begin", TheCU->getID()),
                          DwarfInfoSectionSym);
-  
+
   Asm->OutStreamer.AddComment("Compilation Unit Length");
   Asm->EmitLabelDifference(Asm->GetTempSymbol("info_end", TheCU->getID()),
                            Asm->GetTempSymbol("info_begin", TheCU->getID()),
                            4);
-  
+
   const StringMap<DIE*> &Globals = TheCU->getGlobals();
   for (StringMap<DIE*>::const_iterator
          GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
     const char *Name = GI->getKeyData();
     DIE *Entity = GI->second;
-    
+
     Asm->OutStreamer.AddComment("DIE offset");
     Asm->EmitInt32(Entity->getOffset());
-    
+
     if (Asm->isVerbose())
       Asm->OutStreamer.AddComment("External Name");
     Asm->OutStreamer.EmitBytes(StringRef(Name, strlen(Name)+1), 0);
   }
-  
+
   Asm->OutStreamer.AddComment("End Mark");
   Asm->EmitInt32(0);
   Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubnames_end",
@@ -3540,37 +3595,37 @@ void DwarfDebug::emitDebugPubTypes() {
     Asm->EmitLabelDifference(
       Asm->GetTempSymbol("pubtypes_end", TheCU->getID()),
       Asm->GetTempSymbol("pubtypes_begin", TheCU->getID()), 4);
-    
+
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubtypes_begin",
                                                   TheCU->getID()));
-    
+
     if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DWARF Version");
     Asm->EmitInt16(dwarf::DWARF_VERSION);
-    
+
     Asm->OutStreamer.AddComment("Offset of Compilation Unit Info");
     Asm->EmitSectionOffset(Asm->GetTempSymbol("info_begin", TheCU->getID()),
                            DwarfInfoSectionSym);
-    
+
     Asm->OutStreamer.AddComment("Compilation Unit Length");
     Asm->EmitLabelDifference(Asm->GetTempSymbol("info_end", TheCU->getID()),
                              Asm->GetTempSymbol("info_begin", TheCU->getID()),
                              4);
-    
+
     const StringMap<DIE*> &Globals = TheCU->getGlobalTypes();
     for (StringMap<DIE*>::const_iterator
            GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
       const char *Name = GI->getKeyData();
       DIE * Entity = GI->second;
-      
+
       if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DIE offset");
       Asm->EmitInt32(Entity->getOffset());
-      
+
       if (Asm->isVerbose()) Asm->OutStreamer.AddComment("External Name");
       Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength()+1), 0);
     }
-    
+
     Asm->OutStreamer.AddComment("End Mark");
-    Asm->EmitInt32(0); 
+    Asm->EmitInt32(0);
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubtypes_end",
                                                   TheCU->getID()));
   }
@@ -3581,26 +3636,26 @@ void DwarfDebug::emitDebugPubTypes() {
 void DwarfDebug::emitDebugStr() {
   // Check to see if it is worth the effort.
   if (StringPool.empty()) return;
-  
+
   // Start the dwarf str section.
   Asm->OutStreamer.SwitchSection(
                                 Asm->getObjFileLowering().getDwarfStrSection());

   // Get all of the string pool entries and put them in an array by their ID so
   // we can sort them.
-  SmallVector<std::pair<unsigned, 
+  SmallVector<std::pair<unsigned,
       StringMapEntry<std::pair<MCSymbol*, unsigned> >*>, 64> Entries;
-  
+
   for (StringMap<std::pair<MCSymbol*, unsigned> >::iterator
        I = StringPool.begin(), E = StringPool.end(); I != E; ++I)
     Entries.push_back(std::make_pair(I->second.second, &*I));
-  
+
   array_pod_sort(Entries.begin(), Entries.end());
-  
+
   for (unsigned i = 0, e = Entries.size(); i != e; ++i) {
     // Emit a label for reference from debug information entries.
     Asm->OutStreamer.EmitLabel(Entries[i].second->getValue().first);
-    
+
     // Emit the string itself.
     Asm->OutStreamer.EmitBytes(Entries[i].second->getKey(), 0/*addrspace*/);
   }
@@ -3618,8 +3673,8 @@ void DwarfDebug::emitDebugLoc() {
   unsigned char Size = Asm->getTargetData().getPointerSize();
   Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_loc", 0));
   unsigned index = 1;
-  for (SmallVector<DotDebugLocEntry, 4>::iterator 
-         I = DotDebugLocEntries.begin(), E = DotDebugLocEntries.end(); 
+  for (SmallVector<DotDebugLocEntry, 4>::iterator
+         I = DotDebugLocEntries.begin(), E = DotDebugLocEntries.end();
        I != E; ++I, ++index) {
     DotDebugLocEntry Entry = *I;
     if (Entry.isEmpty()) {
@@ -3631,15 +3686,30 @@ void DwarfDebug::emitDebugLoc() {
       Asm->OutStreamer.EmitSymbolValue(Entry.End, Size, 0);
       const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
       unsigned Reg = RI->getDwarfRegNum(Entry.Loc.getReg(), false);
-      if (Reg < 32) {
+      if (int Offset = Entry.Loc.getOffset()) {
+        // If the value is at a certain offset from frame register then
+        // use DW_OP_fbreg.
+        unsigned OffsetSize = Offset ? MCAsmInfo::getSLEB128Size(Offset) : 1;
         Asm->OutStreamer.AddComment("Loc expr size");
-        Asm->EmitInt16(1);
-        Asm->EmitInt8(dwarf::DW_OP_reg0 + Reg);
+        Asm->EmitInt16(1 + OffsetSize);
+        Asm->OutStreamer.AddComment(
+          dwarf::OperationEncodingString(dwarf::DW_OP_fbreg));
+        Asm->EmitInt8(dwarf::DW_OP_fbreg);
+        Asm->OutStreamer.AddComment("Offset");
+        Asm->EmitSLEB128(Offset);
       } else {
-        Asm->OutStreamer.AddComment("Loc expr size");
-        Asm->EmitInt16(1+MCAsmInfo::getULEB128Size(Reg));
-        Asm->EmitInt8(dwarf::DW_OP_regx);
-        Asm->EmitULEB128(Reg);
+        if (Reg < 32) {
+          Asm->OutStreamer.AddComment("Loc expr size");
+          Asm->EmitInt16(1);
+          Asm->OutStreamer.AddComment(
+            dwarf::OperationEncodingString(dwarf::DW_OP_reg0 + Reg));
+          Asm->EmitInt8(dwarf::DW_OP_reg0 + Reg);
+        } else {
+          Asm->OutStreamer.AddComment("Loc expr size");
+          Asm->EmitInt16(1 + MCAsmInfo::getULEB128Size(Reg));
+          Asm->EmitInt8(dwarf::DW_OP_regx);
+          Asm->EmitULEB128(Reg);
+        }
       }
     }
   }
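[Editor's note] Each .debug_loc entry above is prefixed by a two-byte length that must match the location expression exactly; the three cases emitted by this hunk work out to:

    // DW_OP_fbreg SLEB128(Offset) -> 1 + MCAsmInfo::getSLEB128Size(Offset) bytes
    // DW_OP_reg0 + Reg            -> 1 byte                        (Reg < 32)
    // DW_OP_regx ULEB128(Reg)     -> 1 + MCAsmInfo::getULEB128Size(Reg) bytes

A small oddity worth noting: OffsetSize is computed as "Offset ? getSLEB128Size(Offset) : 1", but that code is only reached when Offset is non-zero, so the ": 1" arm is dead -- harmless, just redundant.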
@@ -3661,7 +3731,7 @@ void DwarfDebug::emitDebugRanges() {
     Asm->getObjFileLowering().getDwarfRangesSection());
   unsigned char Size = Asm->getTargetData().getPointerSize();
   for (SmallVector<const MCSymbol *, 8>::iterator
-         I = DebugRangeSymbols.begin(), E = DebugRangeSymbols.end(); 
+         I = DebugRangeSymbols.begin(), E = DebugRangeSymbols.end();
        I != E; ++I) {
     if (*I)
       Asm->OutStreamer.EmitSymbolValue(const_cast<MCSymbol*>(*I), Size, 0);
@@ -3734,7 +3804,7 @@ void DwarfDebug::emitDebugInlineInfo() {
     if (LName.empty()) {
       Asm->OutStreamer.EmitBytes(Name, 0);
       Asm->OutStreamer.EmitIntValue(0, 1, 0); // nul terminator.
-    } else 
+    } else
       Asm->EmitSectionOffset(getStringPoolEntry(getRealLinkageName(LName)),
                              DwarfStrSectionSym);

diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 5a281c8517481..f0ff3bc71699a 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -261,7 +261,6 @@ class DwarfDebug {
   MCSymbol *DwarfFrameSectionSym, *DwarfInfoSectionSym, *DwarfAbbrevSectionSym;
   MCSymbol *DwarfStrSectionSym, *TextSectionSym, *DwarfDebugRangeSectionSym;
   MCSymbol *DwarfDebugLocSectionSym;
-  MCSymbol *DwarfDebugLineSectionSym, *CurrentLineSectionSym;
   MCSymbol *FunctionBeginSym, *FunctionEndSym;

   DIEInteger *DIEIntegerOne;
@@ -338,11 +337,11 @@ private:

   /// addSourceLine - Add location information to specified debug information
   /// entry.
-  void addSourceLine(DIE *Die, const DIVariable *V);
-  void addSourceLine(DIE *Die, const DIGlobalVariable *G);
-  void addSourceLine(DIE *Die, const DISubprogram *SP);
-  void addSourceLine(DIE *Die, const DIType *Ty);
-  void addSourceLine(DIE *Die, const DINameSpace *NS);
+  void addSourceLine(DIE *Die, DIVariable V);
+  void addSourceLine(DIE *Die, DIGlobalVariable G);
+  void addSourceLine(DIE *Die, DISubprogram SP);
+  void addSourceLine(DIE *Die, DIType Ty);
+  void addSourceLine(DIE *Die, DINameSpace NS);

   /// addAddress - Add an address attribute to a die based on the location
   /// provided.
@@ -376,6 +375,10 @@ private:
   void addBlockByrefAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
                             const MachineLocation &Location);

+  /// addVariableAddress - Add DW_AT_location attribute for a DbgVariable based
+  /// on provided frame index.
+  void addVariableAddress(DbgVariable *&DV, DIE *Die, int64_t FI);
+
   /// addToContextOwner - Add Die into the list of its context owner's children.
   void addToContextOwner(DIE *Die, DIDescriptor Context);

@@ -414,14 +417,11 @@ private:
   /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
   DIE *constructEnumTypeDIE(DIEnumerator ETy);

-  /// createGlobalVariableDIE - Create new DIE using GV.
-  DIE *createGlobalVariableDIE(const DIGlobalVariable &GV);
-
   /// createMemberDIE - Create new member DIE.
-  DIE *createMemberDIE(const DIDerivedType &DT);
+  DIE *createMemberDIE(DIDerivedType DT);

   /// createSubprogramDIE - Create new DIE using SP.
-  DIE *createSubprogramDIE(const DISubprogram &SP, bool MakeDecl = false);
+  DIE *createSubprogramDIE(DISubprogram SP, bool MakeDecl = false);

   /// getOrCreateDbgScope - Create DbgScope for the scope.
   DbgScope *getOrCreateDbgScope(const MDNode *Scope, const MDNode *InlinedAt);
@@ -560,12 +560,6 @@ private:
   /// construct SubprogramDIE - Construct subprogram DIE.
   void constructSubprogramDIE(const MDNode *N);

-  // FIXME: This should go away in favor of complex addresses.
-  /// Find the type the programmer originally declared the variable to be
-  /// and return that type. Obsolete, use GetComplexAddrType instead.
-  ///
-  DIType getBlockByrefType(DIType Ty, std::string Name);
-
   /// recordSourceLine - Register a source line with debug info. Returns the
   /// unique label that was emitted and which provides correspondence to
   /// the source line list.
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.cpp b/lib/CodeGen/AsmPrinter/DwarfException.cpp
index c87284083cded..86a368831e0e2 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfException.cpp
@@ -894,7 +894,7 @@ void DwarfException::EndModule() {
   if (!shouldEmitMovesModule && !shouldEmitTableModule)
     return;

-  const std::vector<const Function *> Personalities = MMI->getPersonalities();
+  const std::vector<const Function*> &Personalities = MMI->getPersonalities();

   for (unsigned I = 0, E = Personalities.size(); I < E; ++I)
     EmitCIE(Personalities[I], I);
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 7f98df0d22ea4..cb81aa3c88ce6 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -65,7 +65,7 @@ namespace {
   public:
     static char ID;
     explicit BranchFolderPass(bool defaultEnableTailMerge)
-      : MachineFunctionPass(&ID), BranchFolder(defaultEnableTailMerge) {}
+      : MachineFunctionPass(ID), BranchFolder(defaultEnableTailMerge) {}

     virtual bool runOnMachineFunction(MachineFunction &MF);
     virtual const char *getPassName() const { return "Control Flow Optimizer"; }
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index ffeff1ee27a64..2ef115dbd2056 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -22,6 +22,7 @@ add_llvm_library(LLVMCodeGen
   LiveIntervalAnalysis.cpp
   LiveStackAnalysis.cpp
   LiveVariables.cpp
+  LocalStackSlotAllocation.cpp
   LowerSubregs.cpp
   MachineBasicBlock.cpp
   MachineCSE.cpp
@@ -42,10 +43,10 @@ add_llvm_library(LLVMCodeGen
   MachineVerifier.cpp
   ObjectCodeEmitter.cpp
   OcamlGC.cpp
-  OptimizeExts.cpp
   OptimizePHIs.cpp
   PHIElimination.cpp
   Passes.cpp
+  PeepholeOptimizer.cpp
   PostRAHazardRecognizer.cpp
   PostRASchedulerList.cpp
   PreAllocSplitting.cpp
@@ -57,6 +58,7 @@ add_llvm_library(LLVMCodeGen
   RegAllocPBQP.cpp
   RegisterCoalescer.cpp
   RegisterScavenging.cpp
+  RenderMachineFunction.cpp
   ScheduleDAG.cpp
   ScheduleDAGEmit.cpp
   ScheduleDAGInstrs.cpp
@@ -67,6 +69,8 @@ add_llvm_library(LLVMCodeGen
   SjLjEHPrepare.cpp
   SlotIndexes.cpp
   Spiller.cpp
+  SplitKit.cpp
+  Splitter.cpp
   StackProtector.cpp
   StackSlotColoring.cpp
   StrongPHIElimination.cpp
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index 240a7b94fccfd..1b7e08a8b6bb8 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -25,8 +25,8 @@ using namespace llvm;

 char CalculateSpillWeights::ID = 0;
-static RegisterPass<CalculateSpillWeights> X("calcspillweights",
-                                             "Calculate spill weights");
+INITIALIZE_PASS(CalculateSpillWeights, "calcspillweights",
+                "Calculate spill weights", false, false);

 void CalculateSpillWeights::getAnalysisUsage(AnalysisUsage &au) const {
   au.addRequired<LiveIntervals>();
@@ -41,108 +41,184 @@ bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &fn) {
                << "********** Function: "
                << fn.getFunction()->getName() << '\n');

-  LiveIntervals *lis = &getAnalysis<LiveIntervals>();
-  MachineLoopInfo *loopInfo = &getAnalysis<MachineLoopInfo>();
-  const TargetInstrInfo *tii = fn.getTarget().getInstrInfo();
-  MachineRegisterInfo *mri = &fn.getRegInfo();
-
-  SmallSet<unsigned, 4> processed;
-  for (MachineFunction::iterator mbbi = fn.begin(), mbbe = fn.end();
-       mbbi != mbbe; ++mbbi) {
-    MachineBasicBlock* mbb = mbbi;
-    SlotIndex mbbEnd = lis->getMBBEndIdx(mbb);
-    MachineLoop* loop = loopInfo->getLoopFor(mbb);
-    unsigned loopDepth = loop ? loop->getLoopDepth() : 0;
-    bool isExiting = loop ? loop->isLoopExiting(mbb) : false;
-
-    for (MachineBasicBlock::const_iterator mii = mbb->begin(), mie = mbb->end();
-         mii != mie; ++mii) {
-      const MachineInstr *mi = mii;
-      if (tii->isIdentityCopy(*mi) || mi->isImplicitDef() || mi->isDebugValue())
-        continue;
-
-      for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
-        const MachineOperand &mopi = mi->getOperand(i);
-        if (!mopi.isReg() || mopi.getReg() == 0)
-          continue;
-        unsigned reg = mopi.getReg();
-        if (!TargetRegisterInfo::isVirtualRegister(mopi.getReg()))
-          continue;
-        // Multiple uses of reg by the same instruction. It should not
-        // contribute to spill weight again.
-        if (!processed.insert(reg))
-          continue;
-
-        bool hasDef = mopi.isDef();
-        bool hasUse = !hasDef;
-        for (unsigned j = i+1; j != e; ++j) {
-          const MachineOperand &mopj = mi->getOperand(j);
-          if (!mopj.isReg() || mopj.getReg() != reg)
-            continue;
-          hasDef |= mopj.isDef();
-          hasUse |= mopj.isUse();
-          if (hasDef && hasUse)
-            break;
-        }
-
-        LiveInterval &regInt = lis->getInterval(reg);
-        float weight = lis->getSpillWeight(hasDef, hasUse, loopDepth);
-        if (hasDef && isExiting) {
-          // Looks like this is a loop count variable update.
-          SlotIndex defIdx = lis->getInstructionIndex(mi).getDefIndex();
-          const LiveRange *dlr =
-            lis->getInterval(reg).getLiveRangeContaining(defIdx);
-          if (dlr->end >= mbbEnd)
-            weight *= 3.0F;
-        }
-        regInt.weight += weight;
-      }
-      processed.clear();
-    }
+  LiveIntervals &lis = getAnalysis<LiveIntervals>();
+  VirtRegAuxInfo vrai(fn, lis, getAnalysis<MachineLoopInfo>());
+  for (LiveIntervals::iterator I = lis.begin(), E = lis.end(); I != E; ++I) {
+    LiveInterval &li = *I->second;
+    if (TargetRegisterInfo::isVirtualRegister(li.reg))
+      vrai.CalculateWeightAndHint(li);
+  }
+  return false;
+}
+
+// Return the preferred allocation register for reg, given a COPY instruction.
+static unsigned copyHint(const MachineInstr *mi, unsigned reg,
+                         const TargetRegisterInfo &tri,
+                         const MachineRegisterInfo &mri) {
+  unsigned sub, hreg, hsub;
+  if (mi->getOperand(0).getReg() == reg) {
+    sub = mi->getOperand(0).getSubReg();
+    hreg = mi->getOperand(1).getReg();
+    hsub = mi->getOperand(1).getSubReg();
+  } else {
+    sub = mi->getOperand(1).getSubReg();
+    hreg = mi->getOperand(0).getReg();
+    hsub = mi->getOperand(0).getSubReg();
   }
-  for (LiveIntervals::iterator I = lis->begin(), E = lis->end(); I != E; ++I) {
-    LiveInterval &li = *I->second;
-    if (TargetRegisterInfo::isVirtualRegister(li.reg)) {
-      // If the live interval length is essentially zero, i.e. in every live
-      // range the use follows def immediately, it doesn't make sense to spill
-      // it and hope it will be easier to allocate for this li.
-      if (isZeroLengthInterval(&li)) {
-        li.weight = HUGE_VALF;
-        continue;
-      }
-
-      bool isLoad = false;
-      SmallVector<LiveInterval*, 4> spillIs;
-      if (lis->isReMaterializable(li, spillIs, isLoad)) {
-        // If all of the definitions of the interval are re-materializable,
-        // it is a preferred candidate for spilling. If none of the defs are
-        // loads, then it's potentially very cheap to re-materialize.
-        // FIXME: this gets much more complicated once we support non-trivial
-        // re-materialization.
-        if (isLoad)
-          li.weight *= 0.9F;
-        else
-          li.weight *= 0.5F;
-      }
-
-      // Slightly prefer live interval that has been assigned a preferred reg.
-      std::pair<unsigned, unsigned> Hint = mri->getRegAllocationHint(li.reg);
-      if (Hint.first || Hint.second)
-        li.weight *= 1.01F;
-
-      lis->normalizeSpillWeight(li);
+  if (!hreg)
+    return 0;
+
+  if (TargetRegisterInfo::isVirtualRegister(hreg))
+    return sub == hsub ? hreg : 0;
+
+  const TargetRegisterClass *rc = mri.getRegClass(reg);
+
+  // Only allow physreg hints in rc.
+  if (sub == 0)
+    return rc->contains(hreg) ? hreg : 0;
+
+  // reg:sub should match the physreg hreg.
+  return tri.getMatchingSuperReg(hreg, sub, rc);
+}
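[Editor's note] copyHint() answers: if this COPY were coalesced away, which register would li.reg like to be assigned? It normalizes the COPY so reg sits on one side; a virtual register on the other side only counts as a hint when the subregister indices agree, while a physical register must either already be a member of reg's class (no subreg involved) or be translated with getMatchingSuperReg so that hint:sub is the physreg actually copied. The caller's side, as used in CalculateWeightAndHint below, is simply:

    if (unsigned hint = copyHint(mi, li.reg, tri, mri))
      hint_[hint] += weight;  // accumulate per candidate; the best one wins later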
+
+void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
+  MachineRegisterInfo &mri = mf_.getRegInfo();
+  const TargetRegisterInfo &tri = *mf_.getTarget().getRegisterInfo();
+  MachineBasicBlock *mbb = 0;
+  MachineLoop *loop = 0;
+  unsigned loopDepth = 0;
+  bool isExiting = false;
+  float totalWeight = 0;
+  SmallPtrSet<MachineInstr*, 8> visited;
+
+  // Find the best physreg hist and the best virtreg hint.
+  float bestPhys = 0, bestVirt = 0;
+  unsigned hintPhys = 0, hintVirt = 0;
+
+  // Don't recompute a target specific hint.
+  bool noHint = mri.getRegAllocationHint(li.reg).first != 0;
+
+  for (MachineRegisterInfo::reg_iterator I = mri.reg_begin(li.reg);
+       MachineInstr *mi = I.skipInstruction();) {
+    if (mi->isIdentityCopy() || mi->isImplicitDef() || mi->isDebugValue())
+      continue;
+    if (!visited.insert(mi))
+      continue;
+
+    // Get loop info for mi.
+    if (mi->getParent() != mbb) {
+      mbb = mi->getParent();
+      loop = loops_.getLoopFor(mbb);
+      loopDepth = loop ? loop->getLoopDepth() : 0;
+      isExiting = loop ? loop->isLoopExiting(mbb) : false;
+    }
+
+    // Calculate instr weight.
+    bool reads, writes;
+    tie(reads, writes) = mi->readsWritesVirtualRegister(li.reg);
+    float weight = LiveIntervals::getSpillWeight(writes, reads, loopDepth);
+
+    // Give extra weight to what looks like a loop induction variable update.
+    if (writes && isExiting && lis_.isLiveOutOfMBB(li, mbb))
+      weight *= 3;
+
+    totalWeight += weight;
+
+    // Get allocation hints from copies.
+    if (noHint || !mi->isCopy())
+      continue;
+    unsigned hint = copyHint(mi, li.reg, tri, mri);
+    if (!hint)
+      continue;
+    float hweight = hint_[hint] += weight;
+    if (TargetRegisterInfo::isPhysicalRegister(hint)) {
+      if (hweight > bestPhys && lis_.isAllocatable(hint))
+        bestPhys = hweight, hintPhys = hint;
+    } else {
+      if (hweight > bestVirt)
+        bestVirt = hweight, hintVirt = hint;
     }
   }
-  
-  return false;
+
+  hint_.clear();
+
+  // Always prefer the physreg hint.
+  if (unsigned hint = hintPhys ? hintPhys : hintVirt) {
+    mri.setRegAllocationHint(li.reg, 0, hint);
+    // Weakly boost the spill weifght of hinted registers.
+    totalWeight *= 1.01F;
+  }
+
+  // Mark li as unspillable if all live ranges are tiny.
+  if (li.isZeroLength()) {
+    li.markNotSpillable();
+    return;
+  }
+
+  // If all of the definitions of the interval are re-materializable,
+  // it is a preferred candidate for spilling. If none of the defs are
+  // loads, then it's potentially very cheap to re-materialize.
+  // FIXME: this gets much more complicated once we support non-trivial
+  // re-materialization.
+  bool isLoad = false;
+  SmallVector<LiveInterval*, 4> spillIs;
+  if (lis_.isReMaterializable(li, spillIs, isLoad)) {
+    if (isLoad)
+      totalWeight *= 0.9F;
+    else
+      totalWeight *= 0.5F;
+  }
+
+  li.weight = totalWeight;
+  lis_.normalizeSpillWeight(li);
 }
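[Editor's note] Pulling the scattered multipliers in CalculateWeightAndHint together, the resulting spill weight is, informally:

    totalWeight = sum over visited uses/defs of
                    getSpillWeight(writes, reads, loopDepth)  // per instruction
                      * 3 when a write is live out of a loop-exiting block
    totalWeight *= 1.01  if a copy hint was recorded
    totalWeight *= 0.9   if rematerializable via a load (0.5 if without any load)

then normalizeSpillWeight() is applied, and zero-length intervals bypass all of this via markNotSpillable(). (An informal restatement of the code above, not a specification.) Note also that "weifght" and "physreg hist" in the new comments are typos in the committed patch itself, preserved verbatim here.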

-/// Returns true if the given live interval is zero length.
-bool CalculateSpillWeights::isZeroLengthInterval(LiveInterval *li) const {
-  for (LiveInterval::Ranges::const_iterator
-         i = li->ranges.begin(), e = li->ranges.end(); i != e; ++i)
-    if (i->end.getPrevIndex() > i->start)
-      return false;
-  return true;
+void VirtRegAuxInfo::CalculateRegClass(unsigned reg) {
+  MachineRegisterInfo &mri = mf_.getRegInfo();
+  const TargetRegisterInfo *tri = mf_.getTarget().getRegisterInfo();
+  const TargetRegisterClass *orc = mri.getRegClass(reg);
+  SmallPtrSet<const TargetRegisterClass*,8> rcs;
+
+  for (MachineRegisterInfo::reg_nodbg_iterator I = mri.reg_nodbg_begin(reg),
+       E = mri.reg_nodbg_end(); I != E; ++I) {
+    // The targets don't have accurate enough regclass descriptions that we can
+    // handle subregs. We need something similar to
+    // TRI::getMatchingSuperRegClass, but returning a super class instead of a
+    // sub class.
+    if (I.getOperand().getSubReg()) {
+      DEBUG(dbgs() << "Cannot handle subregs: " << I.getOperand() << '\n');
+      return;
+    }
+    if (const TargetRegisterClass *rc =
+                                I->getDesc().getRegClass(I.getOperandNo(), tri))
+      rcs.insert(rc);
+  }
+
+  // If we found no regclass constraints, just leave reg as is.
+  // In theory, we could inflate to the largest superclass of reg's existing
+  // class, but that might not be legal for the current cpu setting.
+  // This could happen if reg is only used by COPY instructions, so we may need
+  // to improve on this.
+  if (rcs.empty()) {
+    return;
+  }
+
+  // Compute the intersection of all classes in rcs.
+  // This ought to be independent of iteration order, but if the target register
+  // classes don't form a proper algebra, it is possible to get different
+  // results. The solution is to make sure the intersection of any two register
+  // classes is also a register class or the null set.
+  const TargetRegisterClass *rc = 0;
+  for (SmallPtrSet<const TargetRegisterClass*,8>::iterator I = rcs.begin(),
+       E = rcs.end(); I != E; ++I) {
+    rc = rc ? getCommonSubClass(rc, *I) : *I;
+    assert(rc && "Incompatible regclass constraints found");
+  }
+
+  if (rc == orc)
+    return;
+  DEBUG(dbgs() << "Inflating " << orc->getName() << ":%reg" << reg << " to "
+        << rc->getName() <<".\n");
+  mri.setRegClass(reg, rc);
 }
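[Editor's note] CalculateRegClass() narrows (or confirms) a virtual register's class by folding getCommonSubClass over every operand constraint, bailing out on subregister uses and on registers with no constraints at all. An illustrative run, using x86 class names purely as an example: if reg currently lives in GR32 but one instruction requires GR32_ABCD, the fold yields GR32_ABCD and setRegClass records it; if two constraints were disjoint, the assert would fire -- which is exactly why the comment insists that pairwise intersections of register classes must themselves be register classes or empty.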
DefIndices[Reg] = InsertPosIndex; @@ -325,6 +330,8 @@ CriticalAntiDepBreaker::findSuitableFreeRegister(MachineInstr *MI, for (TargetRegisterClass::iterator R = RC->allocation_order_begin(MF), RE = RC->allocation_order_end(MF); R != RE; ++R) { unsigned NewReg = *R; + // Don't consider non-allocatable registers + if (!AllocatableSet.test(NewReg)) continue; // Don't replace a register with itself. if (NewReg == AntiDepReg) continue; // Don't replace a register with one that was recently used to repair @@ -433,7 +440,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, // fix that remaining critical edge too. This is a little more involved, // because unlike the most recent register, less recent registers should // still be considered, though only if no other registers are available. - unsigned LastNewReg[TargetRegisterInfo::FirstVirtualRegister] = {}; + std::vector<unsigned> LastNewReg(TRI->getNumRegs(), 0); // Attempt to break anti-dependence edges on the critical path. Walk the // instructions from the bottom up, tracking information about liveness diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h index 540630083bcc6..0ed7c35b0f0ca 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.h +++ b/lib/CodeGen/CriticalAntiDepBreaker.h @@ -46,19 +46,18 @@ class TargetRegisterInfo; /// corresponding value is null. If the register is live but used in /// multiple register classes, the corresponding value is -1 cast to a /// pointer. - const TargetRegisterClass * - Classes[TargetRegisterInfo::FirstVirtualRegister]; + std::vector<const TargetRegisterClass*> Classes; /// RegRefs - Map registers to all their references within a live range. std::multimap<unsigned, MachineOperand *> RegRefs; /// KillIndices - The index of the most recent kill (preceding bottom-up), /// or ~0u if the register is not live. - unsigned KillIndices[TargetRegisterInfo::FirstVirtualRegister]; + std::vector<unsigned> KillIndices; /// DefIndices - The index of the most recent complete def (preceding bottom /// up), or ~0u if the register is live. - unsigned DefIndices[TargetRegisterInfo::FirstVirtualRegister]; + std::vector<unsigned> DefIndices; /// KeepRegs - A set of registers which are live and cannot be changed to /// break anti-dependencies. diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp index d69c995b3e037..318d922adebf1 100644 --- a/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -36,7 +36,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - DeadMachineInstructionElim() : MachineFunctionPass(&ID) {} + DeadMachineInstructionElim() : MachineFunctionPass(ID) {} private: bool isDead(const MachineInstr *MI) const; @@ -44,9 +44,8 @@ namespace { } char DeadMachineInstructionElim::ID = 0; -static RegisterPass<DeadMachineInstructionElim> -Y("dead-mi-elimination", - "Remove dead machine instructions"); +INITIALIZE_PASS(DeadMachineInstructionElim, "dead-mi-elimination", + "Remove dead machine instructions", false, false); FunctionPass *llvm::createDeadMachineInstructionElimPass() { return new DeadMachineInstructionElim(); } @@ -81,9 +80,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getTarget().getRegisterInfo(); TII = MF.getTarget().getInstrInfo(); - // Compute a bitvector to represent all non-allocatable physregs. 
- BitVector NonAllocatableRegs = TRI->getAllocatableSet(MF); - NonAllocatableRegs.flip(); + // Treat reserved registers as always live. + BitVector ReservedRegs = TRI->getReservedRegs(MF); // Loop over all instructions in all blocks, from bottom to top, so that it's // more likely that chains of dependent but ultimately dead instructions will @@ -92,9 +90,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { I != E; ++I) { MachineBasicBlock *MBB = &*I; - // Start out assuming that all non-allocatable registers are live - // out of this block. - LivePhysRegs = NonAllocatableRegs; + // Start out assuming that reserved registers are live out of this block. + LivePhysRegs = ReservedRegs; // Also add any explicit live-out physregs for this block. if (!MBB->empty() && MBB->back().getDesc().isReturn()) @@ -105,6 +102,10 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { LivePhysRegs.set(Reg); } + // FIXME: Add live-ins from successors to LivePhysRegs. Normally, physregs + // are not live across blocks, but some targets (x86) can have flags live + // out of a block. + // Now scan the instructions and delete dead ones, tracking physreg // liveness as we go. for (MachineBasicBlock::reverse_iterator MII = MBB->rbegin(), diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index 01b31b4209318..550fd3e25fb7a 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -25,19 +25,17 @@ #include "llvm/Support/CallSite.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; STATISTIC(NumLandingPadsSplit, "Number of landing pads split"); STATISTIC(NumUnwindsLowered, "Number of unwind instructions lowered"); STATISTIC(NumExceptionValuesMoved, "Number of eh.exception calls moved"); -STATISTIC(NumStackTempsIntroduced, "Number of stack temporaries introduced"); namespace { class DwarfEHPrepare : public FunctionPass { const TargetMachine *TM; const TargetLowering *TLI; - bool CompileFast; // The eh.exception intrinsic. Function *ExceptionValueIntrinsic; @@ -54,9 +52,8 @@ namespace { // _Unwind_Resume or the target equivalent. Constant *RewindFunction; - // Dominator info is used when turning stack temporaries into registers. + // We both use and preserve dominator info. DominatorTree *DT; - DominanceFrontier *DF; // The function we are running on. Function *F; @@ -65,28 +62,14 @@ namespace { typedef SmallPtrSet<BasicBlock*, 8> BBSet; BBSet LandingPads; - // Stack temporary used to hold eh.exception values. - AllocaInst *ExceptionValueVar; - bool NormalizeLandingPads(); bool LowerUnwinds(); bool MoveExceptionValueCalls(); - bool FinishStackTemporaries(); - bool PromoteStackTemporaries(); Instruction *CreateExceptionValueCall(BasicBlock *BB); - Instruction *CreateValueLoad(BasicBlock *BB); - - /// CreateReadOfExceptionValue - Return the result of the eh.exception - /// intrinsic by calling the intrinsic if in a landing pad, or loading it - /// from the exception value variable otherwise. - Instruction *CreateReadOfExceptionValue(BasicBlock *BB) { - return LandingPads.count(BB) ? 
- CreateExceptionValueCall(BB) : CreateValueLoad(BB); - } /// CleanupSelectors - Any remaining eh.selector intrinsic calls which still - /// use the ".llvm.eh.catch.all.value" call need to convert to using its + /// use the "llvm.eh.catch.all.value" call need to convert to using its /// initializer instead. bool CleanupSelectors(SmallPtrSet<IntrinsicInst*, 32> &Sels); @@ -112,69 +95,19 @@ namespace { bool FindSelectorAndURoR(Instruction *Inst, bool &URoRInvoke, SmallPtrSet<IntrinsicInst*, 8> &SelCalls); - /// DoMem2RegPromotion - Take an alloca call and promote it from memory to a - /// register. - bool DoMem2RegPromotion(Value *V) { - AllocaInst *AI = dyn_cast<AllocaInst>(V); - if (!AI || !isAllocaPromotable(AI)) return false; - - // Turn the alloca into a register. - std::vector<AllocaInst*> Allocas(1, AI); - PromoteMemToReg(Allocas, *DT, *DF); - return true; - } - - /// PromoteStoreInst - Perform Mem2Reg on a StoreInst. - bool PromoteStoreInst(StoreInst *SI) { - if (!SI || !DT || !DF) return false; - if (DoMem2RegPromotion(SI->getOperand(1))) - return true; - return false; - } - - /// PromoteEHPtrStore - Promote the storing of an EH pointer into a - /// register. This should get rid of the store and subsequent loads. - bool PromoteEHPtrStore(IntrinsicInst *II) { - if (!DT || !DF) return false; - - bool Changed = false; - StoreInst *SI; - - while (1) { - SI = 0; - for (Value::use_iterator - I = II->use_begin(), E = II->use_end(); I != E; ++I) { - SI = dyn_cast<StoreInst>(I); - if (SI) break; - } - - if (!PromoteStoreInst(SI)) - break; - - Changed = true; - } - - return Changed; - } - public: static char ID; // Pass identification, replacement for typeid. - DwarfEHPrepare(const TargetMachine *tm, bool fast) : - FunctionPass(&ID), TM(tm), TLI(TM->getTargetLowering()), - CompileFast(fast), + DwarfEHPrepare(const TargetMachine *tm) : + FunctionPass(ID), TM(tm), TLI(TM->getTargetLowering()), ExceptionValueIntrinsic(0), SelectorIntrinsic(0), URoR(0), EHCatchAllValue(0), RewindFunction(0) {} virtual bool runOnFunction(Function &Fn); - // getAnalysisUsage - We need dominance frontiers for memory promotion. + // getAnalysisUsage - We need the dominator tree for handling URoR. 
virtual void getAnalysisUsage(AnalysisUsage &AU) const { - if (!CompileFast) - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTree>(); AU.addPreserved<DominatorTree>(); - if (!CompileFast) - AU.addRequired<DominanceFrontier>(); - AU.addPreserved<DominanceFrontier>(); } const char *getPassName() const { @@ -186,8 +119,8 @@ namespace { char DwarfEHPrepare::ID = 0; -FunctionPass *llvm::createDwarfEHPass(const TargetMachine *tm, bool fast) { - return new DwarfEHPrepare(tm, fast); +FunctionPass *llvm::createDwarfEHPass(const TargetMachine *tm) { + return new DwarfEHPrepare(tm); } /// HasCatchAllInSelector - Return true if the intrinsic instruction has a @@ -207,7 +140,7 @@ FindAllCleanupSelectors(SmallPtrSet<IntrinsicInst*, 32> &Sels, for (Value::use_iterator I = SelectorIntrinsic->use_begin(), E = SelectorIntrinsic->use_end(); I != E; ++I) { - IntrinsicInst *II = cast<IntrinsicInst>(I); + IntrinsicInst *II = cast<IntrinsicInst>(*I); if (II->getParent()->getParent() != F) continue; @@ -225,13 +158,13 @@ FindAllURoRInvokes(SmallPtrSet<InvokeInst*, 32> &URoRInvokes) { for (Value::use_iterator I = URoR->use_begin(), E = URoR->use_end(); I != E; ++I) { - if (InvokeInst *II = dyn_cast<InvokeInst>(I)) + if (InvokeInst *II = dyn_cast<InvokeInst>(*I)) URoRInvokes.insert(II); } } /// CleanupSelectors - Any remaining eh.selector intrinsic calls which still use -/// the ".llvm.eh.catch.all.value" call need to convert to using its +/// the "llvm.eh.catch.all.value" call need to convert to using its /// initializer instead. bool DwarfEHPrepare::CleanupSelectors(SmallPtrSet<IntrinsicInst*, 32> &Sels) { if (!EHCatchAllValue) return false; @@ -247,7 +180,7 @@ bool DwarfEHPrepare::CleanupSelectors(SmallPtrSet<IntrinsicInst*, 32> &Sels) { I = Sels.begin(), E = Sels.end(); I != E; ++I) { IntrinsicInst *Sel = *I; - // Index of the ".llvm.eh.catch.all.value" variable. + // Index of the "llvm.eh.catch.all.value" variable. unsigned OpIdx = Sel->getNumArgOperands() - 1; GlobalVariable *GV = dyn_cast<GlobalVariable>(Sel->getArgOperand(OpIdx)); if (GV != EHCatchAllValue) continue; @@ -268,10 +201,9 @@ DwarfEHPrepare::FindSelectorAndURoR(Instruction *Inst, bool &URoRInvoke, SmallPtrSet<PHINode*, 32> SeenPHIs; bool Changed = false; - restart: for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end(); I != E; ++I) { - Instruction *II = dyn_cast<Instruction>(I); + Instruction *II = dyn_cast<Instruction>(*I); if (!II || II->getParent()->getParent() != F) continue; if (IntrinsicInst *Sel = dyn_cast<IntrinsicInst>(II)) { @@ -282,11 +214,6 @@ DwarfEHPrepare::FindSelectorAndURoR(Instruction *Inst, bool &URoRInvoke, URoRInvoke = true; } else if (CastInst *CI = dyn_cast<CastInst>(II)) { Changed |= FindSelectorAndURoR(CI, URoRInvoke, SelCalls); - } else if (StoreInst *SI = dyn_cast<StoreInst>(II)) { - if (!PromoteStoreInst(SI)) continue; - Changed = true; - SeenPHIs.clear(); - goto restart; // Uses may have changed, restart loop. } else if (PHINode *PN = dyn_cast<PHINode>(II)) { if (SeenPHIs.insert(PN)) // Don't process a PHI node more than once. 
@@ -304,7 +231,7 @@ DwarfEHPrepare::FindSelectorAndURoR(Instruction *Inst, bool &URoRInvoke, bool DwarfEHPrepare::HandleURoRInvokes() { if (!EHCatchAllValue) { EHCatchAllValue = - F->getParent()->getNamedGlobal(".llvm.eh.catch.all.value"); + F->getParent()->getNamedGlobal("llvm.eh.catch.all.value"); if (!EHCatchAllValue) return false; } @@ -318,10 +245,6 @@ bool DwarfEHPrepare::HandleURoRInvokes() { SmallPtrSet<IntrinsicInst*, 32> CatchAllSels; FindAllCleanupSelectors(Sels, CatchAllSels); - if (!DT) - // We require DominatorTree information. - return CleanupSelectors(CatchAllSels); - if (!URoR) { URoR = F->getParent()->getFunction("_Unwind_Resume_or_Rethrow"); if (!URoR) return CleanupSelectors(CatchAllSels); @@ -338,7 +261,7 @@ bool DwarfEHPrepare::HandleURoRInvokes() { for (SmallPtrSet<InvokeInst*, 32>::iterator UI = URoRInvokes.begin(), UE = URoRInvokes.end(); UI != UE; ++UI) { const BasicBlock *URoRBB = (*UI)->getParent(); - if (SelBB == URoRBB || DT->dominates(SelBB, URoRBB)) { + if (DT->dominates(SelBB, URoRBB)) { SelsToConvert.insert(*SI); break; } @@ -360,11 +283,9 @@ bool DwarfEHPrepare::HandleURoRInvokes() { for (Value::use_iterator I = ExceptionValueIntrinsic->use_begin(), E = ExceptionValueIntrinsic->use_end(); I != E; ++I) { - IntrinsicInst *EHPtr = dyn_cast<IntrinsicInst>(I); + IntrinsicInst *EHPtr = dyn_cast<IntrinsicInst>(*I); if (!EHPtr || EHPtr->getParent()->getParent() != F) continue; - Changed |= PromoteEHPtrStore(EHPtr); - bool URoRInvoke = false; SmallPtrSet<IntrinsicInst*, 8> SelCalls; Changed |= FindSelectorAndURoR(EHPtr, URoRInvoke, SelCalls); @@ -532,11 +453,8 @@ bool DwarfEHPrepare::NormalizeLandingPads() { // Add a fallthrough from NewBB to the original landing pad. BranchInst::Create(LPad, NewBB); - // Now update DominatorTree and DominanceFrontier analysis information. - if (DT) - DT->splitBlock(NewBB); - if (DF) - DF->splitBlock(NewBB); + // Now update DominatorTree analysis information. + DT->splitBlock(NewBB); // Remember the newly constructed landing pad. The original landing pad // LPad is no longer a landing pad now that all unwind edges have been @@ -586,7 +504,7 @@ bool DwarfEHPrepare::LowerUnwinds() { // Create the call... CallInst *CI = CallInst::Create(RewindFunction, - CreateReadOfExceptionValue(TI->getParent()), + CreateExceptionValueCall(TI->getParent()), "", TI); CI->setCallingConv(TLI->getLibcallCallingConv(RTLIB::UNWIND_RESUME)); // ...followed by an UnreachableInst. @@ -602,9 +520,11 @@ bool DwarfEHPrepare::LowerUnwinds() { } /// MoveExceptionValueCalls - Ensure that eh.exception is only ever called from -/// landing pads by replacing calls outside of landing pads with loads from a -/// stack temporary. Move eh.exception calls inside landing pads to the start -/// of the landing pad (optional, but may make things simpler for later passes). +/// landing pads by replacing calls outside of landing pads with direct use of +/// a register holding the appropriate value; this requires adding calls inside +/// all landing pads to initialize the register. Also, move eh.exception calls +/// inside landing pads to the start of the landing pad (optional, but may make +/// things simpler for later passes). bool DwarfEHPrepare::MoveExceptionValueCalls() { // If the eh.exception intrinsic is not declared in the module then there is // nothing to do. Speed up compilation by checking for this common case. 
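The rewritten MoveExceptionValueCalls in the next hunk replaces the old stack-temporary machinery with LLVM's SSAUpdater. A minimal sketch of that pattern, assuming two hypothetical hooks that are not part of this patch (makeValueIn materializes the value in a defining block, much as CreateExceptionValueCall does for landing pads; useValue stands in for rewriting a use):

    #include "llvm/Function.h"
    #include "llvm/Instructions.h"
    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/Transforms/Utils/SSAUpdater.h"
    using namespace llvm;

    static Value *makeValueIn(BasicBlock *BB); // hypothetical hook
    static void useValue(Value *V);            // hypothetical hook

    static void rewriteAcrossBlocks(Function &F, Instruction *Proto,
                                    SmallPtrSet<BasicBlock*, 8> &DefBlocks) {
      SSAUpdater SSA;
      // Declare the type and name of the value being rewritten.
      SSA.Initialize(Proto->getType(), Proto->getName());

      // Register one definition per defining block (here: per landing pad).
      for (SmallPtrSet<BasicBlock*, 8>::iterator I = DefBlocks.begin(),
           E = DefBlocks.end(); I != E; ++I)
        SSA.AddAvailableValue(*I, makeValueIn(*I));

      // Any other block can now ask for the value that is live-out of its
      // predecessors; SSAUpdater inserts PHI nodes at join points as needed.
      for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB)
        if (!DefBlocks.count(BB))
          useValue(SSA.GetValueAtEndOfBlock(BB));
    }

Because SSAUpdater computes reaching definitions and places PHI nodes itself, the pass no longer needs DominanceFrontier or the mem2reg promotion it previously ran.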
@@ -614,61 +534,87 @@ bool DwarfEHPrepare::MoveExceptionValueCalls() { bool Changed = false; - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) - if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++)) - if (CI->getIntrinsicID() == Intrinsic::eh_exception) { - if (!CI->use_empty()) { - Value *ExceptionValue = CreateReadOfExceptionValue(BB); - if (CI == ExceptionValue) { - // The call was at the start of a landing pad - leave it alone. - assert(LandingPads.count(BB) && - "Created eh.exception call outside landing pad!"); - continue; - } - CI->replaceAllUsesWith(ExceptionValue); - } - CI->eraseFromParent(); - ++NumExceptionValuesMoved; - Changed = true; + // Move calls to eh.exception that are inside a landing pad to the start of + // the landing pad. + for (BBSet::const_iterator LI = LandingPads.begin(), LE = LandingPads.end(); + LI != LE; ++LI) { + BasicBlock *LP = *LI; + for (BasicBlock::iterator II = LP->getFirstNonPHIOrDbg(), IE = LP->end(); + II != IE;) + if (EHExceptionInst *EI = dyn_cast<EHExceptionInst>(II++)) { + // Found a call to eh.exception. + if (!EI->use_empty()) { + // If there is already a call to eh.exception at the start of the + // landing pad, then get hold of it; otherwise create such a call. + Value *CallAtStart = CreateExceptionValueCall(LP); + + // If the call was at the start of a landing pad then leave it alone. + if (EI == CallAtStart) + continue; + EI->replaceAllUsesWith(CallAtStart); } + EI->eraseFromParent(); + ++NumExceptionValuesMoved; + Changed = true; + } } - return Changed; -} - -/// FinishStackTemporaries - If we introduced a stack variable to hold the -/// exception value then initialize it in each landing pad. -bool DwarfEHPrepare::FinishStackTemporaries() { - if (!ExceptionValueVar) - // Nothing to do. - return false; + // Look for calls to eh.exception that are not in a landing pad. If one is + // found, then a register that holds the exception value will be created in + // each landing pad, and the SSAUpdater will be used to compute the values + // returned by eh.exception calls outside of landing pads. + SSAUpdater SSA; + + // Remember where we found the eh.exception call, to avoid rescanning earlier + // basic blocks which we already know contain no eh.exception calls. + bool FoundCallOutsideLandingPad = false; + Function::iterator BB = F->begin(); + for (Function::iterator BE = F->end(); BB != BE; ++BB) { + // Skip over landing pads. + if (LandingPads.count(BB)) + continue; - bool Changed = false; + for (BasicBlock::iterator II = BB->getFirstNonPHIOrDbg(), IE = BB->end(); + II != IE; ++II) + if (isa<EHExceptionInst>(II)) { + SSA.Initialize(II->getType(), II->getName()); + FoundCallOutsideLandingPad = true; + break; + } - // Make sure that there is a store of the exception value at the start of - // each landing pad. - for (BBSet::iterator LI = LandingPads.begin(), LE = LandingPads.end(); - LI != LE; ++LI) { - Instruction *ExceptionValue = CreateReadOfExceptionValue(*LI); - Instruction *Store = new StoreInst(ExceptionValue, ExceptionValueVar); - Store->insertAfter(ExceptionValue); - Changed = true; + if (FoundCallOutsideLandingPad) + break; } - return Changed; -} + // If all calls to eh.exception are in landing pads then we are done. + if (!FoundCallOutsideLandingPad) + return Changed; -/// PromoteStackTemporaries - Turn any stack temporaries we introduced into -/// registers if possible. 
-bool DwarfEHPrepare::PromoteStackTemporaries() { - if (ExceptionValueVar && DT && DF && isAllocaPromotable(ExceptionValueVar)) { - // Turn the exception temporary into registers and phi nodes if possible. - std::vector<AllocaInst*> Allocas(1, ExceptionValueVar); - PromoteMemToReg(Allocas, *DT, *DF); - return true; + // Add a call to eh.exception at the start of each landing pad, and tell the + // SSAUpdater that this is the value produced by the landing pad. + for (BBSet::iterator LI = LandingPads.begin(), LE = LandingPads.end(); + LI != LE; ++LI) + SSA.AddAvailableValue(*LI, CreateExceptionValueCall(*LI)); + + // Now turn all calls to eh.exception that are not in a landing pad into a use + // of the appropriate register. + for (Function::iterator BE = F->end(); BB != BE; ++BB) { + // Skip over landing pads. + if (LandingPads.count(BB)) + continue; + + for (BasicBlock::iterator II = BB->getFirstNonPHIOrDbg(), IE = BB->end(); + II != IE;) + if (EHExceptionInst *EI = dyn_cast<EHExceptionInst>(II++)) { + // Found a call to eh.exception, replace it with the value from any + // upstream landing pad(s). + EI->replaceAllUsesWith(SSA.GetValueAtEndOfBlock(BB)); + EI->eraseFromParent(); + ++NumExceptionValuesMoved; + } } - return false; + + return true; } /// CreateExceptionValueCall - Insert a call to the eh.exception intrinsic at @@ -691,36 +637,11 @@ Instruction *DwarfEHPrepare::CreateExceptionValueCall(BasicBlock *BB) { return CallInst::Create(ExceptionValueIntrinsic, "eh.value.call", Start); } -/// CreateValueLoad - Insert a load of the exception value stack variable -/// (creating it if necessary) at the start of the basic block (unless -/// there already is a load, in which case the existing load is returned). -Instruction *DwarfEHPrepare::CreateValueLoad(BasicBlock *BB) { - Instruction *Start = BB->getFirstNonPHIOrDbg(); - // Is this a load of the exception temporary? - if (ExceptionValueVar) - if (LoadInst* LI = dyn_cast<LoadInst>(Start)) - if (LI->getPointerOperand() == ExceptionValueVar) - // Reuse the existing load. - return Start; - - // Create the temporary if we didn't already. - if (!ExceptionValueVar) { - ExceptionValueVar = new AllocaInst(PointerType::getUnqual( - Type::getInt8Ty(BB->getContext())), "eh.value", F->begin()->begin()); - ++NumStackTempsIntroduced; - } - - // Load the value. - return new LoadInst(ExceptionValueVar, "eh.value.load", Start); -} - bool DwarfEHPrepare::runOnFunction(Function &Fn) { bool Changed = false; // Initialize internal state. - DT = getAnalysisIfAvailable<DominatorTree>(); - DF = getAnalysisIfAvailable<DominanceFrontier>(); - ExceptionValueVar = 0; + DT = &getAnalysis<DominatorTree>(); F = &Fn; // Ensure that only unwind edges end at landing pads (a landing pad is a @@ -735,13 +656,6 @@ bool DwarfEHPrepare::runOnFunction(Function &Fn) { // Move eh.exception calls to landing pads. Changed |= MoveExceptionValueCalls(); - // Initialize any stack temporaries we introduced. - Changed |= FinishStackTemporaries(); - - // Turn any stack temporaries into registers if possible. 
- if (!CompileFast) - Changed |= PromoteStackTemporaries(); - Changed |= HandleURoRInvokes(); LandingPads.clear(); diff --git a/lib/CodeGen/ELF.h b/lib/CodeGen/ELF.h index cb5a8c0eae1d3..fb884c9e8b712 100644 --- a/lib/CodeGen/ELF.h +++ b/lib/CodeGen/ELF.h @@ -22,36 +22,12 @@ #include "llvm/CodeGen/BinaryObject.h" #include "llvm/CodeGen/MachineRelocation.h" +#include "llvm/Support/ELF.h" #include "llvm/System/DataTypes.h" namespace llvm { class GlobalValue; - // Identification Indexes - enum { - EI_MAG0 = 0, - EI_MAG1 = 1, - EI_MAG2 = 2, - EI_MAG3 = 3 - }; - - // File types - enum { - ET_NONE = 0, // No file type - ET_REL = 1, // Relocatable file - ET_EXEC = 2, // Executable file - ET_DYN = 3, // Shared object file - ET_CORE = 4, // Core file - ET_LOPROC = 0xff00, // Beginning of processor-specific codes - ET_HIPROC = 0xffff // Processor-specific - }; - - // Versioning - enum { - EV_NONE = 0, - EV_CURRENT = 1 - }; - /// ELFSym - This struct contains information about each symbol that is /// added to the logical symbol table for the module. This is eventually /// turned into a real symbol table in the file. @@ -108,9 +84,9 @@ namespace llvm { static ELFSym *getExtSym(const char *Ext) { ELFSym *Sym = new ELFSym(); Sym->Source.Ext = Ext; - Sym->setBind(STB_GLOBAL); - Sym->setType(STT_NOTYPE); - Sym->setVisibility(STV_DEFAULT); + Sym->setBind(ELF::STB_GLOBAL); + Sym->setType(ELF::STT_NOTYPE); + Sym->setVisibility(ELF::STV_DEFAULT); Sym->SourceType = isExtSym; return Sym; } @@ -118,9 +94,9 @@ namespace llvm { // getSectionSym - Returns an elf symbol to represent an elf section static ELFSym *getSectionSym() { ELFSym *Sym = new ELFSym(); - Sym->setBind(STB_LOCAL); - Sym->setType(STT_SECTION); - Sym->setVisibility(STV_DEFAULT); + Sym->setBind(ELF::STB_LOCAL); + Sym->setType(ELF::STT_SECTION); + Sym->setVisibility(ELF::STV_DEFAULT); Sym->SourceType = isOther; return Sym; } @@ -128,9 +104,9 @@ namespace llvm { // getFileSym - Returns an elf symbol to represent the module identifier static ELFSym *getFileSym() { ELFSym *Sym = new ELFSym(); - Sym->setBind(STB_LOCAL); - Sym->setType(STT_FILE); - Sym->setVisibility(STV_DEFAULT); + Sym->setBind(ELF::STB_LOCAL); + Sym->setType(ELF::STT_FILE); + Sym->setVisibility(ELF::STV_DEFAULT); Sym->SectionIdx = 0xfff1; // ELFSection::SHN_ABS; Sym->SourceType = isOther; return Sym; } @@ -141,8 +117,8 @@ namespace llvm { ELFSym *Sym = new ELFSym(); Sym->Source.GV = GV; Sym->setBind(Bind); - Sym->setType(STT_NOTYPE); - Sym->setVisibility(STV_DEFAULT); + Sym->setType(ELF::STT_NOTYPE); + Sym->setVisibility(ELF::STV_DEFAULT); Sym->SectionIdx = 0; //ELFSection::SHN_UNDEF; Sym->SourceType = isGV; return Sym; } @@ -159,35 +135,14 @@ namespace llvm { // Symbol index into the Symbol table unsigned SymTabIdx; - enum { - STB_LOCAL = 0, // Local sym, not visible outside obj file containing def - STB_GLOBAL = 1, // Global sym, visible to all object files being combined - STB_WEAK = 2 // Weak symbol, like global but lower-precedence - }; - - enum { - STT_NOTYPE = 0, // Symbol's type is not specified - STT_OBJECT = 1, // Symbol is a data object (variable, array, etc.) - STT_FUNC = 2, // Symbol is executable code (function, etc.) 
- STT_SECTION = 3, // Symbol refers to a section - STT_FILE = 4 // Local, absolute symbol that refers to a file - }; - - enum { - STV_DEFAULT = 0, // Visibility is specified by binding type - STV_INTERNAL = 1, // Defined by processor supplements - STV_HIDDEN = 2, // Not visible to other components - STV_PROTECTED = 3 // Visible in other components but not preemptable - }; - ELFSym() : SourceType(isOther), NameIdx(0), Value(0), - Size(0), Info(0), Other(STV_DEFAULT), SectionIdx(0), + Size(0), Info(0), Other(ELF::STV_DEFAULT), SectionIdx(0), SymTabIdx(0) {} unsigned getBind() const { return (Info >> 4) & 0xf; } unsigned getType() const { return Info & 0xf; } - bool isLocalBind() const { return getBind() == STB_LOCAL; } - bool isFileType() const { return getType() == STT_FILE; } + bool isLocalBind() const { return getBind() == ELF::STB_LOCAL; } + bool isFileType() const { return getType() == ELF::STT_FILE; } void setBind(unsigned X) { assert(X == (X & 0xF) && "Bind value out of range!"); @@ -222,51 +177,6 @@ namespace llvm { unsigned Align; // sh_addralign - Alignment of section. unsigned EntSize; // sh_entsize - Size of entries in the section - // Section Header Flags - enum { - SHF_WRITE = 1 << 0, // Writable - SHF_ALLOC = 1 << 1, // Mapped into the process addr space - SHF_EXECINSTR = 1 << 2, // Executable - SHF_MERGE = 1 << 4, // Might be merged if equal - SHF_STRINGS = 1 << 5, // Contains null-terminated strings - SHF_INFO_LINK = 1 << 6, // 'sh_info' contains SHT index - SHF_LINK_ORDER = 1 << 7, // Preserve order after combining - SHF_OS_NONCONFORMING = 1 << 8, // nonstandard OS support required - SHF_GROUP = 1 << 9, // Section is a member of a group - SHF_TLS = 1 << 10 // Section holds thread-local data - }; - - // Section Types - enum { - SHT_NULL = 0, // No associated section (inactive entry). - SHT_PROGBITS = 1, // Program-defined contents. - SHT_SYMTAB = 2, // Symbol table. - SHT_STRTAB = 3, // String table. - SHT_RELA = 4, // Relocation entries; explicit addends. - SHT_HASH = 5, // Symbol hash table. - SHT_DYNAMIC = 6, // Information for dynamic linking. - SHT_NOTE = 7, // Information about the file. - SHT_NOBITS = 8, // Data occupies no space in the file. - SHT_REL = 9, // Relocation entries; no explicit addends. - SHT_SHLIB = 10, // Reserved. - SHT_DYNSYM = 11, // Symbol table. - SHT_LOPROC = 0x70000000, // Lowest processor arch-specific type. - SHT_HIPROC = 0x7fffffff, // Highest processor arch-specific type. - SHT_LOUSER = 0x80000000, // Lowest type reserved for applications. - SHT_HIUSER = 0xffffffff // Highest type reserved for applications. - }; - - // Special section indices. - enum { - SHN_UNDEF = 0, // Undefined, missing, irrelevant - SHN_LORESERVE = 0xff00, // Lowest reserved index - SHN_LOPROC = 0xff00, // Lowest processor-specific index - SHN_HIPROC = 0xff1f, // Highest processor-specific index - SHN_ABS = 0xfff1, // Symbol has absolute value; no relocation - SHN_COMMON = 0xfff2, // FORTRAN COMMON or C external global variables - SHN_HIRESERVE = 0xffff // Highest reserved index - }; - /// SectionIdx - The number of the section in the Section Table. unsigned short SectionIdx; diff --git a/lib/CodeGen/ELFCodeEmitter.cpp b/lib/CodeGen/ELFCodeEmitter.cpp index 36b0e6514b3a9..3fb087c5ea8b8 100644 --- a/lib/CodeGen/ELFCodeEmitter.cpp +++ b/lib/CodeGen/ELFCodeEmitter.cpp @@ -71,7 +71,7 @@ void ELFCodeEmitter::startFunction(MachineFunction &MF) { bool ELFCodeEmitter::finishFunction(MachineFunction &MF) { // Add a symbol to represent the function. 
const Function *F = MF.getFunction(); - ELFSym *FnSym = ELFSym::getGV(F, EW.getGlobalELFBinding(F), ELFSym::STT_FUNC, + ELFSym *FnSym = ELFSym::getGV(F, EW.getGlobalELFBinding(F), ELF::STT_FUNC, EW.getGlobalELFVisibility(F)); FnSym->SectionIdx = ES->SectionIdx; FnSym->Size = ES->getCurrentPCOffset()-FnStartOff; diff --git a/lib/CodeGen/ELFWriter.cpp b/lib/CodeGen/ELFWriter.cpp index b644ebeb4be53..d14728d8a36c9 100644 --- a/lib/CodeGen/ELFWriter.cpp +++ b/lib/CodeGen/ELFWriter.cpp @@ -63,7 +63,7 @@ char ELFWriter::ID = 0; //===----------------------------------------------------------------------===// ELFWriter::ELFWriter(raw_ostream &o, TargetMachine &tm) - : MachineFunctionPass(&ID), O(o), TM(tm), + : MachineFunctionPass(ID), O(o), TM(tm), OutContext(*new MCContext(*TM.getMCAsmInfo())), TLOF(TM.getTargetLowering()->getObjFileLowering()), is64Bit(TM.getTargetData()->getPointerSizeInBits() == 64), @@ -129,12 +129,12 @@ bool ELFWriter::doInitialization(Module &M) { ElfHdr.emitByte(TEW->getEIClass()); // e_ident[EI_CLASS] ElfHdr.emitByte(TEW->getEIData()); // e_ident[EI_DATA] - ElfHdr.emitByte(EV_CURRENT); // e_ident[EI_VERSION] + ElfHdr.emitByte(ELF::EV_CURRENT); // e_ident[EI_VERSION] ElfHdr.emitAlignment(16); // e_ident[EI_NIDENT-EI_PAD] - ElfHdr.emitWord16(ET_REL); // e_type + ElfHdr.emitWord16(ELF::ET_REL); // e_type ElfHdr.emitWord16(TEW->getEMachine()); // e_machine = target - ElfHdr.emitWord32(EV_CURRENT); // e_version + ElfHdr.emitWord32(ELF::EV_CURRENT); // e_version ElfHdr.emitWord(0); // e_entry, no entry point in .o file ElfHdr.emitWord(0); // e_phoff, no program header for .o ELFHdr_e_shoff_Offset = ElfHdr.size(); @@ -252,7 +252,7 @@ ELFSection &ELFWriter::getConstantPoolSection(MachineConstantPoolEntry &CPE) { // is true if the relocation section contains entries with addends. ELFSection &ELFWriter::getRelocSection(ELFSection &S) { unsigned SectionType = TEW->hasRelocationAddend() ? 
- ELFSection::SHT_RELA : ELFSection::SHT_REL; + ELF::SHT_RELA : ELF::SHT_REL; std::string SectionName(".rel"); if (TEW->hasRelocationAddend()) @@ -268,11 +268,11 @@ unsigned ELFWriter::getGlobalELFVisibility(const GlobalValue *GV) { default: llvm_unreachable("unknown visibility type"); case GlobalValue::DefaultVisibility: - return ELFSym::STV_DEFAULT; + return ELF::STV_DEFAULT; case GlobalValue::HiddenVisibility: - return ELFSym::STV_HIDDEN; + return ELF::STV_HIDDEN; case GlobalValue::ProtectedVisibility: - return ELFSym::STV_PROTECTED; + return ELF::STV_PROTECTED; } return 0; } @@ -280,23 +280,23 @@ unsigned ELFWriter::getGlobalELFVisibility(const GlobalValue *GV) { // getGlobalELFBinding - Returns the ELF specific binding type unsigned ELFWriter::getGlobalELFBinding(const GlobalValue *GV) { if (GV->hasInternalLinkage()) - return ELFSym::STB_LOCAL; + return ELF::STB_LOCAL; if (GV->isWeakForLinker() && !GV->hasCommonLinkage()) - return ELFSym::STB_WEAK; + return ELF::STB_WEAK; - return ELFSym::STB_GLOBAL; + return ELF::STB_GLOBAL; } // getGlobalELFType - Returns the ELF specific type for a global unsigned ELFWriter::getGlobalELFType(const GlobalValue *GV) { if (GV->isDeclaration()) - return ELFSym::STT_NOTYPE; + return ELF::STT_NOTYPE; if (isa<Function>(GV)) - return ELFSym::STT_FUNC; + return ELF::STT_FUNC; - return ELFSym::STT_OBJECT; + return ELF::STT_OBJECT; } // IsELFUndefSym - True if the global value must be marked as a symbol @@ -364,7 +364,7 @@ void ELFWriter::EmitGlobal(const GlobalValue *GV) { GblSym->Size = Size; if (S->HasCommonSymbols()) { // Symbol must go to a common section - GblSym->SectionIdx = ELFSection::SHN_COMMON; + GblSym->SectionIdx = ELF::SHN_COMMON; // A new linkonce section is created for each global in the // common section, the default alignment is 1 and the symbol diff --git a/lib/CodeGen/ELFWriter.h b/lib/CodeGen/ELFWriter.h index db66ecc6dd83b..b8bac5598ecfc 100644 --- a/lib/CodeGen/ELFWriter.h +++ b/lib/CodeGen/ELFWriter.h @@ -39,6 +39,7 @@ namespace llvm { class raw_ostream; class SectionKind; class MCContext; + class TargetMachine; typedef std::vector<ELFSym*>::iterator ELFSymIter; typedef std::vector<ELFSection*>::iterator ELFSectionIter; @@ -160,29 +161,29 @@ namespace llvm { SN->SectionIdx = NumSections++; SN->Type = Type; SN->Flags = Flags; - SN->Link = ELFSection::SHN_UNDEF; + SN->Link = ELF::SHN_UNDEF; SN->Align = Align; return *SN; } ELFSection &getNonExecStackSection() { - return getSection(".note.GNU-stack", ELFSection::SHT_PROGBITS, 0, 1); + return getSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0, 1); } ELFSection &getSymbolTableSection() { - return getSection(".symtab", ELFSection::SHT_SYMTAB, 0); + return getSection(".symtab", ELF::SHT_SYMTAB, 0); } ELFSection &getStringTableSection() { - return getSection(".strtab", ELFSection::SHT_STRTAB, 0, 1); + return getSection(".strtab", ELF::SHT_STRTAB, 0, 1); } ELFSection &getSectionHeaderStringTableSection() { - return getSection(".shstrtab", ELFSection::SHT_STRTAB, 0, 1); + return getSection(".shstrtab", ELF::SHT_STRTAB, 0, 1); } ELFSection &getNullSection() { - return getSection("", ELFSection::SHT_NULL, 0); + return getSection("", ELF::SHT_NULL, 0); } ELFSection &getDataSection(); diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp index ab0a800225311..0f6e882a7be43 100644 --- a/lib/CodeGen/GCMetadata.cpp +++ b/lib/CodeGen/GCMetadata.cpp @@ -30,8 +30,8 @@ namespace { raw_ostream &OS; public: - Printer() : FunctionPass(&ID), OS(errs()) {} - explicit Printer(raw_ostream &OS) : 
FunctionPass(&ID), OS(OS) {} + Printer() : FunctionPass(ID), OS(errs()) {} + explicit Printer(raw_ostream &OS) : FunctionPass(ID), OS(OS) {} const char *getPassName() const; @@ -55,8 +55,8 @@ namespace { } -static RegisterPass<GCModuleInfo> -X("collector-metadata", "Create Garbage Collector Module Metadata"); +INITIALIZE_PASS(GCModuleInfo, "collector-metadata", + "Create Garbage Collector Module Metadata", false, false); // ----------------------------------------------------------------------------- @@ -70,7 +70,7 @@ GCFunctionInfo::~GCFunctionInfo() {} char GCModuleInfo::ID = 0; GCModuleInfo::GCModuleInfo() - : ImmutablePass(&ID) {} + : ImmutablePass(ID) {} GCModuleInfo::~GCModuleInfo() { clear(); @@ -189,7 +189,7 @@ FunctionPass *llvm::createGCInfoDeleter() { return new Deleter(); } -Deleter::Deleter() : FunctionPass(&ID) {} +Deleter::Deleter() : FunctionPass(ID) {} const char *Deleter::getPassName() const { return "Delete Garbage Collector Information"; diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp index 71506cc6abb9c..719fa194d8da1 100644 --- a/lib/CodeGen/GCStrategy.cpp +++ b/lib/CodeGen/GCStrategy.cpp @@ -130,7 +130,7 @@ FunctionPass *llvm::createGCLoweringPass() { char LowerIntrinsics::ID = 0; LowerIntrinsics::LowerIntrinsics() - : FunctionPass(&ID) {} + : FunctionPass(ID) {} const char *LowerIntrinsics::getPassName() const { return "Lower Garbage Collection Instructions"; @@ -260,7 +260,7 @@ bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) { bool LowerRd = !S.customReadBarrier(); bool InitRoots = S.initializeRoots(); - SmallVector<AllocaInst*,32> Roots; + SmallVector<AllocaInst*, 32> Roots; bool MadeChange = false; for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { @@ -271,7 +271,8 @@ bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) { case Intrinsic::gcwrite: if (LowerWr) { // Replace a write barrier with a simple store. 
- Value *St = new StoreInst(CI->getArgOperand(0), CI->getArgOperand(2), CI); + Value *St = new StoreInst(CI->getArgOperand(0), + CI->getArgOperand(2), CI); CI->replaceAllUsesWith(St); CI->eraseFromParent(); } @@ -317,7 +318,7 @@ FunctionPass *llvm::createGCMachineCodeAnalysisPass() { char MachineCodeAnalysis::ID = 0; MachineCodeAnalysis::MachineCodeAnalysis() - : MachineFunctionPass(&ID) {} + : MachineFunctionPass(ID) {} const char *MachineCodeAnalysis::getPassName() const { return "Analyze Machine Code For Garbage Collection"; diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 6b445e0b8e0f0..0ea30d7a7929d 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -154,7 +154,7 @@ namespace { int FnNum; public: static char ID; - IfConverter() : MachineFunctionPass(&ID), FnNum(-1) {} + IfConverter() : MachineFunctionPass(ID), FnNum(-1) {} virtual bool runOnMachineFunction(MachineFunction &MF); virtual const char *getPassName() const { return "If Converter"; } @@ -230,8 +230,7 @@ namespace { char IfConverter::ID = 0; } -static RegisterPass<IfConverter> -X("if-converter", "If Converter"); +INITIALIZE_PASS(IfConverter, "if-converter", "If Converter", false, false); FunctionPass *llvm::createIfConverterPass() { return new IfConverter(); } diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index 12adcaa3a22ee..b965bfdcf3b84 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -14,10 +14,12 @@ #define DEBUG_TYPE "spiller" #include "Spiller.h" +#include "SplitKit.h" #include "VirtRegMap.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetInstrInfo.h" @@ -28,8 +30,10 @@ using namespace llvm; namespace { class InlineSpiller : public Spiller { + MachineFunctionPass &pass_; MachineFunction &mf_; LiveIntervals &lis_; + MachineLoopInfo &loops_; VirtRegMap &vrm_; MachineFrameInfo &mfi_; MachineRegisterInfo &mri_; @@ -37,9 +41,11 @@ class InlineSpiller : public Spiller { const TargetRegisterInfo &tri_; const BitVector reserved_; + SplitAnalysis splitAnalysis_; + // Variables that are valid during spill(), but used by multiple methods. 
LiveInterval *li_; - std::vector<LiveInterval*> *newIntervals_; + SmallVectorImpl<LiveInterval*> *newIntervals_; const TargetRegisterClass *rc_; int stackSlot_; const SmallVectorImpl<LiveInterval*> *spillIs_; @@ -53,25 +59,34 @@ class InlineSpiller : public Spiller { ~InlineSpiller() {} public: - InlineSpiller(MachineFunction *mf, LiveIntervals *lis, VirtRegMap *vrm) - : mf_(*mf), lis_(*lis), vrm_(*vrm), - mfi_(*mf->getFrameInfo()), - mri_(mf->getRegInfo()), - tii_(*mf->getTarget().getInstrInfo()), - tri_(*mf->getTarget().getRegisterInfo()), - reserved_(tri_.getReservedRegs(mf_)) {} + InlineSpiller(MachineFunctionPass &pass, + MachineFunction &mf, + VirtRegMap &vrm) + : pass_(pass), + mf_(mf), + lis_(pass.getAnalysis<LiveIntervals>()), + loops_(pass.getAnalysis<MachineLoopInfo>()), + vrm_(vrm), + mfi_(*mf.getFrameInfo()), + mri_(mf.getRegInfo()), + tii_(*mf.getTarget().getInstrInfo()), + tri_(*mf.getTarget().getRegisterInfo()), + reserved_(tri_.getReservedRegs(mf_)), + splitAnalysis_(mf, lis_, loops_) {} void spill(LiveInterval *li, - std::vector<LiveInterval*> &newIntervals, - SmallVectorImpl<LiveInterval*> &spillIs, - SlotIndex *earliestIndex); + SmallVectorImpl<LiveInterval*> &newIntervals, + SmallVectorImpl<LiveInterval*> &spillIs); private: + bool split(); + bool allUsesAvailableAt(const MachineInstr *OrigMI, SlotIndex OrigIdx, SlotIndex UseIdx); bool reMaterializeFor(MachineBasicBlock::iterator MI); void reMaterializeAll(); + bool coalesceStackAccess(MachineInstr *MI); bool foldMemoryOperand(MachineBasicBlock::iterator MI, const SmallVectorImpl<unsigned> &Ops); void insertReload(LiveInterval &NewLI, MachineBasicBlock::iterator MI); @@ -80,12 +95,43 @@ private: } namespace llvm { -Spiller *createInlineSpiller(MachineFunction *mf, - LiveIntervals *lis, - const MachineLoopInfo *mli, - VirtRegMap *vrm) { - return new InlineSpiller(mf, lis, vrm); +Spiller *createInlineSpiller(MachineFunctionPass &pass, + MachineFunction &mf, + VirtRegMap &vrm) { + return new InlineSpiller(pass, mf, vrm); +} } + +/// split - try splitting the current interval into pieces that may allocate +/// separately. Return true if successful. +bool InlineSpiller::split() { + splitAnalysis_.analyze(li_); + + if (const MachineLoop *loop = splitAnalysis_.getBestSplitLoop()) { + // We can split, but li_ may be left intact with fewer uses. + if (SplitEditor(splitAnalysis_, lis_, vrm_, *newIntervals_) + .splitAroundLoop(loop)) + return true; + } + + // Try splitting into single block intervals. + SplitAnalysis::BlockPtrSet blocks; + if (splitAnalysis_.getMultiUseBlocks(blocks)) { + if (SplitEditor(splitAnalysis_, lis_, vrm_, *newIntervals_) + .splitSingleBlocks(blocks)) + return true; + } + + // Try splitting inside a basic block. + if (const MachineBasicBlock *MBB = splitAnalysis_.getBlockForInsideSplit()) { + if (SplitEditor(splitAnalysis_, lis_, vrm_, *newIntervals_) + .splitInsideBlock(MBB)) + return true; + } + + // We may have been able to split out some uses, but the original interval is + // intact, and it should still be spilled. 
+ return false; } /// allUsesAvailableAt - Return true if all registers used by OrigMI at @@ -237,7 +283,7 @@ void InlineSpiller::reMaterializeAll() { lis_.RemoveMachineInstrFromMaps(DefMI); vrm_.RemoveMachineInstrFromMaps(DefMI); DefMI->eraseFromParent(); - li_->removeValNo(VNI); + VNI->setIsDefAccurate(false); anyRemoved = true; } @@ -253,8 +299,8 @@ void InlineSpiller::reMaterializeAll() { MachineBasicBlock::iterator NextMI = MI; ++NextMI; if (NextMI != MI->getParent()->end() && !lis_.isNotInMIMap(NextMI)) { - SlotIndex NearIdx = lis_.getInstructionIndex(NextMI); - if (li_->liveAt(NearIdx)) + VNInfo *VNI = li_->getVNInfoAt(lis_.getInstructionIndex(NextMI)); + if (VNI && (VNI->hasPHIKill() || usedValues_.count(VNI))) continue; } DEBUG(dbgs() << "Removing debug info due to remat:" << "\t" << *MI); @@ -262,6 +308,24 @@ void InlineSpiller::reMaterializeAll() { } } +/// If MI is a load or store of stackSlot_, it can be removed. +bool InlineSpiller::coalesceStackAccess(MachineInstr *MI) { + int FI = 0; + unsigned reg; + if (!(reg = tii_.isLoadFromStackSlot(MI, FI)) && + !(reg = tii_.isStoreToStackSlot(MI, FI))) + return false; + + // We have a stack access. Is it the right register and slot? + if (reg != li_->reg || FI != stackSlot_) + return false; + + DEBUG(dbgs() << "Coalescing stack access: " << *MI); + lis_.RemoveMachineInstrFromMaps(MI); + MI->eraseFromParent(); + return true; +} + /// foldMemoryOperand - Try folding stack slot references in Ops into MI. /// Return true on success, and MI will be erased. bool InlineSpiller::foldMemoryOperand(MachineBasicBlock::iterator MI, @@ -323,9 +387,8 @@ void InlineSpiller::insertSpill(LiveInterval &NewLI, } void InlineSpiller::spill(LiveInterval *li, - std::vector<LiveInterval*> &newIntervals, - SmallVectorImpl<LiveInterval*> &spillIs, - SlotIndex *earliestIndex) { + SmallVectorImpl<LiveInterval*> &newIntervals, + SmallVectorImpl<LiveInterval*> &spillIs) { DEBUG(dbgs() << "Inline spilling " << *li << "\n"); assert(li->isSpillable() && "Attempting to spill already spilled value."); assert(!li->isStackSlot() && "Trying to spill a stack slot."); @@ -335,13 +398,18 @@ void InlineSpiller::spill(LiveInterval *li, rc_ = mri_.getRegClass(li->reg); spillIs_ = &spillIs; + if (split()) + return; + reMaterializeAll(); // Remat may handle everything. if (li_->empty()) return; - stackSlot_ = vrm_.assignVirt2StackSlot(li->reg); + stackSlot_ = vrm_.getStackSlot(li->reg); + if (stackSlot_ == VirtRegMap::NO_STACK_SLOT) + stackSlot_ = vrm_.assignVirt2StackSlot(li->reg); // Iterate over instructions using register. for (MachineRegisterInfo::reg_iterator RI = mri_.reg_begin(li->reg); @@ -365,6 +433,10 @@ void InlineSpiller::spill(LiveInterval *li, continue; } + // Stack slot accesses may coalesce away. + if (coalesceStackAccess(MI)) + continue; + // Analyze instruction. bool Reads, Writes; SmallVector<unsigned, 8> Ops; diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index 03ae214ae7dab..3852ebaf64253 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -481,7 +481,8 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { Value *Ops[3]; Ops[0] = CI->getArgOperand(0); // Extend the amount to i32. 
- Ops[1] = Builder.CreateIntCast(CI->getArgOperand(1), Type::getInt32Ty(Context), + Ops[1] = Builder.CreateIntCast(CI->getArgOperand(1), + Type::getInt32Ty(Context), /* isSigned */ false); Ops[2] = Size; ReplaceCallWith("memset", CI, Ops, Ops+3, CI->getArgOperand(0)->getType()); diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index bf3137e49536e..36038027b2594 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -85,7 +85,7 @@ static bool getVerboseAsm() { case cl::BOU_UNSET: return TargetMachine::getAsmVerbosityDefault(); case cl::BOU_TRUE: return true; case cl::BOU_FALSE: return false; - } + } } // Enable or disable FastISel. Both options are needed, because @@ -139,8 +139,6 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, getTarget().createMCInstPrinter(MAI.getAssemblerDialect(), MAI); // Create a code emitter if asked to show the encoding. - // - // FIXME: These are currently leaked. MCCodeEmitter *MCE = 0; if (ShowMCEncoding) MCE = getTarget().createCodeEmitter(*this, *Context); @@ -154,8 +152,6 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, case CGFT_ObjectFile: { // Create the code emitter for the target if it exists. If not, .o file // emission fails. - // - // FIXME: These are currently leaked. MCCodeEmitter *MCE = getTarget().createCodeEmitter(*this, *Context); TargetAsmBackend *TAB = getTarget().createAsmBackend(TargetTriple); if (MCE == 0 || TAB == 0) @@ -180,12 +176,12 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer); if (Printer == 0) return true; - + // If successful, createAsmPrinter took ownership of AsmStreamer. AsmStreamer.take(); - + PM.add(Printer); - + // Make sure the code model is set. setCodeModelForStatic(); PM.add(createGCInfoDeleter()); @@ -204,7 +200,7 @@ bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM, bool DisableVerify) { // Make sure the code model is set. setCodeModelForJIT(); - + // Add common CodeGen passes. MCContext *Ctx = 0; if (addCommonCodeGenPasses(PM, OptLevel, DisableVerify, Ctx)) @@ -216,19 +212,36 @@ bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM, return false; // success! } +/// addPassesToEmitMC - Add passes to the specified pass manager to get +/// machine code emitted with the MCJIT. This method returns true if machine +/// code is not supported. It fills the MCContext Ctx pointer which can be +/// used to build custom MCStreamer. +/// +bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, + MCContext *&Ctx, + CodeGenOpt::Level OptLevel, + bool DisableVerify) { + // Add common CodeGen passes. + if (addCommonCodeGenPasses(PM, OptLevel, DisableVerify, Ctx)) + return true; + // Make sure the code model is set. + setCodeModelForJIT(); + + return false; // success! 
+} + static void printNoVerify(PassManagerBase &PM, const char *Banner) { if (PrintMachineCode) PM.add(createMachineFunctionPrinterPass(dbgs(), Banner)); } static void printAndVerify(PassManagerBase &PM, - const char *Banner, - bool allowDoubleDefs = false) { + const char *Banner) { if (PrintMachineCode) PM.add(createMachineFunctionPrinterPass(dbgs(), Banner)); if (VerifyMachineCode) - PM.add(createMachineVerifierPass(allowDoubleDefs)); + PM.add(createMachineVerifierPass()); } /// addCommonCodeGenPasses - Add standard LLVM codegen passes used for both @@ -258,6 +271,11 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, PM.add(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs())); } + PM.add(createGCLoweringPass()); + + // Make sure that no unreachable blocks are instruction selected. + PM.add(createUnreachableBlockEliminationPass()); + // Turn exception handling constructs into something the code generators can // handle. switch (getMCAsmInfo()->getExceptionHandlingType()) { @@ -269,26 +287,25 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, // pad is shared by multiple invokes and is also a target of a normal // edge from elsewhere. PM.add(createSjLjEHPass(getTargetLowering())); - PM.add(createDwarfEHPass(this, OptLevel==CodeGenOpt::None)); - break; + // FALLTHROUGH case ExceptionHandling::Dwarf: - PM.add(createDwarfEHPass(this, OptLevel==CodeGenOpt::None)); + PM.add(createDwarfEHPass(this)); break; case ExceptionHandling::None: PM.add(createLowerInvokePass(getTargetLowering())); + + // The lower invoke pass may create unreachable code. Remove it. + PM.add(createUnreachableBlockEliminationPass()); break; } - PM.add(createGCLoweringPass()); - - // Make sure that no unreachable blocks are instruction selected. - PM.add(createUnreachableBlockEliminationPass()); - if (OptLevel != CodeGenOpt::None && !DisableCGP) PM.add(createCodeGenPreparePass(getTargetLowering())); PM.add(createStackProtectorPass(getTargetLowering())); + addPreISel(PM, OptLevel); + if (PrintISelInput) PM.add(createPrintFunctionPass("\n\n" "*** Final LLVM Code input to ISel ***\n", @@ -300,13 +317,12 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, PM.add(createVerifierPass()); // Standard Lower-Level Passes. - + // Install a MachineModuleInfo class, which is an immutable pass that holds // all the per-module stuff we're generating, including MCContext. MachineModuleInfo *MMI = new MachineModuleInfo(*getMCAsmInfo()); PM.add(MMI); OutContext = &MMI->getContext(); // Return the MCContext specifically by-ref. - // Set up a MachineFunction for the rest of CodeGen to work on. PM.add(new MachineFunctionAnalysis(*this, OptLevel)); @@ -321,44 +337,43 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, return true; // Print the instruction selected machine code... - printAndVerify(PM, "After Instruction Selection", - /* allowDoubleDefs= */ true); + printAndVerify(PM, "After Instruction Selection"); // Optimize PHIs before DCE: removing dead PHI cycles may make more // instructions dead. if (OptLevel != CodeGenOpt::None) PM.add(createOptimizePHIsPass()); + // If the target requests it, assign local variables to stack slots relative + // to one another and simplify frame index references where possible. + PM.add(createLocalStackSlotAllocationPass()); + if (OptLevel != CodeGenOpt::None) { // With optimization, dead code should already be eliminated. 
However // there is one known exception: lowered code for arguments that are only // used by tail calls, where the tail calls reuse the incoming stack // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). PM.add(createDeadMachineInstructionElimPass()); - printAndVerify(PM, "After codegen DCE pass", - /* allowDoubleDefs= */ true); + printAndVerify(PM, "After codegen DCE pass"); - PM.add(createOptimizeExtsPass()); + PM.add(createPeepholeOptimizerPass()); if (!DisableMachineLICM) PM.add(createMachineLICMPass()); PM.add(createMachineCSEPass()); if (!DisableMachineSink) PM.add(createMachineSinkingPass()); - printAndVerify(PM, "After Machine LICM, CSE and Sinking passes", - /* allowDoubleDefs= */ true); + printAndVerify(PM, "After Machine LICM, CSE and Sinking passes"); } // Pre-ra tail duplication. if (OptLevel != CodeGenOpt::None && !DisableEarlyTailDup) { PM.add(createTailDuplicatePass(true)); - printAndVerify(PM, "After Pre-RegAlloc TailDuplicate", - /* allowDoubleDefs= */ true); + printAndVerify(PM, "After Pre-RegAlloc TailDuplicate"); } // Run pre-ra passes. if (addPreRegAlloc(PM, OptLevel)) - printAndVerify(PM, "After PreRegAlloc passes", - /* allowDoubleDefs= */ true); + printAndVerify(PM, "After PreRegAlloc passes"); // Perform register allocation. PM.add(createRegisterAllocator(OptLevel)); diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp index ad5728458062f..59f380ad26414 100644 --- a/lib/CodeGen/LiveInterval.cpp +++ b/lib/CodeGen/LiveInterval.cpp @@ -166,6 +166,56 @@ bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const { return I != begin() && (--I)->end > Start; } + +/// ValNo is dead, remove it. If it is the largest value number, just nuke it +/// (and any other deleted values neighboring it), otherwise mark it as ~1U so +/// it can be nuked later. +void LiveInterval::markValNoForDeletion(VNInfo *ValNo) { + if (ValNo->id == getNumValNums()-1) { + do { + valnos.pop_back(); + } while (!valnos.empty() && valnos.back()->isUnused()); + } else { + ValNo->setIsUnused(true); + } +} + +/// RenumberValues - Renumber all values in order of appearance and delete the +/// remaining unused values. +void LiveInterval::RenumberValues(LiveIntervals &lis) { + SmallPtrSet<VNInfo*, 8> Seen; + bool seenPHIDef = false; + valnos.clear(); + for (const_iterator I = begin(), E = end(); I != E; ++I) { + VNInfo *VNI = I->valno; + if (!Seen.insert(VNI)) + continue; + assert(!VNI->isUnused() && "Unused valno used by live range"); + VNI->id = (unsigned)valnos.size(); + valnos.push_back(VNI); + VNI->setHasPHIKill(false); + if (VNI->isPHIDef()) + seenPHIDef = true; + } + + // Recompute phi kill flags. + if (!seenPHIDef) + return; + for (const_vni_iterator I = vni_begin(), E = vni_end(); I != E; ++I) { + VNInfo *VNI = *I; + if (!VNI->isPHIDef()) + continue; + const MachineBasicBlock *PHIBB = lis.getMBBFromIndex(VNI->def); + assert(PHIBB && "No basic block for phi-def"); + for (MachineBasicBlock::const_pred_iterator PI = PHIBB->pred_begin(), + PE = PHIBB->pred_end(); PI != PE; ++PI) { + VNInfo *KVNI = getVNInfoAt(lis.getMBBEndIdx(*PI).getPrevSlot()); + if (KVNI) + KVNI->setHasPHIKill(true); + } + } +} + /// extendIntervalEndTo - This method is used when we want to extend the range /// specified by I to end at the specified endpoint. To do this, we should /// merge and eliminate all ranges that this will overlap with. 
The iterator is @@ -175,7 +225,7 @@ void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) { VNInfo *ValNo = I->valno; // Search for the first interval that we can't merge with. - Ranges::iterator MergeTo = next(I); + Ranges::iterator MergeTo = llvm::next(I); for (; MergeTo != ranges.end() && NewEnd >= MergeTo->end; ++MergeTo) { assert(MergeTo->valno == ValNo && "Cannot merge with differing values!"); } @@ -184,11 +234,11 @@ void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) { I->end = std::max(NewEnd, prior(MergeTo)->end); // Erase any dead ranges. - ranges.erase(next(I), MergeTo); + ranges.erase(llvm::next(I), MergeTo); // If the newly formed range now touches the range after it and if they have // the same value number, merge the two ranges into one range. - Ranges::iterator Next = next(I); + Ranges::iterator Next = llvm::next(I); if (Next != ranges.end() && Next->start <= I->end && Next->valno == ValNo) { I->end = Next->end; ranges.erase(Next); @@ -227,7 +277,7 @@ LiveInterval::extendIntervalStartTo(Ranges::iterator I, SlotIndex NewStart) { MergeTo->end = I->end; } - ranges.erase(next(MergeTo), next(I)); + ranges.erase(llvm::next(MergeTo), llvm::next(I)); return MergeTo; } @@ -280,7 +330,7 @@ LiveInterval::addRangeFrom(LiveRange LR, iterator From) { return ranges.insert(it, LR); } -/// isInOneLiveRange - Return true if the range specified is entirely in +/// isInOneLiveRange - Return true if the range specified is entirely in /// a single LiveRange of the live interval. bool LiveInterval::isInOneLiveRange(SlotIndex Start, SlotIndex End) { Ranges::iterator I = std::upper_bound(ranges.begin(), ranges.end(), Start); @@ -314,16 +364,8 @@ void LiveInterval::removeRange(SlotIndex Start, SlotIndex End, break; } if (isDead) { - // Now that ValNo is dead, remove it. If it is the largest value - // number, just nuke it (and any other deleted values neighboring it), - // otherwise mark it as ~1U so it can be nuked later. - if (ValNo->id == getNumValNums()-1) { - do { - valnos.pop_back(); - } while (!valnos.empty() && valnos.back()->isUnused()); - } else { - ValNo->setIsUnused(true); - } + // Now that ValNo is dead, remove it. + markValNoForDeletion(ValNo); } } @@ -345,7 +387,7 @@ void LiveInterval::removeRange(SlotIndex Start, SlotIndex End, I->end = Start; // Trim the old interval. // Insert the new one. - ranges.insert(next(I), LiveRange(End, OldEnd, ValNo)); + ranges.insert(llvm::next(I), LiveRange(End, OldEnd, ValNo)); } /// removeValNo - Remove all the ranges defined by the specified value#. @@ -359,21 +401,13 @@ void LiveInterval::removeValNo(VNInfo *ValNo) { if (I->valno == ValNo) ranges.erase(I); } while (I != E); - // Now that ValNo is dead, remove it. If it is the largest value - // number, just nuke it (and any other deleted values neighboring it), - // otherwise mark it as ~1U so it can be nuked later. - if (ValNo->id == getNumValNums()-1) { - do { - valnos.pop_back(); - } while (!valnos.empty() && valnos.back()->isUnused()); - } else { - ValNo->setIsUnused(true); - } + // Now that ValNo is dead, remove it. + markValNoForDeletion(ValNo); } /// getLiveRangeContaining - Return the live range that contains the /// specified index, or null if there is none. 
-LiveInterval::const_iterator +LiveInterval::const_iterator LiveInterval::FindLiveRangeContaining(SlotIndex Idx) const { const_iterator It = std::upper_bound(begin(), end(), Idx); if (It != ranges.begin()) { @@ -385,7 +419,7 @@ LiveInterval::FindLiveRangeContaining(SlotIndex Idx) const { return end(); } -LiveInterval::iterator +LiveInterval::iterator LiveInterval::FindLiveRangeContaining(SlotIndex Idx) { iterator It = std::upper_bound(begin(), end(), Idx); if (It != begin()) { @@ -393,7 +427,7 @@ LiveInterval::FindLiveRangeContaining(SlotIndex Idx) { if (It->contains(Idx)) return It; } - + return end(); } @@ -425,11 +459,11 @@ VNInfo *LiveInterval::findDefinedVNInfoForStackInt(unsigned reg) const { /// the intervals are not joinable, this aborts. void LiveInterval::join(LiveInterval &Other, const int *LHSValNoAssignments, - const int *RHSValNoAssignments, + const int *RHSValNoAssignments, SmallVector<VNInfo*, 16> &NewVNInfo, MachineRegisterInfo *MRI) { // Determine if any of our live range values are mapped. This is uncommon, so - // we want to avoid the interval scan if not. + // we want to avoid the interval scan if not. bool MustMapCurValNos = false; unsigned NumVals = getNumValNums(); unsigned NumNewVals = NewVNInfo.size(); @@ -449,7 +483,7 @@ void LiveInterval::join(LiveInterval &Other, ++OutIt; for (iterator I = OutIt, E = end(); I != E; ++I) { OutIt->valno = NewVNInfo[LHSValNoAssignments[I->valno->id]]; - + // If this live range has the same value # as its immediate predecessor, // and if they are neighbors, remove one LiveRange. This happens when we // have [0,3:0)[4,7:1) and map 0/1 onto the same value #. @@ -460,12 +494,12 @@ void LiveInterval::join(LiveInterval &Other, OutIt->start = I->start; OutIt->end = I->end; } - + // Didn't merge, on to the next one. ++OutIt; } } - + // If we merge some live ranges, chop off the end. ranges.erase(OutIt, end()); } @@ -483,7 +517,7 @@ void LiveInterval::join(LiveInterval &Other, if (VNI) { if (NumValNos >= NumVals) valnos.push_back(VNI); - else + else valnos[NumValNos] = VNI; VNI->id = NumValNos++; // Renumber val#. } @@ -502,25 +536,13 @@ void LiveInterval::join(LiveInterval &Other, } ComputeJoinedWeight(Other); - - // Update regalloc hint if currently there isn't one. - if (TargetRegisterInfo::isVirtualRegister(reg) && - TargetRegisterInfo::isVirtualRegister(Other.reg)) { - std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(reg); - if (Hint.first == 0 && Hint.second == 0) { - std::pair<unsigned, unsigned> OtherHint = - MRI->getRegAllocationHint(Other.reg); - if (OtherHint.first || OtherHint.second) - MRI->setRegAllocationHint(reg, OtherHint.first, OtherHint.second); - } - } } /// MergeRangesInAsValue - Merge all of the intervals in RHS into this live /// interval as the specified value number. The LiveRanges in RHS are /// allowed to overlap with LiveRanges in the current interval, but only if /// the overlapping LiveRanges have the specified value number. -void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS, +void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS, VNInfo *LHSValNo) { // TODO: Make this more efficient. iterator InsertPos = begin(); @@ -569,7 +591,7 @@ void LiveInterval::MergeValueInAsValue( // If this trimmed away the whole range, ignore it. if (Start == End) continue; } - + // Map the valno in the other live range to the current live range. 
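The compaction step in LiveInterval::join above relies on one invariant: after value numbers are remapped, two ranges that are neighbors (the first ends exactly where the second starts) and now carry the same value number can be folded into a single range, as in the [0,3:0)[4,7:1) example in its comment. A minimal standalone model of that merge, with a hypothetical Range struct rather than the LLVM classes:

```cpp
#include <cassert>
#include <vector>

// Hypothetical model of a live range: [start, end) tagged with a value number.
struct Range { unsigned start, end, valno; };

// Sketch of the post-remap compaction: ranges that touch and carry the same
// value number collapse into one; everything else is kept as-is.
std::vector<Range> coalesce(const std::vector<Range> &in) {
  std::vector<Range> out;
  for (const Range &r : in) {
    if (!out.empty() && out.back().valno == r.valno &&
        out.back().end == r.start)
      out.back().end = r.end;  // extend the previous range
    else
      out.push_back(r);        // keep as a separate range
  }
  return out;
}

int main() {
  // [0,3) and [3,7) mapped onto the same value number collapse to [0,7).
  std::vector<Range> v = coalesce({{0, 3, 0}, {3, 7, 0}, {9, 12, 1}});
  assert(v.size() == 2 && v[0].end == 7);
  return 0;
}
```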
IP = addRangeFrom(LiveRange(Start, End, LHSValNo), IP); } @@ -584,18 +606,10 @@ void LiveInterval::MergeValueInAsValue( if (I->valno == V1) { isDead = false; break; - } - if (isDead) { - // Now that V1 is dead, remove it. If it is the largest value number, - // just nuke it (and any other deleted values neighboring it), otherwise - // mark it as ~1U so it can be nuked later. - if (V1->id == getNumValNums()-1) { - do { - valnos.pop_back(); - } while (!valnos.empty() && valnos.back()->isUnused()); - } else { - V1->setIsUnused(true); } + if (isDead) { + // Now that V1 is dead, remove it. + markValNoForDeletion(V1); } } } @@ -609,7 +623,7 @@ void LiveInterval::MergeInClobberRanges(LiveIntervals &li_, const LiveInterval &Clobbers, VNInfo::Allocator &VNInfoAllocator) { if (Clobbers.empty()) return; - + DenseMap<VNInfo*, VNInfo*> ValNoMaps; VNInfo *UnusedValNo = 0; iterator IP = begin(); @@ -679,10 +693,10 @@ void LiveInterval::MergeInClobberRange(LiveIntervals &li_, // for unknown values, use it. VNInfo *ClobberValNo = getNextValue(li_.getInvalidIndex(), 0, false, VNInfoAllocator); - + iterator IP = begin(); IP = std::upper_bound(IP, end(), Start); - + // If the start of this range overlaps with an existing liverange, trim it. if (IP != begin() && IP[-1].end > Start) { Start = IP[-1].end; @@ -695,7 +709,7 @@ void LiveInterval::MergeInClobberRange(LiveIntervals &li_, // If this trimmed away the whole range, ignore it. if (Start == End) return; } - + // Insert the clobber interval. addRangeFrom(LiveRange(Start, End, ClobberValNo), IP); } @@ -722,7 +736,7 @@ VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) { for (iterator I = begin(); I != end(); ) { iterator LR = I++; if (LR->valno != V1) continue; // Not a V1 LiveRange. - + // Okay, we found a V1 live range. If it had a previous, touching, V2 live // range, extend it. if (LR != begin()) { @@ -736,11 +750,11 @@ VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) { LR = Prev; } } - + // Okay, now we have a V1 or V2 live range that is maximally merged forward. // Ensure that it is a V2 live-range. LR->valno = V2; - + // If we can merge it into later V2 live ranges, do so now. We ignore any // following V1 live ranges, as they will be merged in subsequent iterations // of the loop. @@ -752,18 +766,10 @@ VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) { } } } - - // Now that V1 is dead, remove it. If it is the largest value number, just - // nuke it (and any other deleted values neighboring it), otherwise mark it as - // ~1U so it can be nuked later. - if (V1->id == getNumValNums()-1) { - do { - valnos.pop_back(); - } while (valnos.back()->isUnused()); - } else { - V1->setIsUnused(true); - } - + + // Now that V1 is dead, remove it. + markValNoForDeletion(V1); + return V2; } diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index 194d03d8dbfb5..2726fc337539c 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -47,7 +47,7 @@ using namespace llvm; // Hidden options for help debugging. 
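The mechanical next(I) to llvm::next(I) edits sprinkled through the LiveInterval.cpp hunks above are disambiguation, not a behavior change: once a standard library ships C++0x's std::next, an unqualified call on a container iterator can find std::next through argument-dependent lookup on some implementations, making the call ambiguous. A compilable illustration, with a simplified stand-in for the helper in llvm/ADT/STLExtras.h:

```cpp
#include <iterator>
#include <vector>

namespace llvm {
// Simplified version of the helper from llvm/ADT/STLExtras.h.
template <typename ItTy>
ItTy next(ItTy it) { return ++it; }
}

int main() {
  std::vector<int> v{1, 2, 3};
  // An unqualified next(v.begin()) may become ambiguous once std::next
  // exists, depending on the iterator's associated namespaces. Qualifying
  // the call picks one candidate explicitly.
  std::vector<int>::iterator i = llvm::next(v.begin());
  return *i == 2 ? 0 : 1;
}
```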
-static cl::opt<bool> DisableReMat("disable-rematerialization", +static cl::opt<bool> DisableReMat("disable-rematerialization", cl::init(false), cl::Hidden); STATISTIC(numIntervals , "Number of original intervals"); @@ -55,22 +55,24 @@ STATISTIC(numFolds , "Number of loads/stores folded into instructions"); STATISTIC(numSplits , "Number of intervals split"); char LiveIntervals::ID = 0; -static RegisterPass<LiveIntervals> X("liveintervals", "Live Interval Analysis"); +INITIALIZE_PASS(LiveIntervals, "liveintervals", + "Live Interval Analysis", false, false); void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<LiveVariables>(); AU.addRequired<LiveVariables>(); - AU.addPreservedID(MachineLoopInfoID); + AU.addPreserved<LiveVariables>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); AU.addPreservedID(MachineDominatorsID); - + if (!StrongPHIElim) { AU.addPreservedID(PHIEliminationID); AU.addRequiredID(PHIEliminationID); } - + AU.addRequiredID(TwoAddressInstructionPassID); AU.addPreserved<ProcessImplicitDefs>(); AU.addRequired<ProcessImplicitDefs>(); @@ -84,7 +86,7 @@ void LiveIntervals::releaseMemory() { for (DenseMap<unsigned, LiveInterval*>::iterator I = r2iMap_.begin(), E = r2iMap_.end(); I != E; ++I) delete I->second; - + r2iMap_.clear(); // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd. @@ -188,10 +190,6 @@ bool LiveIntervals::conflictsWithPhysReg(const LiveInterval &li, const MachineInstr &MI = *I; // Allow copies to and from li.reg - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (tii_->isMoveInstr(MI, SrcReg, DstReg, SrcSubReg, DstSubReg)) - if (SrcReg == li.reg || DstReg == li.reg) - continue; if (MI.isCopy()) if (MI.getOperand(0).getReg() == li.reg || MI.getOperand(1).getReg() == li.reg) @@ -278,7 +276,7 @@ bool MultipleDefsBySameMI(const MachineInstr &MI, unsigned MOIdx) { /// isPartialRedef - Return true if the specified def at the specific index is /// partially re-defining the specified live interval. A common case of this is -/// a definition of the sub-register. +/// a definition of the sub-register. bool LiveIntervals::isPartialRedef(SlotIndex MIIdx, MachineOperand &MO, LiveInterval &interval) { if (!MO.getSubReg() || MO.isEarlyClobber()) @@ -324,9 +322,7 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, mi->addRegisterDefined(interval.reg); MachineInstr *CopyMI = NULL; - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (mi->isCopyLike() || - tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg)) { + if (mi->isCopyLike()) { CopyMI = mi; } @@ -420,8 +416,8 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, // def-and-use register operand. // It may also be partial redef like this: - // 80 %reg1041:6<def> = VSHRNv4i16 %reg1034<kill>, 12, pred:14, pred:%reg0 - // 120 %reg1041:5<def> = VSHRNv4i16 %reg1039<kill>, 12, pred:14, pred:%reg0 + // 80 %reg1041:6<def> = VSHRNv4i16 %reg1034<kill>, 12, pred:14, pred:%reg0 + // 120 %reg1041:5<def> = VSHRNv4i16 %reg1039<kill>, 12, pred:14, pred:%reg0 bool PartReDef = isPartialRedef(MIIdx, MO, interval); if (PartReDef || mi->isRegTiedToUseOperand(MOIdx)) { // If this is a two-address definition, then we have already processed @@ -454,11 +450,9 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, OldValNo->setCopy(0); // A re-def may be a copy. e.g. %reg1030:6<def> = VMOVD %reg1026, ... 
- unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (PartReDef && (mi->isCopyLike() || - tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg))) + if (PartReDef && mi->isCopyLike()) OldValNo->setCopy(&*mi); - + // Add the new live interval which replaces the range for the input copy. LiveRange LR(DefIndex, RedefIndex, ValNo); DEBUG(dbgs() << " replace range with " << LR); @@ -485,12 +479,10 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, VNInfo *ValNo; MachineInstr *CopyMI = NULL; - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (mi->isCopyLike() || - tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg)) + if (mi->isCopyLike()) CopyMI = mi; ValNo = interval.getNextValue(defIndex, CopyMI, true, VNInfoAllocator); - + SlotIndex killIndex = getMBBEndIdx(mbb); LiveRange LR(defIndex, killIndex, ValNo); interval.addRange(LR); @@ -567,10 +559,10 @@ void LiveIntervals::handlePhysicalRegisterDef(MachineBasicBlock *MBB, goto exit; } } - + baseIndex = baseIndex.getNextIndex(); } - + // The only case we should have a dead physreg here without a killing or // instruction where we know it's dead is if it is live-in to the function // and never used. Another possible case is the implicit use of the @@ -602,9 +594,7 @@ void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB, getOrCreateInterval(MO.getReg())); else if (allocatableRegs_[MO.getReg()]) { MachineInstr *CopyMI = NULL; - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (MI->isCopyLike() || - tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg)) + if (MI->isCopyLike()) CopyMI = MI; handlePhysicalRegisterDef(MBB, MI, MIIdx, MO, getOrCreateInterval(MO.getReg()), CopyMI); @@ -696,7 +686,7 @@ void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB, /// registers. for some ordering of the machine instructions [1,N] a /// live interval is an interval [i, j) where 1 <= i <= j < N for /// which a variable is live -void LiveIntervals::computeIntervals() { +void LiveIntervals::computeIntervals() { DEBUG(dbgs() << "********** COMPUTING LIVE INTERVALS **********\n" << "********** Function: " << ((Value*)mf_->getFunction())->getName() << '\n'); @@ -723,11 +713,11 @@ void LiveIntervals::computeIntervals() { handleLiveInRegister(MBB, MIIndex, getOrCreateInterval(*AS), true); } - + // Skip over empty initial indices. if (getInstructionFromIndex(MIIndex) == 0) MIIndex = indexes_->getNextNonNullIndex(MIIndex); - + for (MachineBasicBlock::iterator MI = MBB->begin(), miEnd = MBB->end(); MI != miEnd; ++MI) { DEBUG(dbgs() << MIIndex << "\t" << *MI); @@ -746,7 +736,7 @@ void LiveIntervals::computeIntervals() { else if (MO.isUndef()) UndefUses.push_back(MO.getReg()); } - + // Move to the next instr slot. MIIndex = indexes_->getNextNonNullIndex(MIIndex); } @@ -791,7 +781,7 @@ unsigned LiveIntervals::getReMatImplicitUse(const LiveInterval &li, unsigned Reg = MO.getReg(); if (Reg == 0 || Reg == li.reg) continue; - + if (TargetRegisterInfo::isPhysicalRegister(Reg) && !allocatableRegs_[Reg]) continue; @@ -810,7 +800,7 @@ unsigned LiveIntervals::getReMatImplicitUse(const LiveInterval &li, /// which reaches the given instruction also reaches the specified use index. 
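A recurring pattern in these hunks is dropping the TargetInstrInfo::isMoveInstr query (with its four out-parameters) and keeping only MachineInstr::isCopyLike(). The sketch below models the shape of that simplification with a hypothetical opcode enum, not the LLVM MachineInstr API: once every target-specific move is lowered to the generic COPY form, "is this a copy?" becomes a plain opcode predicate.

```cpp
#include <cassert>

// Hypothetical opcode set standing in for the target-independent copy
// instructions that a predicate like MachineInstr::isCopyLike() covers.
enum class Opcode { Copy, SubregToReg, RegSequence, Add, Load };

struct Instr { Opcode op; };

// With a uniform COPY representation, no target hook is needed: the check
// is a direct property of the instruction itself.
bool isCopyLike(const Instr &mi) {
  return mi.op == Opcode::Copy || mi.op == Opcode::SubregToReg ||
         mi.op == Opcode::RegSequence;
}

int main() {
  assert(isCopyLike(Instr{Opcode::Copy}));
  assert(!isCopyLike(Instr{Opcode::Add}));
  return 0;
}
```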
bool LiveIntervals::isValNoAvailableAt(const LiveInterval &li, MachineInstr *MI, SlotIndex UseIdx) const { - SlotIndex Index = getInstructionIndex(MI); + SlotIndex Index = getInstructionIndex(MI); VNInfo *ValNo = li.FindLiveRangeContaining(Index)->valno; LiveInterval::const_iterator UI = li.FindLiveRangeContaining(UseIdx); return UI != li.end() && UI->valno == ValNo; @@ -915,7 +905,7 @@ static bool FilterFoldedOps(MachineInstr *MI, } return false; } - + /// tryFoldMemoryOperand - Attempts to fold either a spill / restore from /// slot / to reg or any rematerialized load into ith operand of specified @@ -1035,7 +1025,7 @@ void LiveIntervals::rewriteImplicitOps(const LiveInterval &li, /// for addIntervalsForSpills to rewrite uses / defs for the given live range. bool LiveIntervals:: rewriteInstructionForSpills(const LiveInterval &li, const VNInfo *VNI, - bool TrySplit, SlotIndex index, SlotIndex end, + bool TrySplit, SlotIndex index, SlotIndex end, MachineInstr *MI, MachineInstr *ReMatOrigDefMI, MachineInstr *ReMatDefMI, unsigned Slot, int LdSlot, @@ -1094,7 +1084,7 @@ rewriteInstructionForSpills(const LiveInterval &li, const VNInfo *VNI, // keep the src/dst regs pinned. // // Keep track of whether we replace a use and/or def so that we can - // create the spill interval with the appropriate range. + // create the spill interval with the appropriate range. SmallVector<unsigned, 2> Ops; tie(HasUse, HasDef) = MI->readsWritesVirtualRegister(Reg, &Ops); @@ -1156,7 +1146,7 @@ rewriteInstructionForSpills(const LiveInterval &li, const VNInfo *VNI, if (mopj.isImplicit()) rewriteImplicitOps(li, MI, NewVReg, vrm); } - + if (CreatedNewVReg) { if (DefIsReMat) { vrm.setVirtIsReMaterialized(NewVReg, ReMatDefMI); @@ -1696,7 +1686,7 @@ addIntervalsForSpills(const LiveInterval &li, if (NeedStackSlot && vrm.getPreSplitReg(li.reg) == 0) { if (vrm.getStackSlot(li.reg) == VirtRegMap::NO_STACK_SLOT) Slot = vrm.assignVirt2StackSlot(li.reg); - + // This case only occurs when the prealloc splitter has already assigned // a stack slot to this vreg. else @@ -1753,7 +1743,7 @@ addIntervalsForSpills(const LiveInterval &li, Ops.push_back(j); if (MO.isDef()) continue; - if (isReMat || + if (isReMat || (!FoundUse && !alsoFoldARestore(Id, index, VReg, RestoreMBBs, RestoreIdxes))) { // MI has two-address uses of the same register. If the use @@ -1866,7 +1856,6 @@ addIntervalsForSpills(const LiveInterval &li, for (unsigned i = 0, e = NewLIs.size(); i != e; ++i) { LiveInterval *LI = NewLIs[i]; if (!LI->empty()) { - LI->weight /= SlotIndex::NUM * getApproximateInstructionCount(*LI); if (!AddedKill.count(LI)) { LiveRange *LR = &LI->ranges[LI->ranges.size()-1]; SlotIndex LastUseIdx = LR->end.getBaseIndex(); @@ -1899,7 +1888,7 @@ bool LiveIntervals::hasAllocatableSuperReg(unsigned Reg) const { /// getRepresentativeReg - Find the largest super register of the specified /// physical register. unsigned LiveIntervals::getRepresentativeReg(unsigned Reg) const { - // Find the largest super-register that is allocatable. + // Find the largest super-register that is allocatable. 
 unsigned BestReg = Reg;
 for (const unsigned* AS = tri_->getSuperRegisters(Reg); *AS; ++AS) {
 unsigned SuperReg = *AS;
@@ -2013,7 +2002,7 @@ LiveRange LiveIntervals::addLiveRangeToEndOfBlock(unsigned reg,
 SlotIndex(getInstructionIndex(startInst).getDefIndex()),
 getMBBEndIdx(startInst->getParent()), VN);
 Interval.addRange(LR);
-
+
 return LR;
 }
diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp
index 709e2c6d5ca7e..b5c385f772394 100644
--- a/lib/CodeGen/LiveStackAnalysis.cpp
+++ b/lib/CodeGen/LiveStackAnalysis.cpp
@@ -25,7 +25,8 @@ using namespace llvm;
 char LiveStacks::ID = 0;
-static RegisterPass<LiveStacks> X("livestacks", "Live Stack Slot Analysis");
+INITIALIZE_PASS(LiveStacks, "livestacks",
+                "Live Stack Slot Analysis", false, false);
 void LiveStacks::getAnalysisUsage(AnalysisUsage &AU) const {
 AU.setPreservesAll();
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 41b891d30f23b..375307b973a97 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -42,7 +42,8 @@ using namespace llvm;
 char LiveVariables::ID = 0;
-static RegisterPass<LiveVariables> X("livevars", "Live Variable Analysis");
+INITIALIZE_PASS(LiveVariables, "livevars",
+                "Live Variable Analysis", false, false);
 void LiveVariables::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -482,21 +483,6 @@ void LiveVariables::UpdatePhysRegDefs(MachineInstr *MI,
 }
 }
-namespace {
-  struct RegSorter {
-    const TargetRegisterInfo *TRI;
-
-    RegSorter(const TargetRegisterInfo *tri) : TRI(tri) { }
-    bool operator()(unsigned A, unsigned B) {
-      if (TRI->isSubRegister(A, B))
-        return true;
-      else if (TRI->isSubRegister(B, A))
-        return false;
-      return A < B;
-    }
-  };
-}
-
 bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
 MF = &mf;
 MRI = &mf.getRegInfo();
diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp
new file mode 100644
index 0000000000000..7e366f0ceec02
--- /dev/null
+++ b/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -0,0 +1,354 @@
+//===- LocalStackSlotAllocation.cpp - Pre-allocate locals to stack slots --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass assigns local frame indices to stack slots relative to one another
+// and allocates additional base registers to access them when the target
+// estimates they are likely to be out of range of stack pointer and frame
+// pointer relative addressing.
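Before the new file continues, its premise is worth making concrete: lay locals out at fixed offsets within one block, then walk the frame references in offset order and materialize a new base register only when no existing base is in range. A toy model of the greedy reuse, under an assumed encodable offset of [-256, 255] (an ARM-like constraint chosen purely for illustration):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Toy constraint: an instruction can encode a signed offset in [-256, 255]
// from some base register. These bounds are an assumption for illustration,
// not any particular target's rules.
bool offsetIsLegal(int64_t off) { return off >= -256 && off <= 255; }

int main() {
  // Local offsets of frame references, sorted (as the pass sorts FrameRefs).
  std::vector<int64_t> refs = {0, 40, 200, 480, 520, 1200};

  std::vector<int64_t> baseOffsets;  // where each virtual base points
  for (int64_t r : refs) {
    bool reused = false;
    for (int64_t b : baseOffsets)
      if (offsetIsLegal(r - b)) { reused = true; break; }
    if (!reused)
      baseOffsets.push_back(r);      // materialize a new base here
  }
  // refs 0..200 share base 0; 480 and 520 share base 480; 1200 needs its own.
  std::cout << "bases needed: " << baseOffsets.size() << "\n";  // prints 3
  return 0;
}
```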
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "localstackalloc" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetFrameInfo.h" + +using namespace llvm; + +STATISTIC(NumAllocations, "Number of frame indices allocated into local block"); +STATISTIC(NumBaseRegisters, "Number of virtual frame base registers allocated"); +STATISTIC(NumReplacements, "Number of frame indices references replaced"); + +namespace { + class FrameRef { + MachineBasicBlock::iterator MI; // Instr referencing the frame + int64_t LocalOffset; // Local offset of the frame idx referenced + public: + FrameRef(MachineBasicBlock::iterator I, int64_t Offset) : + MI(I), LocalOffset(Offset) {} + bool operator<(const FrameRef &RHS) const { + return LocalOffset < RHS.LocalOffset; + } + MachineBasicBlock::iterator getMachineInstr() { return MI; } + }; + + class LocalStackSlotPass: public MachineFunctionPass { + SmallVector<int64_t,16> LocalOffsets; + + void AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, int64_t &Offset, + bool StackGrowsDown, unsigned &MaxAlign); + void calculateFrameObjectOffsets(MachineFunction &Fn); + bool insertFrameReferenceRegisters(MachineFunction &Fn); + public: + static char ID; // Pass identification, replacement for typeid + explicit LocalStackSlotPass() : MachineFunctionPass(ID) { } + bool runOnMachineFunction(MachineFunction &MF); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + const char *getPassName() const { + return "Local Stack Slot Allocation"; + } + + private: + }; +} // end anonymous namespace + +char LocalStackSlotPass::ID = 0; + +FunctionPass *llvm::createLocalStackSlotAllocationPass() { + return new LocalStackSlotPass(); +} + +bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) { + MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + unsigned LocalObjectCount = MFI->getObjectIndexEnd(); + + // If the target doesn't want/need this pass, or if there are no locals + // to consider, early exit. + if (!TRI->requiresVirtualBaseRegisters(MF) || LocalObjectCount == 0) + return true; + + // Make sure we have enough space to store the local offsets. + LocalOffsets.resize(MFI->getObjectIndexEnd()); + + // Lay out the local blob. + calculateFrameObjectOffsets(MF); + + // Insert virtual base registers to resolve frame index references. + bool UsedBaseRegs = insertFrameReferenceRegisters(MF); + + // Tell MFI whether any base registers were allocated. PEI will only + // want to use the local block allocations from this pass if there were any. 
+ // Otherwise, PEI can do a bit better job of getting the alignment right + // without a hole at the start since it knows the alignment of the stack + // at the start of local allocation, and this pass doesn't. + MFI->setUseLocalStackAllocationBlock(UsedBaseRegs); + + return true; +} + +/// AdjustStackOffset - Helper function used to adjust the stack frame offset. +void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo *MFI, + int FrameIdx, int64_t &Offset, + bool StackGrowsDown, + unsigned &MaxAlign) { + // If the stack grows down, add the object size to find the lowest address. + if (StackGrowsDown) + Offset += MFI->getObjectSize(FrameIdx); + + unsigned Align = MFI->getObjectAlignment(FrameIdx); + + // If the alignment of this object is greater than that of the stack, then + // increase the stack alignment to match. + MaxAlign = std::max(MaxAlign, Align); + + // Adjust to alignment boundary. + Offset = (Offset + Align - 1) / Align * Align; + + int64_t LocalOffset = StackGrowsDown ? -Offset : Offset; + DEBUG(dbgs() << "Allocate FI(" << FrameIdx << ") to local offset " + << LocalOffset << "\n"); + // Keep the offset available for base register allocation + LocalOffsets[FrameIdx] = LocalOffset; + // And tell MFI about it for PEI to use later + MFI->mapLocalFrameObject(FrameIdx, LocalOffset); + + if (!StackGrowsDown) + Offset += MFI->getObjectSize(FrameIdx); + + ++NumAllocations; +} + +/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the +/// abstract stack objects. +/// +void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { + // Loop over all of the stack objects, assigning sequential addresses... + MachineFrameInfo *MFI = Fn.getFrameInfo(); + const TargetFrameInfo &TFI = *Fn.getTarget().getFrameInfo(); + bool StackGrowsDown = + TFI.getStackGrowthDirection() == TargetFrameInfo::StackGrowsDown; + int64_t Offset = 0; + unsigned MaxAlign = 0; + + // Make sure that the stack protector comes before the local variables on the + // stack. + SmallSet<int, 16> LargeStackObjs; + if (MFI->getStackProtectorIndex() >= 0) { + AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), Offset, + StackGrowsDown, MaxAlign); + + // Assign large stack objects first. + for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isDeadObjectIndex(i)) + continue; + if (MFI->getStackProtectorIndex() == (int)i) + continue; + if (!MFI->MayNeedStackProtector(i)) + continue; + + AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign); + LargeStackObjs.insert(i); + } + } + + // Then assign frame offsets to stack objects that are not used to spill + // callee saved registers. 
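The Offset = (Offset + Align - 1) / Align * Align line in AdjustStackOffset above is the usual round-up-to-alignment idiom. A few worked cases, as a self-contained check:

```cpp
#include <cassert>
#include <cstdint>

// Round offset up to the next multiple of align, as AdjustStackOffset does.
// Stack alignments are powers of two, though the divide/multiply form works
// for any positive align.
int64_t alignTo(int64_t offset, int64_t align) {
  return (offset + align - 1) / align * align;
}

int main() {
  assert(alignTo(0, 8) == 0);
  assert(alignTo(1, 8) == 8);     // 1 byte used, next 8-byte slot is at 8
  assert(alignTo(17, 16) == 32);  // already past 16, bump to 32
  assert(alignTo(32, 16) == 32);  // exact multiples are unchanged
  return 0;
}
```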
+  for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+    if (MFI->isDeadObjectIndex(i))
+      continue;
+    if (MFI->getStackProtectorIndex() == (int)i)
+      continue;
+    if (LargeStackObjs.count(i))
+      continue;
+
+    AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign);
+  }
+
+  // Remember how big this blob of stack space is.
+  MFI->setLocalFrameSize(Offset);
+  MFI->setLocalFrameMaxAlign(MaxAlign);
+}
+
+static inline bool
+lookupCandidateBaseReg(const SmallVector<std::pair<unsigned, int64_t>, 8> &Regs,
+                       std::pair<unsigned, int64_t> &RegOffset,
+                       int64_t FrameSizeAdjust,
+                       int64_t LocalFrameOffset,
+                       const MachineInstr *MI,
+                       const TargetRegisterInfo *TRI) {
+  unsigned e = Regs.size();
+  for (unsigned i = 0; i < e; ++i) {
+    RegOffset = Regs[i];
+    // Check if the relative offset from where the base register points to
+    // the target address is in range for the instruction.
+    int64_t Offset = FrameSizeAdjust + LocalFrameOffset - RegOffset.second;
+    if (TRI->isFrameOffsetLegal(MI, Offset))
+      return true;
+  }
+  return false;
+}
+
+bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
+  // Scan the function's instructions looking for frame index references.
+  // For each, ask the target if it wants a virtual base register for it
+  // based on what we can tell it about where the local will end up in the
+  // stack frame. If it wants one, re-use a suitable one we've previously
+  // allocated, or if there isn't one that fits the bill, allocate a new one
+  // and ask the target to create a defining instruction for it.
+  bool UsedBaseReg = false;
+
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+  const TargetFrameInfo &TFI = *Fn.getTarget().getFrameInfo();
+  bool StackGrowsDown =
+    TFI.getStackGrowthDirection() == TargetFrameInfo::StackGrowsDown;
+  MachineBasicBlock::iterator InsertionPt = Fn.begin()->begin();
+
+  // Collect all of the instructions in the block that reference
+  // a frame index. Also store the frame index referenced to ease later
+  // lookup. (For any insn that has more than one FI reference, we arbitrarily
+  // choose the first one).
+  SmallVector<FrameRef, 64> FrameReferenceInsns;
+  // A base register definition is a register+offset pair.
+  SmallVector<std::pair<unsigned, int64_t>, 8> BaseRegisters;
+
+
+  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+      MachineInstr *MI = I;
+      // Debug value instructions can't be out of range, so they don't need
+      // any updates.
+      if (MI->isDebugValue())
+        continue;
+      // For now, allocate the base register(s) within the basic block
+      // where they're used, and don't try to keep them around outside
+      // of that. It may be beneficial to try sharing them more broadly
+      // than that, but the increased register pressure makes that a
+      // tricky thing to balance. Investigate if re-materializing these
+      // becomes an issue.
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        // Consider replacing all frame index operands that reference
+        // an object allocated in the local block.
+        if (MI->getOperand(i).isFI()) {
+          // Don't try this with values not in the local block.
+          if (!MFI->isObjectPreAllocated(MI->getOperand(i).getIndex()))
+            break;
+          FrameReferenceInsns.
+            push_back(FrameRef(MI, LocalOffsets[MI->getOperand(i).getIndex()]));
+          break;
+        }
+      }
+    }
+  }
+  // Sort the frame references by local offset.
+  array_pod_sort(FrameReferenceInsns.begin(), FrameReferenceInsns.end());
+
+
+  // Loop through the frame references and allocate for them as necessary.
+  for (int ref = 0, e = FrameReferenceInsns.size(); ref < e; ++ref) {
+    MachineBasicBlock::iterator I =
+      FrameReferenceInsns[ref].getMachineInstr();
+    MachineInstr *MI = I;
+    for (unsigned idx = 0, e = MI->getNumOperands(); idx != e; ++idx) {
+      // Consider replacing all frame index operands that reference
+      // an object allocated in the local block.
+      if (MI->getOperand(idx).isFI()) {
+        int FrameIdx = MI->getOperand(idx).getIndex();
+
+        assert(MFI->isObjectPreAllocated(FrameIdx) &&
+               "Only pre-allocated locals expected!");
+
+        DEBUG(dbgs() << "Considering: " << *MI);
+        if (TRI->needsFrameBaseReg(MI, LocalOffsets[FrameIdx])) {
+          unsigned BaseReg = 0;
+          int64_t Offset = 0;
+          int64_t FrameSizeAdjust =
+            StackGrowsDown ? MFI->getLocalFrameSize() : 0;
+
+          DEBUG(dbgs() << "  Replacing FI in: " << *MI);
+
+          // If we have a suitable base register available, use it; otherwise
+          // create a new one. Note that any offset encoded in the
+          // instruction itself will be taken into account by the target,
+          // so we don't have to adjust for it here when reusing a base
+          // register.
+          std::pair<unsigned, int64_t> RegOffset;
+          if (lookupCandidateBaseReg(BaseRegisters, RegOffset,
+                                     FrameSizeAdjust,
+                                     LocalOffsets[FrameIdx],
+                                     MI, TRI)) {
+            DEBUG(dbgs() << "  Reusing base register " <<
+                  RegOffset.first << "\n");
+            // We found a register to reuse.
+            BaseReg = RegOffset.first;
+            Offset = FrameSizeAdjust + LocalOffsets[FrameIdx] -
+              RegOffset.second;
+          } else {
+            // No previously defined register was in range, so create a
+            // new one.
+            int64_t InstrOffset = TRI->getFrameIndexInstrOffset(MI, idx);
+            const TargetRegisterClass *RC = TRI->getPointerRegClass();
+            BaseReg = Fn.getRegInfo().createVirtualRegister(RC);
+
+            DEBUG(dbgs() << "  Materializing base register " << BaseReg <<
+                  " at frame local offset " <<
+                  LocalOffsets[FrameIdx] + InstrOffset << "\n");
+            // Tell the target to insert the instruction to initialize
+            // the base register.
+            TRI->materializeFrameBaseRegister(InsertionPt, BaseReg,
+                                              FrameIdx, InstrOffset);
+
+            // The base register already includes any offset specified
+            // by the instruction, so account for that so it doesn't get
+            // applied twice.
+            Offset = -InstrOffset;
+
+            int64_t BaseOffset = FrameSizeAdjust + LocalOffsets[FrameIdx] +
+              InstrOffset;
+            BaseRegisters.push_back(
+              std::pair<unsigned, int64_t>(BaseReg, BaseOffset));
+            ++NumBaseRegisters;
+            UsedBaseReg = true;
+          }
+          assert(BaseReg != 0 && "Unable to allocate virtual base register!");
+
+          // Modify the instruction to use the new base register rather
+          // than the frame index operand.
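Before the resolveFrameIndex call that follows, the two offset computations above deserve a concrete walk-through with made-up numbers: when reusing a base register, the operand offset is the distance from where that base points; when materializing a new base, the operand only needs to cancel the offset the instruction already encodes.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Worked example of the offset bookkeeping (all values hypothetical).
  int64_t FrameSizeAdjust = 64;  // stack grows down: local block size
  int64_t LocalOffset     = -40; // where the object sits in the local block
  int64_t InstrOffset     = 4;   // offset already encoded in the instruction

  // Reuse case: the resolved operand offset is the distance from where the
  // existing base register points to the object's address.
  int64_t BaseRegTarget = 16;    // RegOffset.second of an existing base
  int64_t Reused = FrameSizeAdjust + LocalOffset - BaseRegTarget;
  assert(Reused == 8);

  // Materialize case: the new base is defined to point at
  // FrameSizeAdjust + LocalOffset + InstrOffset, so the operand offset only
  // has to cancel the instruction's own encoded offset.
  int64_t BaseOffset = FrameSizeAdjust + LocalOffset + InstrOffset;
  assert(BaseOffset == 28);
  int64_t Resolved = -InstrOffset;  // operand offset after resolution
  // base target + operand offset + instruction offset reaches the address.
  assert(BaseOffset + Resolved + InstrOffset ==
         FrameSizeAdjust + LocalOffset + InstrOffset);
  return 0;
}
```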
+ TRI->resolveFrameIndex(I, BaseReg, Offset); + DEBUG(dbgs() << "Resolved: " << *MI); + + ++NumReplacements; + } + } + } + } + return UsedBaseReg; +} diff --git a/lib/CodeGen/LowerSubregs.cpp b/lib/CodeGen/LowerSubregs.cpp index dfd4eaeca6603..ad1c537c1911a 100644 --- a/lib/CodeGen/LowerSubregs.cpp +++ b/lib/CodeGen/LowerSubregs.cpp @@ -36,7 +36,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - LowerSubregsInstructionPass() : MachineFunctionPass(&ID) {} + LowerSubregsInstructionPass() : MachineFunctionPass(ID) {} const char *getPassName() const { return "Subregister lowering instruction pass"; @@ -58,9 +58,6 @@ namespace { void TransferDeadFlag(MachineInstr *MI, unsigned DstReg, const TargetRegisterInfo *TRI); - void TransferKillFlag(MachineInstr *MI, unsigned SrcReg, - const TargetRegisterInfo *TRI, - bool AddIfNotFound = false); void TransferImplicitDefs(MachineInstr *MI); }; @@ -87,23 +84,6 @@ LowerSubregsInstructionPass::TransferDeadFlag(MachineInstr *MI, } } -/// TransferKillFlag - MI is a pseudo-instruction with SrcReg killed, -/// and the lowered replacement instructions immediately precede it. -/// Mark the replacement instructions with the kill flag. -void -LowerSubregsInstructionPass::TransferKillFlag(MachineInstr *MI, - unsigned SrcReg, - const TargetRegisterInfo *TRI, - bool AddIfNotFound) { - for (MachineBasicBlock::iterator MII = - prior(MachineBasicBlock::iterator(MI)); ; --MII) { - if (MII->addRegisterKilled(SrcReg, TRI, AddIfNotFound)) - break; - assert(MII != MI->getParent()->begin() && - "copyPhysReg output doesn't reference source register!"); - } -} - /// TransferImplicitDefs - MI is a pseudo-instruction, and the lowered /// replacement instructions immediately precede it. Copy any implicit-def /// operands from MI to the replacement instruction. diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index a27ee479433be..50f3f672dcedc 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -441,7 +441,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock(); MF->insert(llvm::next(MachineFunction::iterator(this)), NMBB); - DEBUG(dbgs() << "PHIElimination splitting critical edge:" + DEBUG(dbgs() << "Splitting critical edge:" " BB#" << getNumber() << " -- BB#" << NMBB->getNumber() << " -- BB#" << Succ->getNumber() << '\n'); @@ -468,11 +468,33 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { LV->addNewBlock(NMBB, this, Succ); if (MachineDominatorTree *MDT = - P->getAnalysisIfAvailable<MachineDominatorTree>()) - MDT->addNewBlock(NMBB, this); + P->getAnalysisIfAvailable<MachineDominatorTree>()) { + // Update dominator information. + MachineDomTreeNode *SucccDTNode = MDT->getNode(Succ); + + bool IsNewIDom = true; + for (const_pred_iterator PI = Succ->pred_begin(), E = Succ->pred_end(); + PI != E; ++PI) { + MachineBasicBlock *PredBB = *PI; + if (PredBB == NMBB) + continue; + if (!MDT->dominates(SucccDTNode, MDT->getNode(PredBB))) { + IsNewIDom = false; + break; + } + } + + // We know "this" dominates the newly created basic block. + MachineDomTreeNode *NewDTNode = MDT->addNewBlock(NMBB, this); + + // If all the other predecessors of "Succ" are dominated by "Succ" itself + // then the new block is the new immediate dominator of "Succ". Otherwise, + // the new block doesn't dominate anything. 
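The dominator update in SplitCriticalEdge above hinges on one observation, which the code just below applies: the new block on the edge from "this" to Succ can become Succ's immediate dominator only if every other predecessor of Succ was already dominated by Succ, meaning only back edges remain. A small standalone model of that test (plain ints as block ids, an explicit dominance set; not the MachineDominatorTree API):

```cpp
#include <cassert>
#include <set>
#include <utility>
#include <vector>

// After splitting edge From -> Succ with a new block N, N is Succ's new
// immediate dominator exactly when every other predecessor of Succ is
// dominated by Succ itself; otherwise some path into Succ bypasses N and
// the new block dominates nothing.
bool newBlockIsIdom(const std::vector<int> &succPreds, int newBlock, int succ,
                    const std::set<std::pair<int, int>> &dominates) {
  for (int p : succPreds) {
    if (p == newBlock)
      continue;
    if (!dominates.count({succ, p}))
      return false;
  }
  return true;
}

int main() {
  // Blocks: 1 = From, 2 = Succ, 3 = a loop block dominated by Succ, 4 = N.
  std::set<std::pair<int, int>> dom = {{2, 3}};  // Succ dominates 3
  assert(newBlockIsIdom({4, 3}, 4, 2, dom));     // only a back edge remains
  assert(!newBlockIsIdom({4, 5}, 4, 2, dom));    // block 5 is another entry
  return 0;
}
```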
+ if (IsNewIDom) + MDT->changeImmediateDominator(SucccDTNode, NewDTNode); + } - if (MachineLoopInfo *MLI = - P->getAnalysisIfAvailable<MachineLoopInfo>()) + if (MachineLoopInfo *MLI = P->getAnalysisIfAvailable<MachineLoopInfo>()) if (MachineLoop *TIL = MLI->getLoopFor(this)) { // If one or the other blocks were not in a loop, the new block is not // either, and thus LI doesn't need to be updated. diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index 833cc00027db5..92e2299ec62fa 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -41,7 +41,7 @@ namespace { MachineRegisterInfo *MRI; public: static char ID; // Pass identification - MachineCSE() : MachineFunctionPass(&ID), LookAheadLimit(5), CurrVN(0) {} + MachineCSE() : MachineFunctionPass(ID), LookAheadLimit(5), CurrVN(0) {} virtual bool runOnMachineFunction(MachineFunction &MF); @@ -49,6 +49,7 @@ namespace { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); AU.addRequired<AliasAnalysis>(); + AU.addPreservedID(MachineLoopInfoID); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); } @@ -85,8 +86,8 @@ namespace { } // end anonymous namespace char MachineCSE::ID = 0; -static RegisterPass<MachineCSE> -X("machine-cse", "Machine Common Subexpression Elimination"); +INITIALIZE_PASS(MachineCSE, "machine-cse", + "Machine Common Subexpression Elimination", false, false); FunctionPass *llvm::createMachineCSEPass() { return new MachineCSE(); } @@ -107,29 +108,9 @@ bool MachineCSE::PerformTrivialCoalescing(MachineInstr *MI, MachineInstr *DefMI = MRI->getVRegDef(Reg); if (DefMI->getParent() != MBB) continue; - unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (TII->isMoveInstr(*DefMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) && - TargetRegisterInfo::isVirtualRegister(SrcReg) && - !SrcSubIdx && !DstSubIdx) { - const TargetRegisterClass *SRC = MRI->getRegClass(SrcReg); - const TargetRegisterClass *RC = MRI->getRegClass(Reg); - const TargetRegisterClass *NewRC = getCommonSubClass(RC, SRC); - if (!NewRC) - continue; - DEBUG(dbgs() << "Coalescing: " << *DefMI); - DEBUG(dbgs() << "*** to: " << *MI); - MO.setReg(SrcReg); - MRI->clearKillFlags(SrcReg); - if (NewRC != SRC) - MRI->setRegClass(SrcReg, NewRC); - DefMI->eraseFromParent(); - ++NumCoalesces; - Changed = true; - } - if (!DefMI->isCopy()) continue; - SrcReg = DefMI->getOperand(1).getReg(); + unsigned SrcReg = DefMI->getOperand(1).getReg(); if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) continue; if (DefMI->getOperand(0).getSubReg() || DefMI->getOperand(1).getSubReg()) @@ -261,19 +242,13 @@ bool MachineCSE::PhysRegDefReaches(MachineInstr *CSMI, MachineInstr *MI, return false; } -static bool isCopy(const MachineInstr *MI, const TargetInstrInfo *TII) { - unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - return MI->isCopyLike() || - TII->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx); -} - bool MachineCSE::isCSECandidate(MachineInstr *MI) { if (MI->isLabel() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() || MI->isInlineAsm() || MI->isDebugValue()) return false; // Ignore copies. - if (isCopy(MI, TII)) + if (MI->isCopyLike()) return false; // Ignore stuff that we obviously can't move. @@ -329,7 +304,7 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg, E = MRI->use_nodbg_end(); I != E; ++I) { MachineInstr *Use = &*I; // Ignore copies. 
- if (!isCopy(Use, TII)) { + if (!Use->isCopyLike()) { HasNonCopyUse = true; break; } @@ -385,7 +360,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // Look for trivial copy coalescing opportunities. if (PerformTrivialCoalescing(MI, MBB)) { // After coalescing MI itself may become a copy. - if (isCopy(MI, TII)) + if (MI->isCopyLike()) continue; FoundCSE = VNT.count(MI); } diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp index b5f8fbba99deb..3c674789244a2 100644 --- a/lib/CodeGen/MachineDominators.cpp +++ b/lib/CodeGen/MachineDominators.cpp @@ -24,10 +24,10 @@ TEMPLATE_INSTANTIATION(class DominatorTreeBase<MachineBasicBlock>); char MachineDominatorTree::ID = 0; -static RegisterPass<MachineDominatorTree> -E("machinedomtree", "MachineDominator Tree Construction", true); +INITIALIZE_PASS(MachineDominatorTree, "machinedomtree", + "MachineDominator Tree Construction", true, true); -const PassInfo *const llvm::MachineDominatorsID = &E; +char &llvm::MachineDominatorsID = MachineDominatorTree::ID; void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); @@ -41,7 +41,7 @@ bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) { } MachineDominatorTree::MachineDominatorTree() - : MachineFunctionPass(&ID) { + : MachineFunctionPass(ID) { DT = new DominatorTreeBase<MachineBasicBlock>(false); } diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 666120f032c60..017170076cebb 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -397,7 +397,6 @@ void MachineFunction::viewCFGOnly() const /// create a corresponding virtual register for it. unsigned MachineFunction::addLiveIn(unsigned PReg, const TargetRegisterClass *RC) { - assert(RC->contains(PReg) && "Not the correct regclass!"); MachineRegisterInfo &MRI = getRegInfo(); unsigned VReg = MRI.getLiveInVirtReg(PReg); if (VReg) { @@ -447,7 +446,7 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, unsigned StackAlign = TFI.getStackAlignment(); unsigned Align = MinAlign(SPOffset, StackAlign); Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, - /*isSS*/false)); + /*isSS*/false, false)); return -++NumFixedObjects; } diff --git a/lib/CodeGen/MachineFunctionAnalysis.cpp b/lib/CodeGen/MachineFunctionAnalysis.cpp index 07a0f45c0f481..4f84b952e0612 100644 --- a/lib/CodeGen/MachineFunctionAnalysis.cpp +++ b/lib/CodeGen/MachineFunctionAnalysis.cpp @@ -20,14 +20,14 @@ using namespace llvm; // a default constructor. 
static PassInfo X("Machine Function Analysis", "machine-function-analysis", - intptr_t(&MachineFunctionAnalysis::ID), 0, + &MachineFunctionAnalysis::ID, 0, /*CFGOnly=*/false, /*is_analysis=*/true); char MachineFunctionAnalysis::ID = 0; MachineFunctionAnalysis::MachineFunctionAnalysis(const TargetMachine &tm, CodeGenOpt::Level OL) : - FunctionPass(&ID), TM(tm), OptLevel(OL), MF(0) { + FunctionPass(ID), TM(tm), OptLevel(OL), MF(0) { } MachineFunctionAnalysis::~MachineFunctionAnalysis() { diff --git a/lib/CodeGen/MachineFunctionPrinterPass.cpp b/lib/CodeGen/MachineFunctionPrinterPass.cpp index 547c4febc8dae..2aaa798a02c19 100644 --- a/lib/CodeGen/MachineFunctionPrinterPass.cpp +++ b/lib/CodeGen/MachineFunctionPrinterPass.cpp @@ -29,7 +29,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass { const std::string Banner; MachineFunctionPrinterPass(raw_ostream &os, const std::string &banner) - : MachineFunctionPass(&ID), OS(os), Banner(banner) {} + : MachineFunctionPass(ID), OS(os), Banner(banner) {} const char *getPassName() const { return "MachineFunction Printer"; } diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index 6b2e98549c718..446e461d5460f 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -1236,12 +1236,18 @@ static void printDebugLoc(DebugLoc DL, const MachineFunction *MF, void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const { // We can be a bit tidier if we know the TargetMachine and/or MachineFunction. const MachineFunction *MF = 0; + const MachineRegisterInfo *MRI = 0; if (const MachineBasicBlock *MBB = getParent()) { MF = MBB->getParent(); if (!TM && MF) TM = &MF->getTarget(); + if (MF) + MRI = &MF->getRegInfo(); } + // Save a list of virtual registers. + SmallVector<unsigned, 8> VirtRegs; + // Print explicitly defined operands on the left of an assignment syntax. unsigned StartOp = 0, e = getNumOperands(); for (; StartOp < e && getOperand(StartOp).isReg() && @@ -1250,6 +1256,9 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const { ++StartOp) { if (StartOp != 0) OS << ", "; getOperand(StartOp).print(OS, TM); + unsigned Reg = getOperand(StartOp).getReg(); + if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) + VirtRegs.push_back(Reg); } if (StartOp != 0) @@ -1264,6 +1273,10 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const { for (unsigned i = StartOp, e = getNumOperands(); i != e; ++i) { const MachineOperand &MO = getOperand(i); + if (MO.isReg() && MO.getReg() && + TargetRegisterInfo::isVirtualRegister(MO.getReg())) + VirtRegs.push_back(MO.getReg()); + // Omit call-clobbered registers which aren't used anywhere. This makes // call instructions much less noisy on targets where calls clobber lots // of registers. Don't rely on MO.isDead() because we may be called before @@ -1325,11 +1338,29 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const { for (mmo_iterator i = memoperands_begin(), e = memoperands_end(); i != e; ++i) { OS << **i; - if (next(i) != e) + if (llvm::next(i) != e) OS << " "; } } + // Print the regclass of any virtual registers encountered. 
+ if (MRI && !VirtRegs.empty()) { + if (!HaveSemi) OS << ";"; HaveSemi = true; + for (unsigned i = 0; i != VirtRegs.size(); ++i) { + const TargetRegisterClass *RC = MRI->getRegClass(VirtRegs[i]); + OS << " " << RC->getName() << ":%reg" << VirtRegs[i]; + for (unsigned j = i+1; j != VirtRegs.size();) { + if (MRI->getRegClass(VirtRegs[j]) != RC) { + ++j; + continue; + } + if (VirtRegs[i] != VirtRegs[j]) + OS << "," << VirtRegs[j]; + VirtRegs.erase(VirtRegs.begin()+j); + } + } + } + if (!debugLoc.isUnknown() && MF) { if (!HaveSemi) OS << ";"; OS << " dbg:"; diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 4c054f51f3a8a..1a74b747e9f2a 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -68,16 +68,16 @@ namespace { BitVector AllocatableSet; - // For each opcode, keep a list of potentail CSE instructions. + // For each opcode, keep a list of potential CSE instructions. DenseMap<unsigned, std::vector<const MachineInstr*> > CSEMap; public: static char ID; // Pass identification, replacement for typeid MachineLICM() : - MachineFunctionPass(&ID), PreRegAlloc(true) {} + MachineFunctionPass(ID), PreRegAlloc(true) {} explicit MachineLICM(bool PreRA) : - MachineFunctionPass(&ID), PreRegAlloc(PreRA) {} + MachineFunctionPass(ID), PreRegAlloc(PreRA) {} virtual bool runOnMachineFunction(MachineFunction &MF); @@ -189,8 +189,8 @@ namespace { } // end anonymous namespace char MachineLICM::ID = 0; -static RegisterPass<MachineLICM> -X("machinelicm", "Machine Loop Invariant Code Motion"); +INITIALIZE_PASS(MachineLICM, "machinelicm", + "Machine Loop Invariant Code Motion", false, false); FunctionPass *llvm::createMachineLICMPass(bool PreRegAlloc) { return new MachineLICM(PreRegAlloc); @@ -488,9 +488,14 @@ void MachineLICM::HoistRegion(MachineDomTreeNode *N) { MII = NextMII; } - const std::vector<MachineDomTreeNode*> &Children = N->getChildren(); - for (unsigned I = 0, E = Children.size(); I != E; ++I) - HoistRegion(Children[I]); + // Don't hoist things out of a large switch statement. This often causes + // code to be hoisted that wasn't going to be executed, and increases + // register pressure in a situation where it's likely to matter. + if (BB->succ_size() < 25) { + const std::vector<MachineDomTreeNode*> &Children = N->getChildren(); + for (unsigned I = 0, E = Children.size(); I != E; ++I) + HoistRegion(Children[I]); + } } /// IsLICMCandidate - Returns true if the instruction may be a suitable diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp index 269538b31d0b9..bca4b0c28985f 100644 --- a/lib/CodeGen/MachineLoopInfo.cpp +++ b/lib/CodeGen/MachineLoopInfo.cpp @@ -30,10 +30,10 @@ TEMPLATE_INSTANTIATION(MLIB); } char MachineLoopInfo::ID = 0; -static RegisterPass<MachineLoopInfo> -X("machine-loops", "Machine Natural Loop Construction", true); +INITIALIZE_PASS(MachineLoopInfo, "machine-loops", + "Machine Natural Loop Construction", true, true); -const PassInfo *const llvm::MachineLoopInfoID = &X; +char &llvm::MachineLoopInfoID = MachineLoopInfo::ID; bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) { releaseMemory(); diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp index 15778b46fe0a3..b647a4dcc5308 100644 --- a/lib/CodeGen/MachineModuleInfo.cpp +++ b/lib/CodeGen/MachineModuleInfo.cpp @@ -28,8 +28,8 @@ using namespace llvm; using namespace llvm::dwarf; // Handle the Pass registration stuff necessary to use TargetData's. 
-static RegisterPass<MachineModuleInfo> -X("machinemoduleinfo", "Machine Module Information"); +INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo", + "Machine Module Information", false, false); char MachineModuleInfo::ID = 0; // Out of line virtual method. @@ -254,7 +254,7 @@ void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) { //===----------------------------------------------------------------------===// MachineModuleInfo::MachineModuleInfo(const MCAsmInfo &MAI) -: ImmutablePass(&ID), Context(MAI), +: ImmutablePass(ID), Context(MAI), ObjFileMMI(0), CurCallSite(0), CallsEHReturn(0), CallsUnwindInit(0), DbgInfoAvailable(false){ // Always emit some info, by default "no personality" info. @@ -264,7 +264,7 @@ MachineModuleInfo::MachineModuleInfo(const MCAsmInfo &MAI) } MachineModuleInfo::MachineModuleInfo() -: ImmutablePass(&ID), Context(*(MCAsmInfo*)0) { +: ImmutablePass(ID), Context(*(MCAsmInfo*)0) { assert(0 && "This MachineModuleInfo constructor should never be called, MMI " "should always be explicitly constructed by LLVMTargetMachine"); abort(); @@ -579,10 +579,3 @@ namespace { } }; } - -MachineModuleInfo::VariableDbgInfoMapTy & -MachineModuleInfo::getVariableDbgInfo() { - std::stable_sort(VariableDbgInfo.begin(), VariableDbgInfo.end(), - VariableDebugSorter()); - return VariableDbgInfo; -} diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp index 61334fc1790a1..c8f8fafe227e6 100644 --- a/lib/CodeGen/MachineSink.cpp +++ b/lib/CodeGen/MachineSink.cpp @@ -26,11 +26,21 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; -STATISTIC(NumSunk, "Number of machine instructions sunk"); +static cl::opt<bool> +SplitEdges("machine-sink-split", + cl::desc("Split critical edges during machine sinking"), + cl::init(false), cl::Hidden); +static cl::opt<unsigned> +SplitLimit("split-limit", + cl::init(~0u), cl::Hidden); + +STATISTIC(NumSunk, "Number of machine instructions sunk"); +STATISTIC(NumSplit, "Number of critical edges split"); namespace { class MachineSinking : public MachineFunctionPass { @@ -44,7 +54,7 @@ namespace { public: static char ID; // Pass identification - MachineSinking() : MachineFunctionPass(&ID) {} + MachineSinking() : MachineFunctionPass(ID) {} virtual bool runOnMachineFunction(MachineFunction &MF); @@ -59,21 +69,28 @@ namespace { } private: bool ProcessBlock(MachineBasicBlock &MBB); + MachineBasicBlock *SplitCriticalEdge(MachineBasicBlock *From, + MachineBasicBlock *To); bool SinkInstruction(MachineInstr *MI, bool &SawStore); - bool AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB) const; + bool AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB, + MachineBasicBlock *DefMBB, bool &LocalUse) const; }; } // end anonymous namespace char MachineSinking::ID = 0; -static RegisterPass<MachineSinking> -X("machine-sink", "Machine code sinking"); +INITIALIZE_PASS(MachineSinking, "machine-sink", + "Machine code sinking", false, false); FunctionPass *llvm::createMachineSinkingPass() { return new MachineSinking(); } /// AllUsesDominatedByBlock - Return true if all uses of the specified register -/// occur in blocks dominated by the specified block. +/// occur in blocks dominated by the specified block. If any use is in the +/// definition block, then return false since it is never legal to move def +/// after uses. 
bool MachineSinking::AllUsesDominatedByBlock(unsigned Reg, - MachineBasicBlock *MBB) const { + MachineBasicBlock *MBB, + MachineBasicBlock *DefMBB, + bool &LocalUse) const { assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Only makes sense for vregs"); // Ignoring debug uses is necessary so debug info doesn't affect the code. @@ -91,6 +108,9 @@ bool MachineSinking::AllUsesDominatedByBlock(unsigned Reg, // PHI nodes use the operand in the predecessor block, not the block with // the PHI. UseBlock = UseInst->getOperand(I.getOperandNo()+1).getMBB(); + } else if (UseBlock == DefMBB) { + LocalUse = true; + return false; } // Check that it dominates. @@ -166,6 +186,66 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) { return MadeChange; } +MachineBasicBlock *MachineSinking::SplitCriticalEdge(MachineBasicBlock *FromBB, + MachineBasicBlock *ToBB) { + // Avoid breaking back edge. From == To means backedge for single BB loop. + if (!SplitEdges || NumSplit == SplitLimit || FromBB == ToBB) + return 0; + + // Check for more "complex" loops. + if (LI->getLoopFor(FromBB) != LI->getLoopFor(ToBB) || + !LI->isLoopHeader(ToBB)) { + // It's not always legal to break critical edges and sink the computation + // to the edge. + // + // BB#1: + // v1024 + // Beq BB#3 + // <fallthrough> + // BB#2: + // ... no uses of v1024 + // <fallthrough> + // BB#3: + // ... + // = v1024 + // + // If BB#1 -> BB#3 edge is broken and computation of v1024 is inserted: + // + // BB#1: + // ... + // Bne BB#2 + // BB#4: + // v1024 = + // B BB#3 + // BB#2: + // ... no uses of v1024 + // <fallthrough> + // BB#3: + // ... + // = v1024 + // + // This is incorrect since v1024 is not computed along the BB#1->BB#2->BB#3 + // flow. We need to ensure the new basic block where the computation is + // sunk to dominates all the uses. + // It's only legal to break critical edge and sink the computation to the + // new block if all the predecessors of "To", except for "From", are + // not dominated by "From". Given SSA property, this means these + // predecessors are dominated by "To". + for (MachineBasicBlock::pred_iterator PI = ToBB->pred_begin(), + E = ToBB->pred_end(); PI != E; ++PI) { + if (*PI == FromBB) + continue; + if (!DT->dominates(ToBB, *PI)) + return 0; + } + + // FIXME: Determine if it's cost effective to break this edge. + return FromBB->SplitCriticalEdge(ToBB, this); + } + + return 0; +} + /// SinkInstruction - Determine whether it is safe to sink the specified machine /// instruction out of its current block into a successor. bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { @@ -246,7 +326,8 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { if (SuccToSinkTo) { // If a previous operand picked a block to sink to, then this operand // must be sinkable to the same block. - if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo)) + bool LocalUse = false; + if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo, ParentBlock, LocalUse)) return false; continue; @@ -256,10 +337,14 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // we should sink to. for (MachineBasicBlock::succ_iterator SI = ParentBlock->succ_begin(), E = ParentBlock->succ_end(); SI != E; ++SI) { - if (AllUsesDominatedByBlock(Reg, *SI)) { + bool LocalUse = false; + if (AllUsesDominatedByBlock(Reg, *SI, ParentBlock, LocalUse)) { SuccToSinkTo = *SI; break; } + if (LocalUse) + // Def is used locally, it's never safe to move this def. 
+ return false; } // If we couldn't find a block to sink to, ignore this instruction. @@ -303,27 +388,44 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { if (SuccToSinkTo->pred_size() > 1) { // We cannot sink a load across a critical edge - there may be stores in // other code paths. + bool TryBreak = false; bool store = true; if (!MI->isSafeToMove(TII, AA, store)) { - DEBUG(dbgs() << " *** PUNTING: Wont sink load along critical edge.\n"); - return false; + DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n"); + TryBreak = true; } // We don't want to sink across a critical edge if we don't dominate the // successor. We could be introducing calculations to new code paths. - if (!DT->dominates(ParentBlock, SuccToSinkTo)) { - DEBUG(dbgs() << " *** PUNTING: Critical edge found\n"); - return false; + if (!TryBreak && !DT->dominates(ParentBlock, SuccToSinkTo)) { + DEBUG(dbgs() << " *** NOTE: Critical edge found\n"); + TryBreak = true; } // Don't sink instructions into a loop. - if (LI->isLoopHeader(SuccToSinkTo)) { - DEBUG(dbgs() << " *** PUNTING: Loop header found\n"); - return false; + if (!TryBreak && LI->isLoopHeader(SuccToSinkTo)) { + DEBUG(dbgs() << " *** NOTE: Loop header found\n"); + TryBreak = true; } // Otherwise we are OK with sinking along a critical edge. - DEBUG(dbgs() << "Sinking along critical edge.\n"); + if (!TryBreak) + DEBUG(dbgs() << "Sinking along critical edge.\n"); + else { + MachineBasicBlock *NewSucc = SplitCriticalEdge(ParentBlock, SuccToSinkTo); + if (!NewSucc) { + DEBUG(dbgs() << + " *** PUNTING: Not legal or profitable to break critical edge\n"); + return false; + } else { + DEBUG(dbgs() << " *** Splitting critical edge:" + " BB#" << ParentBlock->getNumber() + << " -- BB#" << NewSucc->getNumber() + << " -- BB#" << SuccToSinkTo->getNumber() << '\n'); + SuccToSinkTo = NewSucc; + ++NumSplit; + } + } } // Determine where to insert into. Skip phi nodes. diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 2297c908b1e00..1e88562935eac 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -1,4 +1,4 @@ -//===-- MachineVerifier.cpp - Machine Code Verifier -------------*- C++ -*-===// +//===-- MachineVerifier.cpp - Machine Code Verifier -----------------------===// // // The LLVM Compiler Infrastructure // @@ -24,6 +24,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Function.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -44,19 +45,14 @@ using namespace llvm; namespace { struct MachineVerifier { - MachineVerifier(Pass *pass, bool allowDoubleDefs) : + MachineVerifier(Pass *pass) : PASS(pass), - allowVirtDoubleDefs(allowDoubleDefs), - allowPhysDoubleDefs(true), OutFileName(getenv("LLVM_VERIFY_MACHINEINSTRS")) {} bool runOnMachineFunction(MachineFunction &MF); Pass *const PASS; - const bool allowVirtDoubleDefs; - const bool allowPhysDoubleDefs; - const char *const OutFileName; raw_ostream *OS; const MachineFunction *MF; @@ -91,10 +87,6 @@ namespace { // defined. Map value is the user. RegMap vregsLiveIn; - // Vregs that must be dead in because they are defined without being - // killed first. Map value is the defining instruction. - RegMap vregsDeadIn; - // Regs killed in MBB. They may be defined again, and will then be in both // regsKilled and regsLiveOut. 
RegSet regsKilled; @@ -175,6 +167,7 @@ namespace { // Analysis information if available LiveVariables *LiveVars; + const LiveIntervals *LiveInts; void visitMachineFunctionBefore(); void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB); @@ -195,15 +188,14 @@ namespace { void calcRegsRequired(); void verifyLiveVariables(); + void verifyLiveIntervals(); }; struct MachineVerifierPass : public MachineFunctionPass { static char ID; // Pass ID, replacement for typeid - bool AllowDoubleDefs; - explicit MachineVerifierPass(bool allowDoubleDefs = false) - : MachineFunctionPass(&ID), - AllowDoubleDefs(allowDoubleDefs) {} + MachineVerifierPass() + : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); @@ -211,7 +203,7 @@ namespace { } bool runOnMachineFunction(MachineFunction &MF) { - MF.verify(this, AllowDoubleDefs); + MF.verify(this); return false; } }; @@ -219,17 +211,15 @@ namespace { } char MachineVerifierPass::ID = 0; -static RegisterPass<MachineVerifierPass> -MachineVer("machineverifier", "Verify generated machine code"); -static const PassInfo *const MachineVerifyID = &MachineVer; +INITIALIZE_PASS(MachineVerifierPass, "machineverifier", + "Verify generated machine code", false, false); -FunctionPass *llvm::createMachineVerifierPass(bool allowPhysDoubleDefs) { - return new MachineVerifierPass(allowPhysDoubleDefs); +FunctionPass *llvm::createMachineVerifierPass() { + return new MachineVerifierPass(); } -void MachineFunction::verify(Pass *p, bool allowDoubleDefs) const { - MachineVerifier(p, allowDoubleDefs) - .runOnMachineFunction(const_cast<MachineFunction&>(*this)); +void MachineFunction::verify(Pass *p) const { + MachineVerifier(p).runOnMachineFunction(const_cast<MachineFunction&>(*this)); } bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { @@ -255,10 +245,13 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { TRI = TM->getRegisterInfo(); MRI = &MF.getRegInfo(); + LiveVars = NULL; + LiveInts = NULL; if (PASS) { - LiveVars = PASS->getAnalysisIfAvailable<LiveVariables>(); - } else { - LiveVars = NULL; + LiveInts = PASS->getAnalysisIfAvailable<LiveIntervals>(); + // We don't want to verify LiveVariables if LiveIntervals is available. + if (!LiveInts) + LiveVars = PASS->getAnalysisIfAvailable<LiveVariables>(); } visitMachineFunctionBefore(); @@ -512,6 +505,20 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { if ((*I)->isStore() && !TI.mayStore()) report("Missing mayStore flag", MI); } + + // Debug values must not have a slot index. + // Other instructions must have one. + if (LiveInts) { + bool mapped = !LiveInts->isNotInMIMap(MI); + if (MI->isDebugValue()) { + if (mapped) + report("Debug instruction has a slot index", MI); + } else { + if (!mapped) + report("Missing slot index", MI); + } + } + } void @@ -570,15 +577,30 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } else isKill = MO->isKill(); - if (isKill) { + if (isKill) addRegWithSubRegs(regsKilled, Reg); - // Check that LiveVars knows this kill - if (LiveVars && TargetRegisterInfo::isVirtualRegister(Reg)) { - LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg); - if (std::find(VI.Kills.begin(), - VI.Kills.end(), MI) == VI.Kills.end()) - report("Kill missing from LiveVariables", MO, MONum); + // Check that LiveVars knows this kill. 
+ if (LiveVars && TargetRegisterInfo::isVirtualRegister(Reg) && + MO->isKill()) { + LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg); + if (std::find(VI.Kills.begin(), + VI.Kills.end(), MI) == VI.Kills.end()) + report("Kill missing from LiveVariables", MO, MONum); + } + + // Check LiveInts liveness and kill. + if (LiveInts && !LiveInts->isNotInMIMap(MI)) { + SlotIndex UseIdx = LiveInts->getInstructionIndex(MI).getUseIndex(); + if (LiveInts->hasInterval(Reg)) { + const LiveInterval &LI = LiveInts->getInterval(Reg); + if (!LI.liveAt(UseIdx)) { + report("No live range at use", MO, MONum); + *OS << UseIdx << " is not live in " << LI << '\n'; + } + // TODO: Verify isKill == LI.killedAt. + } else if (TargetRegisterInfo::isVirtualRegister(Reg)) { + report("Virtual register has no Live interval", MO, MONum); } } @@ -607,6 +629,28 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { addRegWithSubRegs(regsDead, Reg); else addRegWithSubRegs(regsDefined, Reg); + + // Check LiveInts for a live range, but only for virtual registers. + if (LiveInts && TargetRegisterInfo::isVirtualRegister(Reg) && + !LiveInts->isNotInMIMap(MI)) { + SlotIndex DefIdx = LiveInts->getInstructionIndex(MI).getDefIndex(); + if (LiveInts->hasInterval(Reg)) { + const LiveInterval &LI = LiveInts->getInterval(Reg); + if (const LiveRange *LR = LI.getLiveRangeContaining(DefIdx)) { + assert(LR->valno && "NULL valno is not allowed"); + if (LR->valno->def != DefIdx) { + report("Inconsistent valno->def", MO, MONum); + *OS << "Valno " << LR->valno->id << " is not defined at " + << DefIdx << " in " << LI << '\n'; + } + } else { + report("No live range at def", MO, MONum); + *OS << DefIdx << " is not live in " << LI << '\n'; + } + } else { + report("Virtual register has no Live interval", MO, MONum); + } + } } // Check register classes. @@ -670,40 +714,9 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { void MachineVerifier::visitMachineInstrAfter(const MachineInstr *MI) { BBInfo &MInfo = MBBInfoMap[MI->getParent()]; set_union(MInfo.regsKilled, regsKilled); - set_subtract(regsLive, regsKilled); - regsKilled.clear(); - - // Verify that both <def> and <def,dead> operands refer to dead registers. - RegVector defs(regsDefined); - defs.append(regsDead.begin(), regsDead.end()); - - for (RegVector::const_iterator I = defs.begin(), E = defs.end(); - I != E; ++I) { - if (regsLive.count(*I)) { - if (TargetRegisterInfo::isPhysicalRegister(*I)) { - if (!allowPhysDoubleDefs && !isReserved(*I) && - !regsLiveInButUnused.count(*I)) { - report("Redefining a live physical register", MI); - *OS << "Register " << TRI->getName(*I) - << " was defined but already live.\n"; - } - } else { - if (!allowVirtDoubleDefs) { - report("Redefining a live virtual register", MI); - *OS << "Virtual register %reg" << *I - << " was defined but already live.\n"; - } - } - } else if (TargetRegisterInfo::isVirtualRegister(*I) && - !MInfo.regsKilled.count(*I)) { - // Virtual register defined without being killed first must be dead on - // entry. 
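// ---- [Editorial sketch; not part of the patch] -------------------------------
// The new LiveInts checks above lean on the SlotIndex convention: every mapped
// instruction gets an index, a reader must find its register live at the use
// slot, and a writer's live range must carry a value number defined exactly at
// the def slot. Condensed into one helper (assuming the 2.8-era LiveIntervals
// API used in the hunk; returns a diagnostic string, or null if consistent):
//
// static const char *checkOperandLiveness(const LiveIntervals *LIS,
//                                         const MachineInstr *MI,
//                                         unsigned Reg, bool IsDef) {
//   if (LIS->isNotInMIMap(MI) || !LIS->hasInterval(Reg))
//     return 0; // Nothing to check against.
//   const LiveInterval &LI = LIS->getInterval(Reg);
//   SlotIndex Idx = LIS->getInstructionIndex(MI);
//   if (!IsDef)
//     return LI.liveAt(Idx.getUseIndex()) ? 0 : "no live range at use";
//   const LiveRange *LR = LI.getLiveRangeContaining(Idx.getDefIndex());
//   if (!LR)
//     return "no live range at def";
//   return LR->valno->def == Idx.getDefIndex() ? 0 : "inconsistent valno->def";
// }
// ---- [End editorial sketch] ----------------------------------------------------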
- MInfo.vregsDeadIn.insert(std::make_pair(*I, MI)); - } - } - - set_subtract(regsLive, regsDead); regsDead.clear(); - set_union(regsLive, regsDefined); regsDefined.clear(); + set_subtract(regsLive, regsKilled); regsKilled.clear(); + set_subtract(regsLive, regsDead); regsDead.clear(); + set_union(regsLive, regsDefined); regsDefined.clear(); } void @@ -828,35 +841,15 @@ void MachineVerifier::visitMachineFunctionAfter() { continue; checkPHIOps(MFI); - - // Verify dead-in virtual registers. - if (!allowVirtDoubleDefs) { - for (MachineBasicBlock::const_pred_iterator PrI = MFI->pred_begin(), - PrE = MFI->pred_end(); PrI != PrE; ++PrI) { - BBInfo &PrInfo = MBBInfoMap[*PrI]; - if (!PrInfo.reachable) - continue; - - for (RegMap::iterator I = MInfo.vregsDeadIn.begin(), - E = MInfo.vregsDeadIn.end(); I != E; ++I) { - // DeadIn register must be in neither regsLiveOut or vregsPassed of - // any predecessor. - if (PrInfo.isLiveOut(I->first)) { - report("Live-in virtual register redefined", I->second); - *OS << "Register %reg" << I->first - << " was live-out from predecessor MBB #" - << (*PrI)->getNumber() << ".\n"; - } - } - } - } } - // Now check LiveVariables info if available - if (LiveVars) { + // Now check liveness info if available + if (LiveVars || LiveInts) calcRegsRequired(); + if (LiveVars) verifyLiveVariables(); - } + if (LiveInts) + verifyLiveIntervals(); } void MachineVerifier::verifyLiveVariables() { @@ -886,4 +879,55 @@ void MachineVerifier::verifyLiveVariables() { } } +void MachineVerifier::verifyLiveIntervals() { + assert(LiveInts && "Don't call verifyLiveIntervals without LiveInts"); + for (LiveIntervals::const_iterator LVI = LiveInts->begin(), + LVE = LiveInts->end(); LVI != LVE; ++LVI) { + const LiveInterval &LI = *LVI->second; + assert(LVI->first == LI.reg && "Invalid reg to interval mapping"); + + for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); + I!=E; ++I) { + VNInfo *VNI = *I; + const LiveRange *DefLR = LI.getLiveRangeContaining(VNI->def); + + if (!DefLR) { + if (!VNI->isUnused()) { + report("Valno not live at def and not marked unused", MF); + *OS << "Valno #" << VNI->id << " in " << LI << '\n'; + } + continue; + } + + if (VNI->isUnused()) + continue; + + if (DefLR->valno != VNI) { + report("Live range at def has different valno", MF); + DefLR->print(*OS); + *OS << " should use valno #" << VNI->id << " in " << LI << '\n'; + } + + } + + for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I!=E; ++I) { + const LiveRange &LR = *I; + assert(LR.valno && "Live range has no valno"); + + if (LR.valno->id >= LI.getNumValNums() || + LR.valno != LI.getValNumInfo(LR.valno->id)) { + report("Foreign valno in live range", MF); + LR.print(*OS); + *OS << " has a valno not in " << LI << '\n'; + } + + if (LR.valno->isUnused()) { + report("Live range valno is marked unused", MF); + LR.print(*OS); + *OS << " in " << LI << '\n'; + } + + } + } +} diff --git a/lib/CodeGen/OptimizeExts.cpp b/lib/CodeGen/OptimizeExts.cpp deleted file mode 100644 index dcdc243e5db34..0000000000000 --- a/lib/CodeGen/OptimizeExts.cpp +++ /dev/null @@ -1,220 +0,0 @@ -//===-- OptimizeExts.cpp - Optimize sign / zero extension instrs -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass performs optimization of sign / zero extension instructions. 
It -// may be extended to handle other instructions of similar property. -// -// On some targets, some instructions, e.g. X86 sign / zero extension, may -// leave the source value in the lower part of the result. This pass will -// replace (some) uses of the pre-extension value with uses of the sub-register -// of the results. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ext-opt" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" -using namespace llvm; - -static cl::opt<bool> Aggressive("aggressive-ext-opt", cl::Hidden, - cl::desc("Aggressive extension optimization")); - -STATISTIC(NumReuse, "Number of extension results reused"); - -namespace { - class OptimizeExts : public MachineFunctionPass { - const TargetMachine *TM; - const TargetInstrInfo *TII; - MachineRegisterInfo *MRI; - MachineDominatorTree *DT; // Machine dominator tree - - public: - static char ID; // Pass identification - OptimizeExts() : MachineFunctionPass(&ID) {} - - virtual bool runOnMachineFunction(MachineFunction &MF); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - if (Aggressive) { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); - } - } - - private: - bool OptimizeInstr(MachineInstr *MI, MachineBasicBlock *MBB, - SmallPtrSet<MachineInstr*, 8> &LocalMIs); - }; -} - -char OptimizeExts::ID = 0; -static RegisterPass<OptimizeExts> -X("opt-exts", "Optimize sign / zero extensions"); - -FunctionPass *llvm::createOptimizeExtsPass() { return new OptimizeExts(); } - -/// OptimizeInstr - If instruction is a copy-like instruction, i.e. it reads -/// a single register and writes a single register and it does not modify -/// the source, and if the source value is preserved as a sub-register of -/// the result, then replace all reachable uses of the source with the subreg -/// of the result. -/// Do not generate an EXTRACT that is used only in a debug use, as this -/// changes the code. Since this code does not currently share EXTRACTs, just -/// ignore all debug uses. -bool OptimizeExts::OptimizeInstr(MachineInstr *MI, MachineBasicBlock *MBB, - SmallPtrSet<MachineInstr*, 8> &LocalMIs) { - bool Changed = false; - LocalMIs.insert(MI); - - unsigned SrcReg, DstReg, SubIdx; - if (TII->isCoalescableExtInstr(*MI, SrcReg, DstReg, SubIdx)) { - if (TargetRegisterInfo::isPhysicalRegister(DstReg) || - TargetRegisterInfo::isPhysicalRegister(SrcReg)) - return false; - - MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(SrcReg); - if (++UI == MRI->use_nodbg_end()) - // No other uses. - return false; - - // Ok, the source has other uses. See if we can replace the other uses - // with use of the result of the extension. - SmallPtrSet<MachineBasicBlock*, 4> ReachedBBs; - UI = MRI->use_nodbg_begin(DstReg); - for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end(); - UI != UE; ++UI) - ReachedBBs.insert(UI->getParent()); - - bool ExtendLife = true; - // Uses that are in the same BB of uses of the result of the instruction. - SmallVector<MachineOperand*, 8> Uses; - // Uses that the result of the instruction can reach. 
- SmallVector<MachineOperand*, 8> ExtendedUses; - - UI = MRI->use_nodbg_begin(SrcReg); - for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end(); - UI != UE; ++UI) { - MachineOperand &UseMO = UI.getOperand(); - MachineInstr *UseMI = &*UI; - if (UseMI == MI) - continue; - if (UseMI->isPHI()) { - ExtendLife = false; - continue; - } - - // It's an error to translate this: - // - // %reg1025 = <sext> %reg1024 - // ... - // %reg1026 = SUBREG_TO_REG 0, %reg1024, 4 - // - // into this: - // - // %reg1025 = <sext> %reg1024 - // ... - // %reg1027 = COPY %reg1025:4 - // %reg1026 = SUBREG_TO_REG 0, %reg1027, 4 - // - // The problem here is that SUBREG_TO_REG is there to assert that an - // implicit zext occurs. It doesn't insert a zext instruction. If we allow - // the COPY here, it will give us the value after the <sext>, - // not the original value of %reg1024 before <sext>. - if (UseMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) - continue; - - MachineBasicBlock *UseMBB = UseMI->getParent(); - if (UseMBB == MBB) { - // Local uses that come after the extension. - if (!LocalMIs.count(UseMI)) - Uses.push_back(&UseMO); - } else if (ReachedBBs.count(UseMBB)) - // Non-local uses where the result of extension is used. Always - // replace these unless it's a PHI. - Uses.push_back(&UseMO); - else if (Aggressive && DT->dominates(MBB, UseMBB)) - // We may want to extend live range of the extension result in order - // to replace these uses. - ExtendedUses.push_back(&UseMO); - else { - // Both will be live out of the def MBB anyway. Don't extend live - // range of the extension result. - ExtendLife = false; - break; - } - } - - if (ExtendLife && !ExtendedUses.empty()) - // Ok, we'll extend the liveness of the extension result. - std::copy(ExtendedUses.begin(), ExtendedUses.end(), - std::back_inserter(Uses)); - - // Now replace all uses. - if (!Uses.empty()) { - SmallPtrSet<MachineBasicBlock*, 4> PHIBBs; - // Look for PHI uses of the extended result, we don't want to extend the - // liveness of a PHI input. It breaks all kinds of assumptions down - // stream. A PHI use is expected to be the kill of its source values. - UI = MRI->use_nodbg_begin(DstReg); - for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end(); - UI != UE; ++UI) - if (UI->isPHI()) - PHIBBs.insert(UI->getParent()); - - const TargetRegisterClass *RC = MRI->getRegClass(SrcReg); - for (unsigned i = 0, e = Uses.size(); i != e; ++i) { - MachineOperand *UseMO = Uses[i]; - MachineInstr *UseMI = UseMO->getParent(); - MachineBasicBlock *UseMBB = UseMI->getParent(); - if (PHIBBs.count(UseMBB)) - continue; - unsigned NewVR = MRI->createVirtualRegister(RC); - BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(), - TII->get(TargetOpcode::COPY), NewVR) - .addReg(DstReg, 0, SubIdx); - UseMO->setReg(NewVR); - ++NumReuse; - Changed = true; - } - } - } - - return Changed; -} - -bool OptimizeExts::runOnMachineFunction(MachineFunction &MF) { - TM = &MF.getTarget(); - TII = TM->getInstrInfo(); - MRI = &MF.getRegInfo(); - DT = Aggressive ? 
&getAnalysis<MachineDominatorTree>() : 0; - - bool Changed = false; - - SmallPtrSet<MachineInstr*, 8> LocalMIs; - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *MBB = &*I; - LocalMIs.clear(); - for (MachineBasicBlock::iterator MII = I->begin(), ME = I->end(); MII != ME; - ++MII) { - MachineInstr *MI = &*MII; - Changed |= OptimizeInstr(MI, MBB, LocalMIs); - } - } - - return Changed; -} diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp index 1613fe21e42dd..edb4eea71b8a7 100644 --- a/lib/CodeGen/OptimizePHIs.cpp +++ b/lib/CodeGen/OptimizePHIs.cpp @@ -33,7 +33,7 @@ namespace { public: static char ID; // Pass identification - OptimizePHIs() : MachineFunctionPass(&ID) {} + OptimizePHIs() : MachineFunctionPass(ID) {} virtual bool runOnMachineFunction(MachineFunction &MF); @@ -54,8 +54,8 @@ namespace { } char OptimizePHIs::ID = 0; -static RegisterPass<OptimizePHIs> -X("opt-phis", "Optimize machine instruction PHIs"); +INITIALIZE_PASS(OptimizePHIs, "opt-phis", + "Optimize machine instruction PHIs", false, false); FunctionPass *llvm::createOptimizePHIsPass() { return new OptimizePHIs(); } @@ -101,16 +101,10 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI, MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); // Skip over register-to-register moves. - unsigned MvSrcReg, MvDstReg, SrcSubIdx, DstSubIdx; - if (SrcMI && - TII->isMoveInstr(*SrcMI, MvSrcReg, MvDstReg, SrcSubIdx, DstSubIdx) && - SrcSubIdx == 0 && DstSubIdx == 0 && - TargetRegisterInfo::isVirtualRegister(MvSrcReg)) - SrcMI = MRI->getVRegDef(MvSrcReg); - else if (SrcMI && SrcMI->isCopy() && - !SrcMI->getOperand(0).getSubReg() && - !SrcMI->getOperand(1).getSubReg() && - TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg())) + if (SrcMI && SrcMI->isCopy() && + !SrcMI->getOperand(0).getSubReg() && + !SrcMI->getOperand(1).getSubReg() && + TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg())) SrcMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg()); if (!SrcMI) return false; diff --git a/lib/CodeGen/PBQP/HeuristicBase.h b/lib/CodeGen/PBQP/HeuristicBase.h index 3bb24e1cc370c..791c227f0d07e 100644 --- a/lib/CodeGen/PBQP/HeuristicBase.h +++ b/lib/CodeGen/PBQP/HeuristicBase.h @@ -173,9 +173,13 @@ namespace PBQP { bool finished = false; while (!finished) { - if (!optimalReduce()) - if (!impl().heuristicReduce()) + if (!optimalReduce()) { + if (impl().heuristicReduce()) { + getSolver().recordRN(); + } else { finished = true; + } + } } } diff --git a/lib/CodeGen/PBQP/HeuristicSolver.h b/lib/CodeGen/PBQP/HeuristicSolver.h index 02938df007004..35514f9674785 100644 --- a/lib/CodeGen/PBQP/HeuristicSolver.h +++ b/lib/CodeGen/PBQP/HeuristicSolver.h @@ -226,6 +226,8 @@ namespace PBQP { // Nothing to do. Just push the node onto the reduction stack. pushToStack(nItr); + + s.recordR0(); } /// \brief Apply rule R1. @@ -274,6 +276,7 @@ namespace PBQP { assert(nd.getSolverDegree() == 0 && "Degree 1 with edge removed should be 0."); pushToStack(xnItr); + s.recordR1(); } /// \brief Apply rule R2. @@ -378,8 +381,14 @@ namespace PBQP { removeSolverEdge(zxeItr); pushToStack(xnItr); + s.recordR2(); } + /// \brief Record an application of the RN rule. + /// + /// For use by the HeuristicBase. 
+ void recordRN() { s.recordRN(); } + private: NodeData& getSolverNodeData(Graph::NodeItr nItr) { diff --git a/lib/CodeGen/PBQP/Heuristics/Briggs.h b/lib/CodeGen/PBQP/Heuristics/Briggs.h index 4c1ce119ed050..18eaf7c0da9b3 100644 --- a/lib/CodeGen/PBQP/Heuristics/Briggs.h +++ b/lib/CodeGen/PBQP/Heuristics/Briggs.h @@ -52,9 +52,7 @@ namespace PBQP { bool operator()(Graph::NodeItr n1Itr, Graph::NodeItr n2Itr) const { if (s->getSolverDegree(n1Itr) > s->getSolverDegree(n2Itr)) return true; - if (s->getSolverDegree(n1Itr) < s->getSolverDegree(n2Itr)) - return false; - return (&*n1Itr < &*n2Itr); + return false; } private: HeuristicSolverImpl<Briggs> *s; @@ -69,9 +67,7 @@ namespace PBQP { cost2 = g->getNodeCosts(n2Itr)[0] / s->getSolverDegree(n2Itr); if (cost1 < cost2) return true; - if (cost1 > cost2) - return false; - return (&*n1Itr < &*n2Itr); + return false; } private: diff --git a/lib/CodeGen/PBQP/Solution.h b/lib/CodeGen/PBQP/Solution.h index 294b5370afdfe..047fd04c7cb8f 100644 --- a/lib/CodeGen/PBQP/Solution.h +++ b/lib/CodeGen/PBQP/Solution.h @@ -26,15 +26,46 @@ namespace PBQP { /// To get the selection for each node in the problem use the getSelection method. class Solution { private: + typedef std::map<Graph::NodeItr, unsigned, NodeItrComparator> SelectionsMap; SelectionsMap selections; + unsigned r0Reductions, r1Reductions, r2Reductions, rNReductions; + public: /// \brief Number of nodes for which selections have been made. /// @return Number of nodes for which selections have been made. unsigned numNodes() const { return selections.size(); } + /// \brief Records a reduction via the R0 rule. Should be called from the + /// solver only. + void recordR0() { ++r0Reductions; } + + /// \brief Returns the number of R0 reductions applied to solve the problem. + unsigned numR0Reductions() const { return r0Reductions; } + + /// \brief Records a reduction via the R1 rule. Should be called from the + /// solver only. + void recordR1() { ++r1Reductions; } + + /// \brief Returns the number of R1 reductions applied to solve the problem. + unsigned numR1Reductions() const { return r1Reductions; } + + /// \brief Records a reduction via the R2 rule. Should be called from the + /// solver only. + void recordR2() { ++r2Reductions; } + + /// \brief Returns the number of R2 reductions applied to solve the problem. + unsigned numR2Reductions() const { return r2Reductions; } + + /// \brief Records a reduction via the RN rule. Should be called from the + /// solver only. + void recordRN() { ++ rNReductions; } + + /// \brief Returns the number of RN reductions applied to solve the problem. + unsigned numRNReductions() const { return rNReductions; } + /// \brief Set the selection for a given node. /// @param nItr Node iterator. /// @param selection Selection for nItr. 
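One caveat in the Solution.h hunk above: the four new reduction counters are declared without an initializer, and recordR0/recordR1/recordR2/recordRN increment them, so the first solve would read indeterminate values. A default constructor that zeroes them is the obvious fix; a minimal sketch (editorial, not something this patch contains):

  class Solution {
    // ... selections map and counters as in the hunk ...
    unsigned r0Reductions, r1Reductions, r2Reductions, rNReductions;
  public:
    Solution()
      : r0Reductions(0), r1Reductions(0), r2Reductions(0), rNReductions(0) {}
    // recordR0() and friends can now safely do ++r0Reductions etc.
  };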
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index ea6b094d7efe4..d4df4c548711e 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Function.h" @@ -37,16 +38,15 @@ STATISTIC(NumAtomic, "Number of atomic phis lowered"); STATISTIC(NumReused, "Number of reused lowered phis"); char PHIElimination::ID = 0; -static RegisterPass<PHIElimination> -X("phi-node-elimination", "Eliminate PHI nodes for register allocation"); +INITIALIZE_PASS(PHIElimination, "phi-node-elimination", + "Eliminate PHI nodes for register allocation", false, false); -const PassInfo *const llvm::PHIEliminationID = &X; +char &llvm::PHIEliminationID = PHIElimination::ID; void llvm::PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<LiveVariables>(); AU.addPreserved<MachineDominatorTree>(); - // rdar://7401784 This would be nice: - // AU.addPreservedID(MachineLoopInfoID); + AU.addPreserved<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -56,9 +56,11 @@ bool llvm::PHIElimination::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; // Split critical edges to help the coalescer - if (LiveVariables *LV = getAnalysisIfAvailable<LiveVariables>()) + if (LiveVariables *LV = getAnalysisIfAvailable<LiveVariables>()) { + MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>(); for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) - Changed |= SplitPHIEdges(MF, *I, *LV); + Changed |= SplitPHIEdges(MF, *I, *LV, MLI); + } // Populate VRegPHIUseCount analyzePHINodes(MF); @@ -179,6 +181,7 @@ void llvm::PHIElimination::LowerAtomicPHINode( unsigned NumSrcs = (MPhi->getNumOperands() - 1) / 2; unsigned DestReg = MPhi->getOperand(0).getReg(); + assert(MPhi->getOperand(0).getSubReg() == 0 && "Can't handle sub-reg PHIs"); bool isDead = MPhi->getOperand(0).isDead(); // Create a new register for the incoming PHI arguments. @@ -265,6 +268,8 @@ void llvm::PHIElimination::LowerAtomicPHINode( SmallPtrSet<MachineBasicBlock*, 8> MBBsInsertedInto; for (int i = NumSrcs - 1; i >= 0; --i) { unsigned SrcReg = MPhi->getOperand(i*2+1).getReg(); + unsigned SrcSubReg = MPhi->getOperand(i*2+1).getSubReg(); + assert(TargetRegisterInfo::isVirtualRegister(SrcReg) && "Machine PHI Operands must all be virtual registers!"); @@ -294,7 +299,7 @@ void llvm::PHIElimination::LowerAtomicPHINode( // Insert the copy. if (!reusedIncoming && IncomingReg) BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(), - TII->get(TargetOpcode::COPY), IncomingReg).addReg(SrcReg); + TII->get(TargetOpcode::COPY), IncomingReg).addReg(SrcReg, 0, SrcSubReg); // Now update live variable information if we have it. Otherwise we're done if (!LV) continue; @@ -378,10 +383,12 @@ void llvm::PHIElimination::analyzePHINodes(const MachineFunction& MF) { bool llvm::PHIElimination::SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB, - LiveVariables &LV) { + LiveVariables &LV, + MachineLoopInfo *MLI) { if (MBB.empty() || !MBB.front().isPHI() || MBB.isLandingPad()) return false; // Quick exit for basic blocks without PHIs. 
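// ---- [Editorial sketch; not part of the patch] --------------------------------
// The splitting condition added just below reads as one standalone predicate:
// split PreMBB -> MBB for register Reg only if the split can shorten a live
// range, and never on a loop backedge. A restatement, assuming the
// LiveVariables / MachineLoopInfo queries used in the hunk:
//
// static bool shouldSplitPHIEdge(unsigned Reg, MachineBasicBlock *PreMBB,
//                                MachineBasicBlock &MBB, LiveVariables &LV,
//                                MachineLoopInfo *MLI) {
//   if (PreMBB == &MBB)
//     return false;  // Single-block loop; the edge is a backedge.
//   // If Reg is live in to MBB anyway, splitting gains nothing; if it isn't
//   // live out of PreMBB, there is nothing to shorten.
//   if (LV.isLiveIn(Reg, MBB) || !LV.isLiveOut(Reg, *PreMBB))
//     return false;
//   // Never split a loop backedge: the new block would sit out of line
//   // inside the loop, which is bad for code placement.
//   if (MLI && MLI->getLoopFor(PreMBB) == MLI->getLoopFor(&MBB) &&
//       MLI->isLoopHeader(&MBB))
//     return false;
//   return true;
// }
// ---- [End editorial sketch] ------------------------------------------------------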
+  bool Changed = false;
   for (MachineBasicBlock::const_iterator BBI = MBB.begin(), BBE = MBB.end();
        BBI != BBE && BBI->isPHI(); ++BBI) {
     for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) {
@@ -390,8 +397,15 @@ bool llvm::PHIElimination::SplitPHIEdges(MachineFunction &MF,
       // We break edges when registers are live out from the predecessor block
       // (not considering PHI nodes). If the register is live in to this block
       // anyway, we would gain nothing from splitting.
-      if (!LV.isLiveIn(Reg, MBB) && LV.isLiveOut(Reg, *PreMBB))
-        PreMBB->SplitCriticalEdge(&MBB, this);
+      // Avoid splitting backedges of loops. It would introduce small
+      // out-of-line blocks into the loop, which is very bad for code
+      // placement.
+      if (PreMBB != &MBB &&
+          !LV.isLiveIn(Reg, MBB) && LV.isLiveOut(Reg, *PreMBB)) {
+        if (!MLI ||
+            !(MLI->getLoopFor(PreMBB) == MLI->getLoopFor(&MBB) &&
+              MLI->isLoopHeader(&MBB)))
+          Changed |= PreMBB->SplitCriticalEdge(&MBB, this) != 0;
+      }
     }
   }
-  return true;
+  return Changed;
diff --git a/lib/CodeGen/PHIElimination.h b/lib/CodeGen/PHIElimination.h
index 7dedf0318a8a6..45a97182e71c5 100644
--- a/lib/CodeGen/PHIElimination.h
+++ b/lib/CodeGen/PHIElimination.h
@@ -13,19 +13,21 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"

 namespace llvm {
   class LiveVariables;
+  class MachineRegisterInfo;
+  class MachineLoopInfo;

   /// Lower PHI instructions to copies.
   class PHIElimination : public MachineFunctionPass {
-    MachineRegisterInfo *MRI; // Machine register information
+    MachineRegisterInfo *MRI; // Machine register information

   public:
     static char ID; // Pass identification, replacement for typeid
-    PHIElimination() : MachineFunctionPass(&ID) {}
+    PHIElimination() : MachineFunctionPass(ID) {}

     virtual bool runOnMachineFunction(MachineFunction &Fn);

@@ -49,7 +51,7 @@ namespace llvm {
     /// Split critical edges where necessary for good coalescer performance.
     bool SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB,
-                       LiveVariables &LV);
+                       LiveVariables &LV, MachineLoopInfo *MLI);

     /// SplitCriticalEdge - Split a critical edge from A to B by
     /// inserting a new MBB. Update branches in A and PHI instructions
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
new file mode 100644
index 0000000000000..17cee46ca16c6
--- /dev/null
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -0,0 +1,287 @@
+//===-- PeepholeOptimizer.cpp - Peephole Optimizations --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Perform peephole optimizations on the machine code:
+//
+// - Optimize Extensions
+//
+//     Optimization of sign / zero extension instructions. It may be extended
+//     to handle other instructions with similar properties.
+//
+//     On some targets, some instructions, e.g. X86 sign / zero extension, may
+//     leave the source value in the lower part of the result. This
+//     optimization will replace some uses of the pre-extension value with
+//     uses of the sub-register of the result.
+//
+// - Optimize Comparisons
+//
+//     Optimization of comparison instructions. For instance, in this code:
+//
+//       sub r1, 1
+//       cmp r1, 0
+//       bz  L1
+//
+// If the "sub" instruction already sets (or could be modified to set) the
+// same flag that the "cmp" instruction sets and that "bz" uses, then we can
+// eliminate the "cmp" instruction.
+//
+//===----------------------------------------------------------------------===//

+#define DEBUG_TYPE "peephole-opt"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+// Optimize Extensions
+static cl::opt<bool>
+Aggressive("aggressive-ext-opt", cl::Hidden,
+           cl::desc("Aggressive extension optimization"));
+
+STATISTIC(NumReuse,      "Number of extension results reused");
+STATISTIC(NumEliminated, "Number of compares eliminated");
+
+namespace {
+  class PeepholeOptimizer : public MachineFunctionPass {
+    const TargetMachine   *TM;
+    const TargetInstrInfo *TII;
+    MachineRegisterInfo   *MRI;
+    MachineDominatorTree  *DT;  // Machine dominator tree
+
+  public:
+    static char ID; // Pass identification
+    PeepholeOptimizer() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+      if (Aggressive) {
+        AU.addRequired<MachineDominatorTree>();
+        AU.addPreserved<MachineDominatorTree>();
+      }
+    }
+
+  private:
+    bool OptimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
+    bool OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+                          SmallPtrSet<MachineInstr*, 8> &LocalMIs);
+  };
+}
+
+char PeepholeOptimizer::ID = 0;
+INITIALIZE_PASS(PeepholeOptimizer, "peephole-opts",
+                "Peephole Optimizations", false, false);
+
+FunctionPass *llvm::createPeepholeOptimizerPass() {
+  return new PeepholeOptimizer();
+}
+
+/// OptimizeExtInstr - If the instruction is a copy-like instruction, i.e. it
+/// reads a single register and writes a single register and it does not
+/// modify the source, and if the source value is preserved as a sub-register
+/// of the result, then replace all reachable uses of the source with the
+/// subreg of the result.
+///
+/// Do not generate an EXTRACT that is used only in a debug use, as this
+/// changes the code. Since this code does not currently share EXTRACTs, just
+/// ignore all debug uses.
+bool PeepholeOptimizer::
+OptimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+                 SmallPtrSet<MachineInstr*, 8> &LocalMIs) {
+  LocalMIs.insert(MI);
+
+  unsigned SrcReg, DstReg, SubIdx;
+  if (!TII->isCoalescableExtInstr(*MI, SrcReg, DstReg, SubIdx))
+    return false;
+
+  if (TargetRegisterInfo::isPhysicalRegister(DstReg) ||
+      TargetRegisterInfo::isPhysicalRegister(SrcReg))
+    return false;
+
+  MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(SrcReg);
+  if (++UI == MRI->use_nodbg_end())
+    // No other uses.
+    return false;
+
+  // The source has other uses. See if we can replace the other uses with uses
+  // of the result of the extension.
+ SmallPtrSet<MachineBasicBlock*, 4> ReachedBBs; + UI = MRI->use_nodbg_begin(DstReg); + for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end(); + UI != UE; ++UI) + ReachedBBs.insert(UI->getParent()); + + // Uses that are in the same BB of uses of the result of the instruction. + SmallVector<MachineOperand*, 8> Uses; + + // Uses that the result of the instruction can reach. + SmallVector<MachineOperand*, 8> ExtendedUses; + + bool ExtendLife = true; + UI = MRI->use_nodbg_begin(SrcReg); + for (MachineRegisterInfo::use_nodbg_iterator UE = MRI->use_nodbg_end(); + UI != UE; ++UI) { + MachineOperand &UseMO = UI.getOperand(); + MachineInstr *UseMI = &*UI; + if (UseMI == MI) + continue; + + if (UseMI->isPHI()) { + ExtendLife = false; + continue; + } + + // It's an error to translate this: + // + // %reg1025 = <sext> %reg1024 + // ... + // %reg1026 = SUBREG_TO_REG 0, %reg1024, 4 + // + // into this: + // + // %reg1025 = <sext> %reg1024 + // ... + // %reg1027 = COPY %reg1025:4 + // %reg1026 = SUBREG_TO_REG 0, %reg1027, 4 + // + // The problem here is that SUBREG_TO_REG is there to assert that an + // implicit zext occurs. It doesn't insert a zext instruction. If we allow + // the COPY here, it will give us the value after the <sext>, not the + // original value of %reg1024 before <sext>. + if (UseMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) + continue; + + MachineBasicBlock *UseMBB = UseMI->getParent(); + if (UseMBB == MBB) { + // Local uses that come after the extension. + if (!LocalMIs.count(UseMI)) + Uses.push_back(&UseMO); + } else if (ReachedBBs.count(UseMBB)) { + // Non-local uses where the result of the extension is used. Always + // replace these unless it's a PHI. + Uses.push_back(&UseMO); + } else if (Aggressive && DT->dominates(MBB, UseMBB)) { + // We may want to extend the live range of the extension result in order + // to replace these uses. + ExtendedUses.push_back(&UseMO); + } else { + // Both will be live out of the def MBB anyway. Don't extend live range of + // the extension result. + ExtendLife = false; + break; + } + } + + if (ExtendLife && !ExtendedUses.empty()) + // Extend the liveness of the extension result. + std::copy(ExtendedUses.begin(), ExtendedUses.end(), + std::back_inserter(Uses)); + + // Now replace all uses. + bool Changed = false; + if (!Uses.empty()) { + SmallPtrSet<MachineBasicBlock*, 4> PHIBBs; + + // Look for PHI uses of the extended result, we don't want to extend the + // liveness of a PHI input. It breaks all kinds of assumptions down + // stream. A PHI use is expected to be the kill of its source values. 
+    UI = MRI->use_nodbg_begin(DstReg);
+    for (MachineRegisterInfo::use_nodbg_iterator
+           UE = MRI->use_nodbg_end(); UI != UE; ++UI)
+      if (UI->isPHI())
+        PHIBBs.insert(UI->getParent());
+
+    const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
+    for (unsigned i = 0, e = Uses.size(); i != e; ++i) {
+      MachineOperand *UseMO = Uses[i];
+      MachineInstr *UseMI = UseMO->getParent();
+      MachineBasicBlock *UseMBB = UseMI->getParent();
+      if (PHIBBs.count(UseMBB))
+        continue;
+
+      unsigned NewVR = MRI->createVirtualRegister(RC);
+      BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(),
+              TII->get(TargetOpcode::COPY), NewVR)
+        .addReg(DstReg, 0, SubIdx);
+
+      UseMO->setReg(NewVR);
+      ++NumReuse;
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+/// OptimizeCmpInstr - If the instruction is a compare and the previous
+/// instruction it's comparing against already sets (or could be modified to
+/// set) the same flag as the compare, then we can remove the comparison and
+/// use the flag from the previous instruction.
+bool PeepholeOptimizer::OptimizeCmpInstr(MachineInstr *MI,
+                                         MachineBasicBlock *MBB) {
+  // If this instruction is a comparison against zero and isn't comparing a
+  // physical register, we can try to optimize it.
+  unsigned SrcReg;
+  int CmpValue;
+  if (!TII->AnalyzeCompare(MI, SrcReg, CmpValue) ||
+      TargetRegisterInfo::isPhysicalRegister(SrcReg) || CmpValue != 0)
+    return false;
+
+  MachineRegisterInfo::def_iterator DI = MRI->def_begin(SrcReg);
+  if (llvm::next(DI) != MRI->def_end())
+    // Only support one definition.
+    return false;
+
+  // Attempt to convert the defining instruction to set the "zero" flag.
+  if (TII->ConvertToSetZeroFlag(&*DI, MI)) {
+    ++NumEliminated;
+    return true;
+  }
+
+  return false;
+}
+
+bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
+  TM  = &MF.getTarget();
+  TII = TM->getInstrInfo();
+  MRI = &MF.getRegInfo();
+  DT  = Aggressive ? &getAnalysis<MachineDominatorTree>() : 0;
+
+  bool Changed = false;
+
+  SmallPtrSet<MachineInstr*, 8> LocalMIs;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = &*I;
+    LocalMIs.clear();
+
+    for (MachineBasicBlock::iterator
+           MII = I->begin(), ME = I->end(); MII != ME; ) {
+      MachineInstr *MI = &*MII;
+
+      if (MI->getDesc().isCompare() &&
+          !MI->getDesc().hasUnmodeledSideEffects()) {
+        ++MII; // The iterator may become invalid if the compare is deleted.
+        Changed |= OptimizeCmpInstr(MI, MBB);
+      } else {
+        Changed |= OptimizeExtInstr(MI, MBB, LocalMIs);
+        ++MII;
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 4af8e07f34800..f0bd6d1372be7 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -85,7 +85,7 @@ namespace {
   public:
     static char ID;
     PostRAScheduler(CodeGenOpt::Level ol) :
-      MachineFunctionPass(&ID), OptLevel(ol) {}
+      MachineFunctionPass(ID), OptLevel(ol) {}

     void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.setPreservesCFG();
@@ -130,7 +130,7 @@ namespace {

     /// KillIndices - The index of the most recent kill (proceeding bottom-up),
     /// or ~0u if the register is not live.
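// ---- [Editorial sketch; not part of the patch] ----------------------------------
// What OptimizeCmpInstr (in the PeepholeOptimizer hunk above) buys, shown on
// the motivating pattern from that file's header. The mnemonics below are
// illustrative only, not from the patch: the target hook ConvertToSetZeroFlag
// rewrites the single def of the compared register into a flag-setting form,
// after which the compare is dead.
//
//   before:   %r1 = SUB  %r1, 1        after:   %r1 = SUBS %r1, 1   ; sets Z
//             CMP  %r1, 0                       ; compare eliminated
//             Bcc  eq, L1                       Bcc  eq, L1
//
// The single-def test (def_begin/def_end) is what makes this sound: with
// multiple defs, a different definition of %r1 could reach the compare and
// the folded flag would be stale.
// ---- [End editorial sketch] --------------------------------------------------------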
- unsigned KillIndices[TargetRegisterInfo::FirstVirtualRegister]; + std::vector<unsigned> KillIndices; public: SchedulePostRATDList(MachineFunction &MF, @@ -140,7 +140,8 @@ namespace { AntiDepBreaker *ADB, AliasAnalysis *aa) : ScheduleDAGInstrs(MF, MLI, MDT), Topo(SUnits), - HazardRec(HR), AntiDepBreak(ADB), AA(aa) {} + HazardRec(HR), AntiDepBreak(ADB), AA(aa), + KillIndices(TRI->getNumRegs()) {} ~SchedulePostRATDList() { } diff --git a/lib/CodeGen/PreAllocSplitting.cpp b/lib/CodeGen/PreAllocSplitting.cpp index fb2f90935551c..cd9d83eeb6846 100644 --- a/lib/CodeGen/PreAllocSplitting.cpp +++ b/lib/CodeGen/PreAllocSplitting.cpp @@ -92,7 +92,7 @@ namespace { public: static char ID; PreAllocSplitting() - : MachineFunctionPass(&ID) {} + : MachineFunctionPass(ID) {} virtual bool runOnMachineFunction(MachineFunction &MF); @@ -203,10 +203,11 @@ namespace { char PreAllocSplitting::ID = 0; -static RegisterPass<PreAllocSplitting> -X("pre-alloc-splitting", "Pre-Register Allocation Live Interval Splitting"); +INITIALIZE_PASS(PreAllocSplitting, "pre-alloc-splitting", + "Pre-Register Allocation Live Interval Splitting", + false, false); -const PassInfo *const llvm::PreAllocSplittingID = &X; +char &llvm::PreAllocSplittingID = PreAllocSplitting::ID; /// findSpillPoint - Find a gap as far away from the given MI that's suitable /// for spilling the current live interval. The index must be before any @@ -676,11 +677,7 @@ void PreAllocSplitting::ReconstructLiveInterval(LiveInterval* LI) { VNInfo* NewVN = LI->getNextValue(DefIdx, 0, true, Alloc); // If the def is a move, set the copy field. - unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (TII->isMoveInstr(*DI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { - if (DstReg == LI->reg) - NewVN->setCopy(&*DI); - } else if (DI->isCopyLike() && DI->getOperand(0).getReg() == LI->reg) + if (DI->isCopyLike() && DI->getOperand(0).getReg() == LI->reg) NewVN->setCopy(&*DI); NewVNs[&*DI] = NewVN; diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp index 2e31908f9fe2f..b8831db1d118a 100644 --- a/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/lib/CodeGen/ProcessImplicitDefs.cpp @@ -26,8 +26,8 @@ using namespace llvm; char ProcessImplicitDefs::ID = 0; -static RegisterPass<ProcessImplicitDefs> X("processimpdefs", - "Process Implicit Definitions."); +INITIALIZE_PASS(ProcessImplicitDefs, "processimpdefs", + "Process Implicit Definitions.", false, false); void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -46,12 +46,6 @@ ProcessImplicitDefs::CanTurnIntoImplicitDef(MachineInstr *MI, unsigned Reg, unsigned OpIdx, const TargetInstrInfo *tii_, SmallSet<unsigned, 8> &ImpDefRegs) { - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg) && - Reg == SrcReg && - (DstSubReg == 0 || ImpDefRegs.count(DstReg))) - return true; - switch(OpIdx) { case 1: return MI->isCopy() && (MI->getOperand(0).getSubReg() == 0 || @@ -75,14 +69,6 @@ static bool isUndefCopy(MachineInstr *MI, unsigned Reg, return true; return false; } - - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg)) { - if (Reg != SrcReg) - return false; - if (DstSubReg == 0 || ImpDefRegs.count(DstReg)) - return true; - } return false; } diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index 3843b2537051c..e2802c1fdf4a7 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ 
b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -19,6 +19,7 @@
 //
 //===----------------------------------------------------------------------===//

+#define DEBUG_TYPE "pei"
 #include "PrologEpilogInserter.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -32,7 +33,10 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/STLExtras.h"
 #include <climits>

@@ -40,8 +44,11 @@ using namespace llvm;

 char PEI::ID = 0;

-static RegisterPass<PEI>
-X("prologepilog", "Prologue/Epilogue Insertion");
+INITIALIZE_PASS(PEI, "prologepilog",
+                "Prologue/Epilogue Insertion", false, false);
+
+STATISTIC(NumVirtualFrameRegs, "Number of virtual frame regs encountered");
+STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");

 /// createPrologEpilogCodeInserter - This function returns a pass that inserts
 /// prolog and epilog code, and eliminates abstract frame references.
@@ -56,7 +63,6 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
   const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
   RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : NULL;
   FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn);
-  FrameConstantRegMap.clear();

   // Calculate the MaxCallFrameSize and AdjustsStack variables for the
   // function's frame information. Also eliminates call frame pseudo
@@ -72,10 +78,10 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
   calculateCalleeSavedRegisters(Fn);

   // Determine placement of CSR spill/restore code:
-  // - with shrink wrapping, place spills and restores to tightly
+  // - With shrink wrapping, place spills and restores to tightly
   //   enclose regions in the Machine CFG of the function where
-  //   they are used. Without shrink wrapping
-  // - default (no shrink wrapping), place all spills in the
+  //   they are used.
+  // - Without shrink wrapping (default), place all spills in the
   //   entry block, all restores in return blocks.
   placeCSRSpillsAndRestores(Fn);

@@ -461,8 +467,10 @@ AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
   Offset = (Offset + Align - 1) / Align * Align;

   if (StackGrowsDown) {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
     MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
   } else {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
     MFI->setObjectOffset(FrameIdx, Offset);
     Offset += MFI->getObjectSize(FrameIdx);
   }
@@ -547,15 +555,66 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
       AdjustStackOffset(MFI, SFI, StackGrowsDown, Offset, MaxAlign);
   }

+  // FIXME: Once this is working, the enable flag will change to a target
+  // check for whether the frame is large enough to want to use virtual
+  // frame index registers. Functions which don't want/need this optimization
+  // will continue to use the existing code path.
+  if (MFI->getUseLocalStackAllocationBlock()) {
+    unsigned Align = MFI->getLocalFrameMaxAlign();
+
+    // Adjust to alignment boundary.
+    Offset = (Offset + Align - 1) / Align * Align;
+
+    DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+    // Resolve offsets for objects in the local block.
+    for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) {
+      std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i);
+      int64_t FIOffset = (StackGrowsDown ?
-Offset : Offset) + Entry.second; + DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << + FIOffset << "]\n"); + MFI->setObjectOffset(Entry.first, FIOffset); + } + // Allocate the local block + Offset += MFI->getLocalFrameSize(); + + MaxAlign = std::max(Align, MaxAlign); + } + // Make sure that the stack protector comes before the local variables on the // stack. - if (MFI->getStackProtectorIndex() >= 0) + SmallSet<int, 16> LargeStackObjs; + if (MFI->getStackProtectorIndex() >= 0) { AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown, Offset, MaxAlign); + // Assign large stack objects first. + for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isObjectPreAllocated(i) && + MFI->getUseLocalStackAllocationBlock()) + continue; + if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex) + continue; + if (RS && (int)i == RS->getScavengingFrameIndex()) + continue; + if (MFI->isDeadObjectIndex(i)) + continue; + if (MFI->getStackProtectorIndex() == (int)i) + continue; + if (!MFI->MayNeedStackProtector(i)) + continue; + + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign); + LargeStackObjs.insert(i); + } + } + // Then assign frame offsets to stack objects that are not used to spill // callee saved registers. for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isObjectPreAllocated(i) && + MFI->getUseLocalStackAllocationBlock()) + continue; if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex) continue; if (RS && (int)i == RS->getScavengingFrameIndex()) @@ -564,6 +623,8 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { continue; if (MFI->getStackProtectorIndex() == (int)i) continue; + if (LargeStackObjs.count(i)) + continue; AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign); } @@ -694,16 +755,8 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { // If this instruction has a FrameIndex operand, we need to // use that target machine register info object to eliminate // it. - TargetRegisterInfo::FrameIndexValue Value; - unsigned VReg = - TRI.eliminateFrameIndex(MI, SPAdj, &Value, + TRI.eliminateFrameIndex(MI, SPAdj, FrameIndexVirtualScavenging ? NULL : RS); - if (VReg) { - assert (FrameIndexVirtualScavenging && - "Not scavenging, but virtual returned from " - "eliminateFrameIndex()!"); - FrameConstantRegMap[VReg] = FrameConstantEntry(Value, SPAdj); - } // Reset the iterator if we were at the beginning of the BB. if (AtBeginning) { @@ -731,38 +784,6 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { } } -/// findLastUseReg - find the killing use of the specified register within -/// the instruciton range. Return the operand number of the kill in Operand. -static MachineBasicBlock::iterator -findLastUseReg(MachineBasicBlock::iterator I, MachineBasicBlock::iterator ME, - unsigned Reg) { - // Scan forward to find the last use of this virtual register - for (++I; I != ME; ++I) { - MachineInstr *MI = I; - bool isDefInsn = false; - bool isKillInsn = false; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) - if (MI->getOperand(i).isReg()) { - unsigned OpReg = MI->getOperand(i).getReg(); - if (OpReg == 0 || !TargetRegisterInfo::isVirtualRegister(OpReg)) - continue; - assert (OpReg == Reg - && "overlapping use of scavenged index register!"); - // If this is the killing use, we have a candidate. 
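// ---- [Editorial sketch; not part of the patch] -----------------------------------
// The offset arithmetic used by AdjustStackOffset and the new local-block code
// above: round the running Offset up to the object's alignment, then negate it
// when the stack grows down. A tiny self-contained illustration:
//
// #include <cassert>
// #include <cstdint>
// static int64_t alignTo(int64_t Offset, unsigned Align) {
//   return (Offset + Align - 1) / Align * Align;
// }
// int main() {
//   // 13 bytes already allocated; the next object wants 8-byte alignment.
//   assert(alignTo(13, 8) == 16);
//   // On a downward-growing stack the object therefore lands at SP[-16],
//   // matching the "alloc FI(n) at SP[-Offset]" debug output above.
//   bool StackGrowsDown = true;
//   int64_t FIOffset = StackGrowsDown ? -alignTo(13, 8) : alignTo(13, 8);
//   assert(FIOffset == -16);
//   return 0;
// }
// ---- [End editorial sketch] ---------------------------------------------------------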
- if (MI->getOperand(i).isKill()) - isKillInsn = true; - else if (MI->getOperand(i).isDef()) - isDefInsn = true; - } - if (isKillInsn && !isDefInsn) - return I; - } - // If we hit the end of the basic block, there was no kill of - // the virtual register, which is wrong. - assert (0 && "scavenged index register never killed!"); - return ME; -} - /// scavengeFrameVirtualRegs - Replace all frame index virtual registers /// with physical registers. Use the register scavenger to find an /// appropriate register to use. @@ -772,27 +793,14 @@ void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { E = Fn.end(); BB != E; ++BB) { RS->enterBasicBlock(BB); - // FIXME: The logic flow in this function is still too convoluted. - // It needs a cleanup refactoring. Do that in preparation for tracking - // more than one scratch register value and using ranges to find - // available scratch registers. - unsigned CurrentVirtReg = 0; - unsigned CurrentScratchReg = 0; - bool havePrevValue = false; - TargetRegisterInfo::FrameIndexValue PrevValue(0,0); - TargetRegisterInfo::FrameIndexValue Value(0,0); - MachineInstr *PrevLastUseMI = NULL; - unsigned PrevLastUseOp = 0; - bool trackingCurrentValue = false; + unsigned VirtReg = 0; + unsigned ScratchReg = 0; int SPAdj = 0; // The instruction stream may change in the loop, so check BB->end() // directly. for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { MachineInstr *MI = I; - bool isDefInsn = false; - bool isKillInsn = false; - bool clobbersScratchReg = false; bool DoIncr = true; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { if (MI->getOperand(i).isReg()) { @@ -800,121 +808,30 @@ void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { unsigned Reg = MO.getReg(); if (Reg == 0) continue; - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - // If we have a previous scratch reg, check and see if anything - // here kills whatever value is in there. - if (Reg == CurrentScratchReg) { - if (MO.isUse()) { - // Two-address operands implicitly kill - if (MO.isKill() || MI->isRegTiedToDefOperand(i)) - clobbersScratchReg = true; - } else { - assert (MO.isDef()); - clobbersScratchReg = true; - } - } + if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; - } - // If this is a def, remember that this insn defines the value. - // This lets us properly consider insns which re-use the scratch - // register, such as r2 = sub r2, #imm, in the middle of the - // scratch range. - if (MO.isDef()) - isDefInsn = true; + + ++NumVirtualFrameRegs; // Have we already allocated a scratch register for this virtual? - if (Reg != CurrentVirtReg) { + if (Reg != VirtReg) { // When we first encounter a new virtual register, it // must be a definition. assert(MI->getOperand(i).isDef() && "frame index virtual missing def!"); - // We can't have nested virtual register live ranges because - // there's only a guarantee of one scavenged register at a time. - assert (CurrentVirtReg == 0 && - "overlapping frame index virtual registers!"); - - // If the target gave us information about what's in the register, - // we can use that to re-use scratch regs. 
- DenseMap<unsigned, FrameConstantEntry>::iterator Entry = - FrameConstantRegMap.find(Reg); - trackingCurrentValue = Entry != FrameConstantRegMap.end(); - if (trackingCurrentValue) { - SPAdj = (*Entry).second.second; - Value = (*Entry).second.first; - } else { - SPAdj = 0; - Value.first = 0; - Value.second = 0; - } - - // If the scratch register from the last allocation is still - // available, see if the value matches. If it does, just re-use it. - if (trackingCurrentValue && havePrevValue && PrevValue == Value) { - // FIXME: This assumes that the instructions in the live range - // for the virtual register are exclusively for the purpose - // of populating the value in the register. That's reasonable - // for these frame index registers, but it's still a very, very - // strong assumption. rdar://7322732. Better would be to - // explicitly check each instruction in the range for references - // to the virtual register. Only delete those insns that - // touch the virtual register. - - // Find the last use of the new virtual register. Remove all - // instruction between here and there, and update the current - // instruction to reference the last use insn instead. - MachineBasicBlock::iterator LastUseMI = - findLastUseReg(I, BB->end(), Reg); - - // Remove all instructions up 'til the last use, since they're - // just calculating the value we already have. - BB->erase(I, LastUseMI); - I = LastUseMI; - - // Extend the live range of the scratch register - PrevLastUseMI->getOperand(PrevLastUseOp).setIsKill(false); - RS->setUsed(CurrentScratchReg); - CurrentVirtReg = Reg; - - // We deleted the instruction we were scanning the operands of. - // Jump back to the instruction iterator loop. Don't increment - // past this instruction since we updated the iterator already. - DoIncr = false; - break; - } - // Scavenge a new scratch register - CurrentVirtReg = Reg; + VirtReg = Reg; const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg); - CurrentScratchReg = RS->scavengeRegister(RC, I, SPAdj); - PrevValue = Value; + ScratchReg = RS->scavengeRegister(RC, I, SPAdj); + ++NumScavengedRegs; } // replace this reference to the virtual register with the // scratch register. - assert (CurrentScratchReg && "Missing scratch register!"); - MI->getOperand(i).setReg(CurrentScratchReg); + assert (ScratchReg && "Missing scratch register!"); + MI->getOperand(i).setReg(ScratchReg); - if (MI->getOperand(i).isKill()) { - isKillInsn = true; - PrevLastUseOp = i; - PrevLastUseMI = MI; - } } } - // If this is the last use of the scratch, stop tracking it. The - // last use will be a kill operand in an instruction that does - // not also define the scratch register. - if (isKillInsn && !isDefInsn) { - CurrentVirtReg = 0; - havePrevValue = trackingCurrentValue; - } - // Similarly, notice if instruction clobbered the value in the - // register we're tracking for possible later reuse. This is noted - // above, but enforced here since the value is still live while we - // process the rest of the operands of the instruction. 
- if (clobbersScratchReg) { - havePrevValue = false; - CurrentScratchReg = 0; - } if (DoIncr) { RS->forward(I); ++I; diff --git a/lib/CodeGen/PrologEpilogInserter.h b/lib/CodeGen/PrologEpilogInserter.h index aa95773596cfe..d575124a6b3e6 100644 --- a/lib/CodeGen/PrologEpilogInserter.h +++ b/lib/CodeGen/PrologEpilogInserter.h @@ -36,7 +36,7 @@ namespace llvm { class PEI : public MachineFunctionPass { public: static char ID; - PEI() : MachineFunctionPass(&ID) {} + PEI() : MachineFunctionPass(ID) {} const char *getPassName() const { return "Prolog/Epilog Insertion & Frame Finalization"; @@ -99,13 +99,6 @@ namespace llvm { // TRI->requiresFrameIndexScavenging() for the curren function. bool FrameIndexVirtualScavenging; - // When using the scavenger post-pass to resolve frame reference - // materialization registers, maintain a map of the registers to - // the constant value and SP adjustment associated with it. - typedef std::pair<TargetRegisterInfo::FrameIndexValue, int> - FrameConstantEntry; - DenseMap<unsigned, FrameConstantEntry> FrameConstantRegMap; - #ifndef NDEBUG // Machine function handle. MachineFunction* MF; diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index f44478e5dd0bf..fc150d55e2265 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -16,6 +16,7 @@ #include "llvm/BasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" @@ -46,7 +47,7 @@ namespace { class RAFast : public MachineFunctionPass { public: static char ID; - RAFast() : MachineFunctionPass(&ID), StackSlotForVirtReg(-1), + RAFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1), isBulkSpilling(false) {} private: const TargetMachine *TM; @@ -80,6 +81,8 @@ namespace { // that is currently available in a physical register. LiveRegMap LiveVirtRegs; + DenseMap<unsigned, MachineInstr *> LiveDbgValueMap; + // RegState - Track the state of a physical register. enum RegState { // A disabled register is not available for allocation, but an alias may @@ -110,9 +113,9 @@ namespace { // Allocatable - vector of allocatable physical registers. BitVector Allocatable; - // SkippedInstrs - Descriptors of instructions whose clobber list was ignored - // because all registers were spilled. It is still necessary to mark all the - // clobbered registers as used by the function. + // SkippedInstrs - Descriptors of instructions whose clobber list was + // ignored because all registers were spilled. It is still necessary to + // mark all the clobbered registers as used by the function. SmallPtrSet<const TargetInstrDesc*, 4> SkippedInstrs; // isBulkSpilling - This flag is set when LiveRegMap will be cleared @@ -236,8 +239,7 @@ void RAFast::killVirtReg(unsigned VirtReg) { } /// spillVirtReg - This method spills the value specified by VirtReg into the -/// corresponding stack slot if needed. If isKill is set, the register is also -/// killed. +/// corresponding stack slot if needed. 
void RAFast::spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg) { assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && "Spilling a physical register is illegal!"); @@ -265,6 +267,31 @@ void RAFast::spillVirtReg(MachineBasicBlock::iterator MI, TII->storeRegToStackSlot(*MBB, MI, LR.PhysReg, SpillKill, FI, RC, TRI); ++NumStores; // Update statistics + // If this register is used by DBG_VALUE then insert new DBG_VALUE to + // identify spilled location as the place to find corresponding variable's + // value. + if (MachineInstr *DBG = LiveDbgValueMap.lookup(LRI->first)) { + const MDNode *MDPtr = + DBG->getOperand(DBG->getNumOperands()-1).getMetadata(); + int64_t Offset = 0; + if (DBG->getOperand(1).isImm()) + Offset = DBG->getOperand(1).getImm(); + DebugLoc DL; + if (MI == MBB->end()) { + // If MI is at basic block end then use last instruction's location. + MachineBasicBlock::iterator EI = MI; + DL = (--EI)->getDebugLoc(); + } + else + DL = MI->getDebugLoc(); + if (MachineInstr *NewDV = + TII->emitFrameIndexDebugValue(*MF, FI, Offset, MDPtr, DL)) { + MachineBasicBlock *MBB = DBG->getParent(); + MBB->insert(MI, NewDV); + DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV); + LiveDbgValueMap[LRI->first] = NewDV; + } + } if (SpillKill) LR.LastUse = 0; // Don't kill register again } @@ -471,7 +498,8 @@ void RAFast::allocVirtReg(MachineInstr *MI, LiveRegEntry &LRE, unsigned Hint) { // First try to find a completely free register. for (TargetRegisterClass::iterator I = AOB; I != AOE; ++I) { unsigned PhysReg = *I; - if (PhysRegState[PhysReg] == regFree && !UsedInInstr.test(PhysReg)) + if (PhysRegState[PhysReg] == regFree && !UsedInInstr.test(PhysReg) && + Allocatable.test(PhysReg)) return assignVirtToPhysReg(LRE, PhysReg); } @@ -480,6 +508,8 @@ void RAFast::allocVirtReg(MachineInstr *MI, LiveRegEntry &LRE, unsigned Hint) { unsigned BestReg = 0, BestCost = spillImpossible; for (TargetRegisterClass::iterator I = AOB; I != AOE; ++I) { + if (!Allocatable.test(*I)) + continue; unsigned Cost = calcSpillCost(*I); // Cost is 0 when all aliases are already disabled. if (Cost == 0) @@ -520,12 +550,9 @@ RAFast::defineVirtReg(MachineInstr *MI, unsigned OpNum, if ((!Hint || !TargetRegisterInfo::isPhysicalRegister(Hint)) && MRI->hasOneNonDBGUse(VirtReg)) { const MachineInstr &UseMI = *MRI->use_nodbg_begin(VirtReg); - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; // It's a copy, use the destination register as a hint. if (UseMI.isCopyLike()) Hint = UseMI.getOperand(0).getReg(); - else if (TII->isMoveInstr(UseMI, SrcReg, DstReg, SrcSubReg, DstSubReg)) - Hint = DstReg; } allocVirtReg(MI, *LRI, Hint); } else if (LR.LastUse) { @@ -712,7 +739,8 @@ void RAFast::AllocateBasicBlock() { // Add live-in registers as live. for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), E = MBB->livein_end(); I != E; ++I) - definePhysReg(MII, *I, regReserved); + if (Allocatable.test(*I)) + definePhysReg(MII, *I, regReserved); SmallVector<unsigned, 8> VirtDead; SmallVector<MachineInstr*, 32> Coalesced; @@ -756,31 +784,43 @@ void RAFast::AllocateBasicBlock() { // Debug values are not allowed to change codegen in any way. 
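// --- Illustrative aside (editor's sketch, not part of the patch) ---
// The spillVirtReg() hunk above makes spilling debug-aware: when a virtual
// register that some DBG_VALUE describes is written to a stack slot, a
// matching DBG_VALUE pointing at the frame index is inserted so the
// variable's location stays valid. A condensed form of that logic, with
// VReg, FI and InsertPt as placeholder names:
if (MachineInstr *DBG = LiveDbgValueMap.lookup(VReg)) {
  const MDNode *Var = DBG->getOperand(DBG->getNumOperands()-1).getMetadata();
  int64_t Offset = DBG->getOperand(1).isImm() ? DBG->getOperand(1).getImm() : 0;
  DebugLoc DL = InsertPt == MBB->end() ? prior(InsertPt)->getDebugLoc()
                                       : InsertPt->getDebugLoc();
  if (MachineInstr *NewDV =
        TII->emitFrameIndexDebugValue(*MF, FI, Offset, Var, DL))
    MBB->insert(InsertPt, NewDV);   // the debugger now looks in the stack slot
}
// --- end aside ---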
if (MI->isDebugValue()) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg)) continue; - LiveRegMap::iterator LRI = LiveVirtRegs.find(Reg); - if (LRI != LiveVirtRegs.end()) - setPhysReg(MI, i, LRI->second.PhysReg); - else { - int SS = StackSlotForVirtReg[Reg]; - if (SS == -1) - MO.setReg(0); // We can't allocate a physreg for a DebugValue, sorry! + bool ScanDbgValue = true; + while (ScanDbgValue) { + ScanDbgValue = false; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + LiveDbgValueMap[Reg] = MI; + LiveRegMap::iterator LRI = LiveVirtRegs.find(Reg); + if (LRI != LiveVirtRegs.end()) + setPhysReg(MI, i, LRI->second.PhysReg); else { - // Modify DBG_VALUE now that the value is in a spill slot. - uint64_t Offset = MI->getOperand(1).getImm(); - const MDNode *MDPtr = - MI->getOperand(MI->getNumOperands()-1).getMetadata(); - DebugLoc DL = MI->getDebugLoc(); - if (MachineInstr *NewDV = - TII->emitFrameIndexDebugValue(*MF, SS, Offset, MDPtr, DL)) { - DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI); - MachineBasicBlock *MBB = MI->getParent(); - MBB->insert(MBB->erase(MI), NewDV); - } else - MO.setReg(0); // We can't allocate a physreg for a DebugValue, sorry! + int SS = StackSlotForVirtReg[Reg]; + if (SS == -1) + // We can't allocate a physreg for a DebugValue, sorry! + MO.setReg(0); + else { + // Modify DBG_VALUE now that the value is in a spill slot. + int64_t Offset = MI->getOperand(1).getImm(); + const MDNode *MDPtr = + MI->getOperand(MI->getNumOperands()-1).getMetadata(); + DebugLoc DL = MI->getDebugLoc(); + if (MachineInstr *NewDV = + TII->emitFrameIndexDebugValue(*MF, SS, Offset, MDPtr, DL)) { + DEBUG(dbgs() << "Modifying debug info due to spill:" << + "\t" << *MI); + MachineBasicBlock *MBB = MI->getParent(); + MBB->insert(MBB->erase(MI), NewDV); + // Scan NewDV operands from the beginning. + MI = NewDV; + ScanDbgValue = true; + break; + } else + // We can't allocate a physreg for a DebugValue; sorry! + MO.setReg(0); + } } } } @@ -789,14 +829,13 @@ void RAFast::AllocateBasicBlock() { } // If this is a copy, we may be able to coalesce. - unsigned CopySrc, CopyDst, CopySrcSub, CopyDstSub; + unsigned CopySrc = 0, CopyDst = 0, CopySrcSub = 0, CopyDstSub = 0; if (MI->isCopy()) { CopyDst = MI->getOperand(0).getReg(); CopySrc = MI->getOperand(1).getReg(); CopyDstSub = MI->getOperand(0).getSubReg(); CopySrcSub = MI->getOperand(1).getSubReg(); - } else if (!TII->isMoveInstr(*MI, CopySrc, CopyDst, CopySrcSub, CopyDstSub)) - CopySrc = CopyDst = 0; + } // Track registers used by instruction. UsedInInstr.reset(); @@ -843,13 +882,18 @@ void RAFast::AllocateBasicBlock() { // operands. If there are also physical defs, these registers must avoid // both physical defs and uses, making them more constrained than normal // operands. + // Similarly, if there are multiple defs and tied operands, we must make + // sure the same register is allocated to uses and defs. // We didn't detect inline asm tied operands above, so just make this extra // pass for all inline asm. 
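// --- Illustrative aside (editor's sketch, not part of the patch) ---
// The DBG_VALUE handling earlier in this hunk uses a restart idiom: once the
// instruction under inspection has been erased and replaced, its operand list
// is stale, so the scan restarts on the replacement rather than continuing.
// The idiom in isolation; maybeReplace is a hypothetical helper standing in
// for the erase/emitFrameIndexDebugValue/insert sequence above:
bool Rescan = true;
while (Rescan) {
  Rescan = false;
  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    if (MachineInstr *Repl = maybeReplace(MI, i)) {
      MI = Repl;      // rescan the replacement from operand 0
      Rescan = true;
      break;
    }
  }
}
// --- end aside ---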
if (MI->isInlineAsm() || hasEarlyClobbers || hasPartialRedefs || - (hasTiedOps && hasPhysDefs)) { + (hasTiedOps && (hasPhysDefs || TID.getNumDefs() > 1))) { handleThroughOperands(MI, VirtDead); // Don't attempt coalescing when we have funny stuff going on. CopyDst = 0; + // Pretend we have early clobbers so the use operands get marked below. + // This is not necessary for the common case of a single tied use. + hasEarlyClobbers = true; } // Second scan. @@ -870,14 +914,17 @@ void RAFast::AllocateBasicBlock() { MRI->addPhysRegsUsed(UsedInInstr); - // Track registers defined by instruction - early clobbers at this point. + // Track registers defined by instruction - early clobbers and tied uses at + // this point. UsedInInstr.reset(); if (hasEarlyClobbers) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef()) continue; + if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + // Look for physreg defs and tied uses. + if (!MO.isDef() && !MI->isRegTiedToDefOperand(i)) continue; UsedInInstr.set(Reg); for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) UsedInInstr.set(*AS); @@ -887,9 +934,9 @@ void RAFast::AllocateBasicBlock() { unsigned DefOpEnd = MI->getNumOperands(); if (TID.isCall()) { // Spill all virtregs before a call. This serves two purposes: 1. If an - // exception is thrown, the landing pad is going to expect to find registers - // in their spill slots, and 2. we don't have to wade through all the - // <imp-def> operands on the call instruction. + // exception is thrown, the landing pad is going to expect to find + // registers in their spill slots, and 2. we don't have to wade through + // all the <imp-def> operands on the call instruction. DefOpEnd = VirtOpEnd; DEBUG(dbgs() << " Spilling remaining registers before call.\n"); spillAll(MI); @@ -992,6 +1039,7 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) { SkippedInstrs.clear(); StackSlotForVirtReg.clear(); + LiveDbgValueMap.clear(); return true; } diff --git a/lib/CodeGen/RegAllocLinearScan.cpp b/lib/CodeGen/RegAllocLinearScan.cpp index 044672d6d7a5e..5c62354a8872c 100644 --- a/lib/CodeGen/RegAllocLinearScan.cpp +++ b/lib/CodeGen/RegAllocLinearScan.cpp @@ -87,10 +87,10 @@ namespace { "to skip."), cl::init(0), cl::Hidden); - + struct RALinScan : public MachineFunctionPass { static char ID; - RALinScan() : MachineFunctionPass(&ID) { + RALinScan() : MachineFunctionPass(ID) { // Initialize the queue to record recently-used registers. if (NumRecentlyUsedRegs > 0) RecentRegs.resize(NumRecentlyUsedRegs, 0); @@ -125,9 +125,10 @@ namespace { const TargetRegisterInfo* tri_; const TargetInstrInfo* tii_; BitVector allocatableRegs_; + BitVector reservedRegs_; LiveIntervals* li_; LiveStacks* ls_; - const MachineLoopInfo *loopInfo; + MachineLoopInfo *loopInfo; /// handled_ - Intervals are added to the handled_ set in the order of their /// start value. This is uses for backtracking. @@ -255,9 +256,9 @@ namespace { SmallVector<LiveInterval*, 8> &SpillIntervals); /// attemptTrivialCoalescing - If a simple interval is defined by a copy, - /// try allocate the definition the same register as the source register - /// if the register is not defined during live time of the interval. This - /// eliminate a copy. 
This is used to coalesce copies which were not + /// try to allocate the definition to the same register as the source, + /// if the register is not defined during the life time of the interval. + /// This eliminates a copy, and is used to coalesce copies which were not /// coalesced away before allocation either due to dest and src being in /// different register classes or because the coalescer was overly /// conservative. @@ -335,6 +336,17 @@ namespace { SmallVector<unsigned, 256> &inactiveCounts, bool SkipDGRegs); + /// getFirstNonReservedPhysReg - return the first non-reserved physical + /// register in the register class. + unsigned getFirstNonReservedPhysReg(const TargetRegisterClass *RC) { + TargetRegisterClass::iterator aoe = RC->allocation_order_end(*mf_); + TargetRegisterClass::iterator i = RC->allocation_order_begin(*mf_); + while (i != aoe && reservedRegs_.test(*i)) + ++i; + assert(i != aoe && "All registers reserved?!"); + return *i; + } + void ComputeRelatedRegClasses(); template <typename ItTy> @@ -358,8 +370,8 @@ namespace { char RALinScan::ID = 0; } -static RegisterPass<RALinScan> -X("linearscan-regalloc", "Linear Scan Register Allocator"); +INITIALIZE_PASS(RALinScan, "linearscan-regalloc", + "Linear Scan Register Allocator", false, false); void RALinScan::ComputeRelatedRegClasses() { // First pass, add all reg classes to the union, and determine at least one @@ -371,7 +383,7 @@ void RALinScan::ComputeRelatedRegClasses() { for (TargetRegisterClass::iterator I = (*RCI)->begin(), E = (*RCI)->end(); I != E; ++I) { HasAliases = HasAliases || *tri_->getAliasSet(*I) != 0; - + const TargetRegisterClass *&PRC = OneClassForEachPhysReg[*I]; if (PRC) { // Already processed this register. Just make sure we know that @@ -382,7 +394,7 @@ void RALinScan::ComputeRelatedRegClasses() { } } } - + // Second pass, now that we know conservatively what register classes each reg // belongs to, add info about aliases. We don't need to do this for targets // without register aliases. @@ -419,20 +431,15 @@ unsigned RALinScan::attemptTrivialCoalescing(LiveInterval &cur, unsigned Reg) { unsigned CandReg; { MachineInstr *CopyMI; - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; if (vni->def != SlotIndex() && vni->isDefAccurate() && - (CopyMI = li_->getInstructionFromIndex(vni->def)) && - (CopyMI->isCopy() || - tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg))) + (CopyMI = li_->getInstructionFromIndex(vni->def)) && CopyMI->isCopy()) // Defined by a copy, try to extend SrcReg forward - CandReg = CopyMI->isCopy() ? 
CopyMI->getOperand(1).getReg() : SrcReg; + CandReg = CopyMI->getOperand(1).getReg(); else if (TrivCoalesceEnds && - (CopyMI = - li_->getInstructionFromIndex(range.end.getBaseIndex())) && - tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg) && - cur.reg == SrcReg) + (CopyMI = li_->getInstructionFromIndex(range.end.getBaseIndex())) && + CopyMI->isCopy() && cur.reg == CopyMI->getOperand(1).getReg()) // Only used by a copy, try to extend DstReg backwards - CandReg = DstReg; + CandReg = CopyMI->getOperand(0).getReg(); else return Reg; } @@ -469,6 +476,7 @@ bool RALinScan::runOnMachineFunction(MachineFunction &fn) { tri_ = tm_->getRegisterInfo(); tii_ = tm_->getInstrInfo(); allocatableRegs_ = tri_->getAllocatableSet(fn); + reservedRegs_ = tri_->getReservedRegs(fn); li_ = &getAnalysis<LiveIntervals>(); ls_ = &getAnalysis<LiveStacks>(); loopInfo = &getAnalysis<MachineLoopInfo>(); @@ -487,9 +495,9 @@ bool RALinScan::runOnMachineFunction(MachineFunction &fn) { vrm_ = &getAnalysis<VirtRegMap>(); if (!rewriter_.get()) rewriter_.reset(createVirtRegRewriter()); - - spiller_.reset(createSpiller(mf_, li_, loopInfo, vrm_)); - + + spiller_.reset(createSpiller(*this, *mf_, *vrm_)); + initIntervalSets(); linearScan(); @@ -543,7 +551,7 @@ void RALinScan::linearScan() { // linear scan algorithm DEBUG({ dbgs() << "********** LINEAR SCAN **********\n" - << "********** Function: " + << "********** Function: " << mf_->getFunction()->getName() << '\n'; printIntervals("fixed", fixed_.begin(), fixed_.end()); }); @@ -765,7 +773,8 @@ FindIntervalInVector(RALinScan::IntervalPtrs &IP, LiveInterval *LI) { return IP.end(); } -static void RevertVectorIteratorsTo(RALinScan::IntervalPtrs &V, SlotIndex Point){ +static void RevertVectorIteratorsTo(RALinScan::IntervalPtrs &V, + SlotIndex Point){ for (unsigned i = 0, e = V.size(); i != e; ++i) { RALinScan::IntervalPtr &IP = V[i]; LiveInterval::iterator I = std::upper_bound(IP.first->begin(), @@ -804,7 +813,7 @@ static void addStackInterval(LiveInterval *cur, LiveStacks *ls_, static float getConflictWeight(LiveInterval *cur, unsigned Reg, LiveIntervals *li_, MachineRegisterInfo *mri_, - const MachineLoopInfo *loopInfo) { + MachineLoopInfo *loopInfo) { float Conflicts = 0; for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(Reg), E = mri_->reg_end(); I != E; ++I) { @@ -837,7 +846,7 @@ void RALinScan::findIntervalsToSpill(LiveInterval *cur, dbgs() << tri_->getName(Candidates[i].first) << " "; dbgs() << "\n"; }); - + // Calculate the number of conflicts of each candidate. for (IntervalPtrs::iterator i = active_.begin(); i != active_.end(); ++i) { unsigned Reg = i->first->reg; @@ -955,7 +964,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { if (cur->empty()) { unsigned physReg = vrm_->getRegAllocPref(cur->reg); if (!physReg) - physReg = *RC->allocation_order_begin(*mf_); + physReg = getFirstNonReservedPhysReg(RC); DEBUG(dbgs() << tri_->getName(physReg) << '\n'); // Note the register is not really in use. 
vrm_->assignVirt2Phys(cur->reg, physReg); @@ -978,27 +987,10 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { if ((vni->def != SlotIndex()) && !vni->isUnused() && vni->isDefAccurate()) { MachineInstr *CopyMI = li_->getInstructionFromIndex(vni->def); - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (CopyMI && - tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg)) { - unsigned Reg = 0; - if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) - Reg = SrcReg; - else if (vrm_->isAssignedReg(SrcReg)) - Reg = vrm_->getPhys(SrcReg); - if (Reg) { - if (SrcSubReg) - Reg = tri_->getSubReg(Reg, SrcSubReg); - if (DstSubReg) - Reg = tri_->getMatchingSuperReg(Reg, DstSubReg, RC); - if (Reg && allocatableRegs_[Reg] && RC->contains(Reg)) - mri_->setRegAllocationHint(cur->reg, 0, Reg); - } - } else if (CopyMI && CopyMI->isCopy()) { - DstReg = CopyMI->getOperand(0).getReg(); - DstSubReg = CopyMI->getOperand(0).getSubReg(); - SrcReg = CopyMI->getOperand(1).getReg(); - SrcSubReg = CopyMI->getOperand(1).getSubReg(); + if (CopyMI && CopyMI->isCopy()) { + unsigned DstSubReg = CopyMI->getOperand(0).getSubReg(); + unsigned SrcReg = CopyMI->getOperand(1).getReg(); + unsigned SrcSubReg = CopyMI->getOperand(1).getSubReg(); unsigned Reg = 0; if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) Reg = SrcReg; @@ -1024,7 +1016,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Can only allocate virtual registers!"); const TargetRegisterClass *RegRC = mri_->getRegClass(Reg); - // If this is not in a related reg class to the register we're allocating, + // If this is not in a related reg class to the register we're allocating, // don't check it. if (RelatedRegClasses.getLeaderValue(RegRC) == RCLeader && cur->overlapsFrom(*i->first, i->second-1)) { @@ -1033,7 +1025,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { SpillWeightsToAdd.push_back(std::make_pair(Reg, i->first->weight)); } } - + // Speculatively check to see if we can get a register right now. If not, // we know we won't be able to by adding more constraints. If so, we can // check to see if it is valid. Doing an exhaustive search of the fixed_ list @@ -1048,7 +1040,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { SmallSet<unsigned, 8> RegAliases; for (const unsigned *AS = tri_->getAliasSet(physReg); *AS; ++AS) RegAliases.insert(*AS); - + bool ConflictsWithFixed = false; for (unsigned i = 0, e = fixed_.size(); i != e; ++i) { IntervalPtr &IP = fixed_[i]; @@ -1068,7 +1060,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { } } } - + // Okay, the register picked by our speculative getFreePhysReg call turned // out to be in use. Actually add all of the conflicting fixed registers to // regUse_ so we can do an accurate query. @@ -1080,7 +1072,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { LiveInterval *I = IP.first; const TargetRegisterClass *RegRC = OneClassForEachPhysReg[I->reg]; - if (RelatedRegClasses.getLeaderValue(RegRC) == RCLeader && + if (RelatedRegClasses.getLeaderValue(RegRC) == RCLeader && I->endIndex() > StartPosition) { LiveInterval::iterator II = I->advanceTo(IP.second, StartPosition); IP.second = II; @@ -1099,11 +1091,11 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { physReg = getFreePhysReg(cur); } } - + // Restore the physical register tracker, removing information about the // future. 
restoreRegUses(); - + // If we find a free register, we are done: assign this virtual to // the free physical register and add this interval to the active // list. @@ -1118,7 +1110,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { UpgradeRegister(physReg); if (LiveInterval *NextReloadLI = hasNextReloadInterval(cur)) { // "Downgrade" physReg to try to keep physReg from being allocated until - // the next reload from the same SS is allocated. + // the next reload from the same SS is allocated. mri_->setRegAllocationHint(NextReloadLI->reg, 0, physReg); DowngradeRegister(cur, physReg); } @@ -1131,7 +1123,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { for (std::vector<std::pair<unsigned, float> >::iterator I = SpillWeightsToAdd.begin(), E = SpillWeightsToAdd.end(); I != E; ++I) updateSpillWeights(SpillWeights, I->first, I->second, RC); - + // for each interval in active, update spill weights. for (IntervalPtrs::const_iterator i = active_.begin(), e = active_.end(); i != e; ++i) { @@ -1141,7 +1133,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { reg = vrm_->getPhys(reg); updateSpillWeights(SpillWeights, reg, i->first->weight, RC); } - + DEBUG(dbgs() << "\tassigning stack slot at interval "<< *cur << ":\n"); // Find a register to spill. @@ -1155,17 +1147,22 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { e = RC->allocation_order_end(*mf_); i != e; ++i) { unsigned reg = *i; float regWeight = SpillWeights[reg]; - // Skip recently allocated registers. + // Don't even consider reserved regs. + if (reservedRegs_.test(reg)) + continue; + // Skip recently allocated registers and reserved registers. if (minWeight > regWeight && !isRecentlyUsed(reg)) Found = true; RegsWeights.push_back(std::make_pair(reg, regWeight)); } - + // If we didn't find a register that is spillable, try aliases? if (!Found) { for (TargetRegisterClass::iterator i = RC->allocation_order_begin(*mf_), e = RC->allocation_order_end(*mf_); i != e; ++i) { unsigned reg = *i; + if (reservedRegs_.test(reg)) + continue; // No need to worry about if the alias register size < regsize of RC. // We are going to spill all registers that alias it anyway. for (const unsigned* as = tri_->getAliasSet(reg); *as; ++as) @@ -1179,7 +1176,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { minWeight = RegsWeights[0].second; if (minWeight == HUGE_VALF) { // All registers must have inf weight. Just grab one! - minReg = BestPhysReg ? BestPhysReg : *RC->allocation_order_begin(*mf_); + minReg = BestPhysReg ? BestPhysReg : getFirstNonReservedPhysReg(RC); if (cur->weight == HUGE_VALF || li_->getApproximateInstructionCount(*cur) == 0) { // Spill a physical register around defs and uses. @@ -1224,8 +1221,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { // linearscan. 
if (cur->weight != HUGE_VALF && cur->weight <= minWeight) { DEBUG(dbgs() << "\t\t\tspilling(c): " << *cur << '\n'); - SmallVector<LiveInterval*, 8> spillIs; - std::vector<LiveInterval*> added; + SmallVector<LiveInterval*, 8> spillIs, added; spiller_->spill(cur, added, spillIs); std::sort(added.begin(), added.end(), LISorter()); @@ -1288,27 +1284,33 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { // The earliest start of a Spilled interval indicates up to where // in handled we need to roll back - assert(!spillIs.empty() && "No spill intervals?"); + assert(!spillIs.empty() && "No spill intervals?"); SlotIndex earliestStart = spillIs[0]->beginIndex(); - + // Spill live intervals of virtual regs mapped to the physical register we // want to clear (and its aliases). We only spill those that overlap with the // current interval as the rest do not affect its allocation. we also keep // track of the earliest start of all spilled live intervals since this will // mark our rollback point. - std::vector<LiveInterval*> added; + SmallVector<LiveInterval*, 8> added; while (!spillIs.empty()) { LiveInterval *sli = spillIs.back(); spillIs.pop_back(); DEBUG(dbgs() << "\t\t\tspilling(a): " << *sli << '\n'); if (sli->beginIndex() < earliestStart) earliestStart = sli->beginIndex(); - - spiller_->spill(sli, added, spillIs, &earliestStart); + spiller_->spill(sli, added, spillIs); addStackInterval(sli, ls_, li_, mri_, *vrm_); spilled.insert(sli->reg); } + // Include any added intervals in earliestStart. + for (unsigned i = 0, e = added.size(); i != e; ++i) { + SlotIndex SI = added[i]->beginIndex(); + if (SI < earliestStart) + earliestStart = SI; + } + DEBUG(dbgs() << "\t\trolling back to: " << earliestStart << '\n'); // Scan handled in reverse order up to the earliest start of a @@ -1431,6 +1433,9 @@ unsigned RALinScan::getFreePhysReg(LiveInterval* cur, // Ignore "downgraded" registers. if (SkipDGRegs && DowngradedRegs.count(Reg)) continue; + // Skip reserved registers. + if (reservedRegs_.test(Reg)) + continue; // Skip recently allocated registers. if (isRegAvail(Reg) && !isRecentlyUsed(Reg)) { FreeReg = Reg; @@ -1459,6 +1464,9 @@ unsigned RALinScan::getFreePhysReg(LiveInterval* cur, // Ignore "downgraded" registers. if (SkipDGRegs && DowngradedRegs.count(Reg)) continue; + // Skip reserved registers. + if (reservedRegs_.test(Reg)) + continue; if (isRegAvail(Reg) && Reg < inactiveCounts.size() && FreeRegInactiveCount < inactiveCounts[Reg] && !isRecentlyUsed(Reg)) { FreeReg = Reg; @@ -1479,17 +1487,17 @@ unsigned RALinScan::getFreePhysReg(LiveInterval* cur, unsigned RALinScan::getFreePhysReg(LiveInterval *cur) { SmallVector<unsigned, 256> inactiveCounts; unsigned MaxInactiveCount = 0; - + const TargetRegisterClass *RC = mri_->getRegClass(cur->reg); const TargetRegisterClass *RCLeader = RelatedRegClasses.getLeaderValue(RC); - + for (IntervalPtrs::iterator i = inactive_.begin(), e = inactive_.end(); i != e; ++i) { unsigned reg = i->first->reg; assert(TargetRegisterInfo::isVirtualRegister(reg) && "Can only allocate virtual registers!"); - // If this is not in a related reg class to the register we're allocating, + // If this is not in a related reg class to the register we're allocating, // don't check it. 
     const TargetRegisterClass *RegRC = mri_->getRegClass(reg);
     if (RelatedRegClasses.getLeaderValue(RegRC) == RCLeader) {
@@ -1506,7 +1514,7 @@ unsigned RALinScan::getFreePhysReg(LiveInterval *cur) {
   unsigned Preference = vrm_->getRegAllocPref(cur->reg);
   if (Preference) {
     DEBUG(dbgs() << "(preferred: " << tri_->getName(Preference) << ") ");
-    if (isRegAvail(Preference) && 
+    if (isRegAvail(Preference) &&
         RC->contains(Preference))
       return Preference;
   }
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index 7e61a12a7eea5..61f337bab49c4 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -34,6 +34,8 @@
 #include "PBQP/HeuristicSolver.h"
 #include "PBQP/Graph.h"
 #include "PBQP/Heuristics/Briggs.h"
+#include "RenderMachineFunction.h"
+#include "Splitter.h"
 #include "VirtRegMap.h"
 #include "VirtRegRewriter.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
@@ -65,6 +67,11 @@
 pbqpCoalescing("pbqp-coalescing",
                cl::desc("Attempt coalescing during PBQP register allocation."),
                cl::init(false), cl::Hidden);

+static cl::opt<bool>
+pbqpPreSplitting("pbqp-pre-splitting",
+                 cl::desc("Pre-split before PBQP register allocation."),
+                 cl::init(false), cl::Hidden);
+
 namespace {

   ///
@@ -77,7 +84,7 @@ namespace {
     static char ID;

     /// Construct a PBQP register allocator.
-    PBQPRegAlloc() : MachineFunctionPass(&ID) {}
+    PBQPRegAlloc() : MachineFunctionPass(ID) {}

     /// Return the pass name.
     virtual const char* getPassName() const {
@@ -96,7 +103,10 @@ namespace {
       au.addPreserved<LiveStacks>();
       au.addRequired<MachineLoopInfo>();
       au.addPreserved<MachineLoopInfo>();
+      if (pbqpPreSplitting)
+        au.addRequired<LoopSplitter>();
       au.addRequired<VirtRegMap>();
+      au.addRequired<RenderMachineFunction>();
       MachineFunctionPass::getAnalysisUsage(au);
     }

@@ -104,7 +114,15 @@ namespace {
     virtual bool runOnMachineFunction(MachineFunction &MF);

   private:
-    typedef std::map<const LiveInterval*, unsigned> LI2NodeMap;
+
+    class LIOrdering {
+    public:
+      bool operator()(const LiveInterval *li1, const LiveInterval *li2) const {
+        return li1->reg < li2->reg;
+      }
+    };
+
+    typedef std::map<const LiveInterval*, unsigned, LIOrdering> LI2NodeMap;
     typedef std::vector<const LiveInterval*> Node2LIMap;
     typedef std::vector<unsigned> AllowedSet;
     typedef std::vector<AllowedSet> AllowedSetMap;
@@ -112,7 +130,7 @@ namespace {
     typedef std::pair<unsigned, unsigned> RegPair;
     typedef std::map<RegPair, PBQP::PBQPNum> CoalesceMap;

-    typedef std::set<LiveInterval*> LiveIntervalSet;
+    typedef std::set<LiveInterval*, LIOrdering> LiveIntervalSet;

     typedef std::vector<PBQP::Graph::NodeItr> NodeVector;

@@ -122,6 +140,7 @@ namespace {
     const TargetInstrInfo *tii;
     const MachineLoopInfo *loopInfo;
     MachineRegisterInfo *mri;
+    RenderMachineFunction *rmf;

     LiveIntervals *lis;
     LiveStacks *lss;
@@ -379,12 +398,14 @@ PBQPRegAlloc::CoalesceMap PBQPRegAlloc::findCoalesces() {
          iItr != iEnd; ++iItr) {

       const MachineInstr *instr = &*iItr;

-      unsigned srcReg, dstReg, srcSubReg, dstSubReg;

       // If this isn't a copy then continue to the next instruction.
-      if (!tii->isMoveInstr(*instr, srcReg, dstReg, srcSubReg, dstSubReg))
+      if (!instr->isCopy())
         continue;

+      unsigned srcReg = instr->getOperand(1).getReg();
+      unsigned dstReg = instr->getOperand(0).getReg();
+
       // If the registers are already the same our job is nice and easy.
       if (dstReg == srcReg)
         continue;

@@ -567,6 +588,8 @@ PBQP::Graph PBQPRegAlloc::constructPBQPProblem() {

   // Resize allowedSets container appropriately.
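// --- Illustrative aside (editor's sketch, not part of the patch) ---
// The LIOrdering comparator introduced above keys the node maps and sets by
// virtual register number instead of by LiveInterval* address, presumably so
// that iteration order, and hence PBQP graph construction, is the same on
// every run. Usage sketch; li1 and li2 are placeholder intervals:
LiveIntervalSet vregIntervals;   // std::set<LiveInterval*, LIOrdering>
vregIntervals.insert(li1);
vregIntervals.insert(li2);
// Iteration now visits intervals in ascending li->reg order, independent of
// where the LiveInterval objects happen to be allocated on the heap.
// --- end aside ---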
allowedSets.resize(vregIntervalsToAlloc.size()); + BitVector ReservedRegs = tri->getReservedRegs(*mf); + // Iterate over virtual register intervals to compute allowed sets... for (unsigned node = 0; node < node2LI.size(); ++node) { @@ -575,8 +598,12 @@ PBQP::Graph PBQPRegAlloc::constructPBQPProblem() { const TargetRegisterClass *liRC = mri->getRegClass(li->reg); // Start by assuming all allocable registers in the class are allowed... - RegVector liAllowed(liRC->allocation_order_begin(*mf), - liRC->allocation_order_end(*mf)); + RegVector liAllowed; + TargetRegisterClass::iterator aob = liRC->allocation_order_begin(*mf); + TargetRegisterClass::iterator aoe = liRC->allocation_order_end(*mf); + for (TargetRegisterClass::iterator it = aob; it != aoe; ++it) + if (!ReservedRegs.test(*it)) + liAllowed.push_back(*it); // Eliminate the physical registers which overlap with this range, along // with all their aliases. @@ -735,9 +762,11 @@ bool PBQPRegAlloc::mapPBQPToRegAlloc(const PBQP::Solution &solution) { const LiveInterval *spillInterval = node2LI[node]; double oldSpillWeight = spillInterval->weight; SmallVector<LiveInterval*, 8> spillIs; + rmf->rememberUseDefs(spillInterval); std::vector<LiveInterval*> newSpills = lis->addIntervalsForSpills(*spillInterval, spillIs, loopInfo, *vrm); addStackInterval(spillInterval, mri); + rmf->rememberSpills(spillInterval, newSpills); (void) oldSpillWeight; DEBUG(dbgs() << "VREG " << virtReg << " -> SPILLED (Cost: " @@ -845,9 +874,11 @@ bool PBQPRegAlloc::runOnMachineFunction(MachineFunction &MF) { lis = &getAnalysis<LiveIntervals>(); lss = &getAnalysis<LiveStacks>(); loopInfo = &getAnalysis<MachineLoopInfo>(); + rmf = &getAnalysis<RenderMachineFunction>(); vrm = &getAnalysis<VirtRegMap>(); + DEBUG(dbgs() << "PBQP Register Allocating for " << mf->getFunction()->getName() << "\n"); // Allocator main loop: @@ -884,6 +915,8 @@ bool PBQPRegAlloc::runOnMachineFunction(MachineFunction &MF) { // Finalise allocation, allocate empty ranges. finalizeAlloc(); + rmf->renderMachineFunction("After PBQP register allocation.", vrm); + vregIntervalsToAlloc.clear(); emptyVRegIntervals.clear(); li2Node.clear(); diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index ab0bc2d78a608..02b5539f0f4f0 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -54,9 +54,8 @@ bool CoalescerPair::isMoveInstr(const MachineInstr *MI, DstSub = compose(MI->getOperand(0).getSubReg(), MI->getOperand(3).getImm()); Src = MI->getOperand(2).getReg(); SrcSub = MI->getOperand(2).getSubReg(); - } else if (!tii_.isMoveInstr(*MI, Src, Dst, SrcSub, DstSub)) { + } else return false; - } return true; } diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 43b3fb6426351..a2580b85bcc33 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -21,7 +21,9 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -226,19 +228,14 @@ void RegScavenger::getRegsUsed(BitVector &used, bool includeReserved) { used = ~RegsAvailable & ~ReservedRegs; } -/// CreateRegClassMask - Set the bits that represent the registers in the -/// TargetRegisterClass. 
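// --- Illustrative aside (editor's sketch, not part of the patch) ---
// A recurring theme in this patch: registers reported by
// TargetRegisterInfo::getReservedRegs() must never be handed out. PBQP's
// allowed sets above, linear scan's candidate loops, and the scavenger below
// all apply the same filter up front. The common shape, with TRI, MF and RC
// as stand-ins for the locals used at each site:
BitVector Reserved = TRI->getReservedRegs(MF);
SmallVector<unsigned, 32> Candidates;
for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF),
       E = RC->allocation_order_end(MF); I != E; ++I)
  if (!Reserved.test(*I))
    Candidates.push_back(*I);   // allocatable and not reserved
// --- end aside ---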
-static void CreateRegClassMask(const TargetRegisterClass *RC, BitVector &Mask) { - for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I != E; - ++I) - Mask.set(*I); -} - unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const { for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I != E; ++I) - if (!isAliasUsed(*I)) + if (!isAliasUsed(*I)) { + DEBUG(dbgs() << "Scavenger found unused reg: " << TRI->getName(*I) << + "\n"); return *I; + } return 0; } @@ -325,11 +322,9 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI, unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, MachineBasicBlock::iterator I, int SPAdj) { - // Mask off the registers which are not in the TargetRegisterClass. - BitVector Candidates(NumPhysRegs, false); - CreateRegClassMask(RC, Candidates); - // Do not include reserved registers. - Candidates ^= ReservedRegs & Candidates; + // Consider all allocatable registers in the register class initially + BitVector Candidates = + TRI->getAllocatableSet(*I->getParent()->getParent(), RC); // Exclude all the registers being used by the instruction. for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { @@ -349,8 +344,10 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI); // If we found an unused register there is no reason to spill it. - if (!isAliasUsed(SReg)) + if (!isAliasUsed(SReg)) { + DEBUG(dbgs() << "Scavenged register: " << TRI->getName(SReg) << "\n"); return SReg; + } assert(ScavengedReg == 0 && "Scavenger slot is live, unable to scavenge another register!"); @@ -366,12 +363,12 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, "Cannot scavenge register without an emergency spill slot!"); TII->storeRegToStackSlot(*MBB, I, SReg, true, ScavengingFrameIndex, RC,TRI); MachineBasicBlock::iterator II = prior(I); - TRI->eliminateFrameIndex(II, SPAdj, NULL, this); + TRI->eliminateFrameIndex(II, SPAdj, this); // Restore the scavenged register before its use (or first terminator). TII->loadRegFromStackSlot(*MBB, UseMI, SReg, ScavengingFrameIndex, RC, TRI); II = prior(UseMI); - TRI->eliminateFrameIndex(II, SPAdj, NULL, this); + TRI->eliminateFrameIndex(II, SPAdj, this); } ScavengeRestore = prior(UseMI); @@ -380,5 +377,8 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, // ScavengedReg = SReg; ScavengedRC = RC; + DEBUG(dbgs() << "Scavenged register (with spill): " << TRI->getName(SReg) << + "\n"); + return SReg; } diff --git a/lib/CodeGen/RenderMachineFunction.cpp b/lib/CodeGen/RenderMachineFunction.cpp new file mode 100644 index 0000000000000..93426eecbbc1e --- /dev/null +++ b/lib/CodeGen/RenderMachineFunction.cpp @@ -0,0 +1,1014 @@ +//===-- llvm/CodeGen/RenderMachineFunction.cpp - MF->HTML -----s-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "rendermf"
+
+#include "RenderMachineFunction.h"
+
+#include "VirtRegMap.h"
+
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <sstream>
+
+using namespace llvm;
+
+char RenderMachineFunction::ID = 0;
+INITIALIZE_PASS(RenderMachineFunction, "rendermf",
+                "Render machine functions (and related info) to HTML pages",
+                false, false);
+
+static cl::opt<std::string>
+outputFileSuffix("rmf-file-suffix",
+                 cl::desc("Appended to function name to get output file name "
+                          "(default: \".html\")"),
+                 cl::init(".html"), cl::Hidden);
+
+static cl::opt<std::string>
+machineFuncsToRender("rmf-funcs",
+                     cl::desc("Comma separated list of functions to render"
+                              ", or \"*\"."),
+                     cl::init(""), cl::Hidden);
+
+static cl::opt<std::string>
+pressureClasses("rmf-classes",
+                cl::desc("Register classes to render pressure for."),
+                cl::init(""), cl::Hidden);
+
+static cl::opt<std::string>
+showIntervals("rmf-intervals",
+              cl::desc("Live intervals to show alongside code."),
+              cl::init(""), cl::Hidden);
+
+static cl::opt<bool>
+filterEmpty("rmf-filter-empty-intervals",
+            cl::desc("Don't display empty intervals."),
+            cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+showEmptyIndexes("rmf-empty-indexes",
+                 cl::desc("Render indexes not associated with instructions or "
+                          "MBB starts."),
+                 cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+useFancyVerticals("rmf-fancy-verts",
+                  cl::desc("Use SVG for vertical text."),
+                  cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+prettyHTML("rmf-pretty-html",
+           cl::desc("Pretty print HTML. 
For debugging the renderer only.."), + cl::init(false), cl::Hidden); + + +namespace llvm { + + bool MFRenderingOptions::renderingOptionsProcessed; + std::set<std::string> MFRenderingOptions::mfNamesToRender; + bool MFRenderingOptions::renderAllMFs = false; + + std::set<std::string> MFRenderingOptions::classNamesToRender; + bool MFRenderingOptions::renderAllClasses = false; + + std::set<std::pair<unsigned, unsigned> > + MFRenderingOptions::intervalNumsToRender; + unsigned MFRenderingOptions::intervalTypesToRender = ExplicitOnly; + + template <typename OutputItr> + void MFRenderingOptions::splitComaSeperatedList(const std::string &s, + OutputItr outItr) { + std::string::const_iterator curPos = s.begin(); + std::string::const_iterator nextComa = std::find(curPos, s.end(), ','); + while (nextComa != s.end()) { + std::string elem; + std::copy(curPos, nextComa, std::back_inserter(elem)); + *outItr = elem; + ++outItr; + curPos = llvm::next(nextComa); + nextComa = std::find(curPos, s.end(), ','); + } + + if (curPos != s.end()) { + std::string elem; + std::copy(curPos, s.end(), std::back_inserter(elem)); + *outItr = elem; + ++outItr; + } + } + + void MFRenderingOptions::processOptions() { + if (!renderingOptionsProcessed) { + processFuncNames(); + processRegClassNames(); + processIntervalNumbers(); + renderingOptionsProcessed = true; + } + } + + void MFRenderingOptions::processFuncNames() { + if (machineFuncsToRender == "*") { + renderAllMFs = true; + } else { + splitComaSeperatedList(machineFuncsToRender, + std::inserter(mfNamesToRender, + mfNamesToRender.begin())); + } + } + + void MFRenderingOptions::processRegClassNames() { + if (pressureClasses == "*") { + renderAllClasses = true; + } else { + splitComaSeperatedList(pressureClasses, + std::inserter(classNamesToRender, + classNamesToRender.begin())); + } + } + + void MFRenderingOptions::processIntervalNumbers() { + std::set<std::string> intervalRanges; + splitComaSeperatedList(showIntervals, + std::inserter(intervalRanges, + intervalRanges.begin())); + std::for_each(intervalRanges.begin(), intervalRanges.end(), + processIntervalRange); + } + + void MFRenderingOptions::processIntervalRange( + const std::string &intervalRangeStr) { + if (intervalRangeStr == "*") { + intervalTypesToRender |= All; + } else if (intervalRangeStr == "virt-nospills*") { + intervalTypesToRender |= VirtNoSpills; + } else if (intervalRangeStr == "spills*") { + intervalTypesToRender |= VirtSpills; + } else if (intervalRangeStr == "virt*") { + intervalTypesToRender |= AllVirt; + } else if (intervalRangeStr == "phys*") { + intervalTypesToRender |= AllPhys; + } else { + std::istringstream iss(intervalRangeStr); + unsigned reg1, reg2; + if ((iss >> reg1 >> std::ws)) { + if (iss.eof()) { + intervalNumsToRender.insert(std::make_pair(reg1, reg1 + 1)); + } else { + char c; + iss >> c; + if (c == '-' && (iss >> reg2)) { + intervalNumsToRender.insert(std::make_pair(reg1, reg2 + 1)); + } else { + dbgs() << "Warning: Invalid interval range \"" + << intervalRangeStr << "\" in -rmf-intervals. Skipping.\n"; + } + } + } else { + dbgs() << "Warning: Invalid interval number \"" + << intervalRangeStr << "\" in -rmf-intervals. 
Skipping.\n"; + } + } + } + + void MFRenderingOptions::setup(MachineFunction *mf, + const TargetRegisterInfo *tri, + LiveIntervals *lis, + const RenderMachineFunction *rmf) { + this->mf = mf; + this->tri = tri; + this->lis = lis; + this->rmf = rmf; + + clear(); + } + + void MFRenderingOptions::clear() { + regClassesTranslatedToCurrentFunction = false; + regClassSet.clear(); + + intervalsTranslatedToCurrentFunction = false; + intervalSet.clear(); + } + + void MFRenderingOptions::resetRenderSpecificOptions() { + intervalSet.clear(); + intervalsTranslatedToCurrentFunction = false; + } + + bool MFRenderingOptions::shouldRenderCurrentMachineFunction() const { + processOptions(); + + return (renderAllMFs || + mfNamesToRender.find(mf->getFunction()->getName()) != + mfNamesToRender.end()); + } + + const MFRenderingOptions::RegClassSet& MFRenderingOptions::regClasses() const{ + translateRegClassNamesToCurrentFunction(); + return regClassSet; + } + + const MFRenderingOptions::IntervalSet& MFRenderingOptions::intervals() const { + translateIntervalNumbersToCurrentFunction(); + return intervalSet; + } + + bool MFRenderingOptions::renderEmptyIndexes() const { + return showEmptyIndexes; + } + + bool MFRenderingOptions::fancyVerticals() const { + return useFancyVerticals; + } + + void MFRenderingOptions::translateRegClassNamesToCurrentFunction() const { + if (!regClassesTranslatedToCurrentFunction) { + processOptions(); + for (TargetRegisterInfo::regclass_iterator rcItr = tri->regclass_begin(), + rcEnd = tri->regclass_end(); + rcItr != rcEnd; ++rcItr) { + const TargetRegisterClass *trc = *rcItr; + if (renderAllClasses || + classNamesToRender.find(trc->getName()) != + classNamesToRender.end()) { + regClassSet.insert(trc); + } + } + regClassesTranslatedToCurrentFunction = true; + } + } + + void MFRenderingOptions::translateIntervalNumbersToCurrentFunction() const { + if (!intervalsTranslatedToCurrentFunction) { + processOptions(); + + // If we're not just doing explicit then do a copy over all matching + // types. + if (intervalTypesToRender != ExplicitOnly) { + for (LiveIntervals::iterator liItr = lis->begin(), liEnd = lis->end(); + liItr != liEnd; ++liItr) { + LiveInterval *li = liItr->second; + + if (filterEmpty && li->empty()) + continue; + + if ((TargetRegisterInfo::isPhysicalRegister(li->reg) && + (intervalTypesToRender & AllPhys))) { + intervalSet.insert(li); + } else if (TargetRegisterInfo::isVirtualRegister(li->reg)) { + if (((intervalTypesToRender & VirtNoSpills) && !rmf->isSpill(li)) || + ((intervalTypesToRender & VirtSpills) && rmf->isSpill(li))) { + intervalSet.insert(li); + } + } + } + } + + // If we need to process the explicit list... 
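// --- Illustrative aside (editor's sketch, not part of the patch) ---
// The explicit list processed below comes from processIntervalRange() above,
// which accepts either "N" or "N-M" (inclusive) and stores half-open ranges.
// A self-contained restatement of that parse, assuming the usual <sstream>,
// <string> and <utility> headers:
static bool parseIntervalRange(const std::string &s,
                               std::pair<unsigned, unsigned> &out) {
  std::istringstream iss(s);
  unsigned lo, hi;
  char dash;
  if (!(iss >> lo))
    return false;                       // not a number at all
  if ((iss >> std::ws).eof()) {
    out = std::make_pair(lo, lo + 1);   // single interval "N"
    return true;
  }
  if ((iss >> dash) && dash == '-' && (iss >> hi)) {
    out = std::make_pair(lo, hi + 1);   // range "N-M", stored half-open
    return true;
  }
  return false;
}
// --- end aside ---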
+ if (intervalTypesToRender != All) { + for (std::set<std::pair<unsigned, unsigned> >::const_iterator + regRangeItr = intervalNumsToRender.begin(), + regRangeEnd = intervalNumsToRender.end(); + regRangeItr != regRangeEnd; ++regRangeItr) { + const std::pair<unsigned, unsigned> &range = *regRangeItr; + for (unsigned reg = range.first; reg != range.second; ++reg) { + if (lis->hasInterval(reg)) { + intervalSet.insert(&lis->getInterval(reg)); + } + } + } + } + + intervalsTranslatedToCurrentFunction = true; + } + } + + // ---------- TargetRegisterExtraInformation implementation ---------- + + TargetRegisterExtraInfo::TargetRegisterExtraInfo() + : mapsPopulated(false) { + } + + void TargetRegisterExtraInfo::setup(MachineFunction *mf, + MachineRegisterInfo *mri, + const TargetRegisterInfo *tri, + LiveIntervals *lis) { + this->mf = mf; + this->mri = mri; + this->tri = tri; + this->lis = lis; + } + + void TargetRegisterExtraInfo::reset() { + if (!mapsPopulated) { + initWorst(); + //initBounds(); + initCapacity(); + mapsPopulated = true; + } + + resetPressureAndLiveStates(); + } + + void TargetRegisterExtraInfo::clear() { + prWorst.clear(); + vrWorst.clear(); + capacityMap.clear(); + pressureMap.clear(); + //liveStatesMap.clear(); + mapsPopulated = false; + } + + void TargetRegisterExtraInfo::initWorst() { + assert(!mapsPopulated && prWorst.empty() && vrWorst.empty() && + "Worst map already initialised?"); + + // Start with the physical registers. + for (unsigned preg = 1; preg < tri->getNumRegs(); ++preg) { + WorstMapLine &pregLine = prWorst[preg]; + + for (TargetRegisterInfo::regclass_iterator rcItr = tri->regclass_begin(), + rcEnd = tri->regclass_end(); + rcItr != rcEnd; ++rcItr) { + const TargetRegisterClass *trc = *rcItr; + + unsigned numOverlaps = 0; + for (TargetRegisterClass::iterator rItr = trc->begin(), + rEnd = trc->end(); + rItr != rEnd; ++rItr) { + unsigned trcPReg = *rItr; + if (tri->regsOverlap(preg, trcPReg)) + ++numOverlaps; + } + + pregLine[trc] = numOverlaps; + } + } + + // Now the register classes. 
+    for (TargetRegisterInfo::regclass_iterator rc1Itr = tri->regclass_begin(),
+                                               rcEnd = tri->regclass_end();
+         rc1Itr != rcEnd; ++rc1Itr) {
+      const TargetRegisterClass *trc1 = *rc1Itr;
+      WorstMapLine &classLine = vrWorst[trc1];
+
+      for (TargetRegisterInfo::regclass_iterator rc2Itr = tri->regclass_begin();
+           rc2Itr != rcEnd; ++rc2Itr) {
+        const TargetRegisterClass *trc2 = *rc2Itr;
+
+        unsigned worst = 0;
+
+        for (TargetRegisterClass::iterator trc1Itr = trc1->begin(),
+                                           trc1End = trc1->end();
+             trc1Itr != trc1End; ++trc1Itr) {
+          unsigned trc1Reg = *trc1Itr;
+          unsigned trc1RegWorst = 0;
+
+          for (TargetRegisterClass::iterator trc2Itr = trc2->begin(),
+                                             trc2End = trc2->end();
+               trc2Itr != trc2End; ++trc2Itr) {
+            unsigned trc2Reg = *trc2Itr;
+            if (tri->regsOverlap(trc1Reg, trc2Reg))
+              ++trc1RegWorst;
+          }
+          if (trc1RegWorst > worst) {
+            worst = trc1RegWorst;
+          }
+        }
+
+        if (worst != 0) {
+          classLine[trc2] = worst;
+        }
+      }
+    }
+  }
+
+  unsigned TargetRegisterExtraInfo::getWorst(
+                                        unsigned reg,
+                                        const TargetRegisterClass *trc) const {
+    const WorstMapLine *wml = 0;
+    if (TargetRegisterInfo::isPhysicalRegister(reg)) {
+      PRWorstMap::const_iterator prwItr = prWorst.find(reg);
+      assert(prwItr != prWorst.end() && "Missing prWorst entry.");
+      wml = &prwItr->second;
+    } else {
+      const TargetRegisterClass *regTRC = mri->getRegClass(reg);
+      VRWorstMap::const_iterator vrwItr = vrWorst.find(regTRC);
+      assert(vrwItr != vrWorst.end() && "Missing vrWorst entry.");
+      wml = &vrwItr->second;
+    }
+
+    WorstMapLine::const_iterator wmlItr = wml->find(trc);
+    if (wmlItr == wml->end())
+      return 0;
+
+    return wmlItr->second;
+  }
+
+  void TargetRegisterExtraInfo::initCapacity() {
+    assert(!mapsPopulated && capacityMap.empty() &&
+           "Capacity map already initialised?");
+
+    for (TargetRegisterInfo::regclass_iterator rcItr = tri->regclass_begin(),
+         rcEnd = tri->regclass_end();
+         rcItr != rcEnd; ++rcItr) {
+      const TargetRegisterClass *trc = *rcItr;
+      unsigned capacity = std::distance(trc->allocation_order_begin(*mf),
+                                        trc->allocation_order_end(*mf));
+
+      if (capacity != 0)
+        capacityMap[trc] = capacity;
+    }
+  }
+
+  unsigned TargetRegisterExtraInfo::getCapacity(
+                                         const TargetRegisterClass *trc) const {
+    CapacityMap::const_iterator cmItr = capacityMap.find(trc);
+    assert(cmItr != capacityMap.end() &&
+           "vreg with unallocable register class");
+    return cmItr->second;
+  }
+
+  void TargetRegisterExtraInfo::resetPressureAndLiveStates() {
+    pressureMap.clear();
+    //liveStatesMap.clear();
+
+    // Iterate over all slots.
+
+
+    // Iterate over all live intervals.
+    for (LiveIntervals::iterator liItr = lis->begin(),
+                                 liEnd = lis->end();
+         liItr != liEnd; ++liItr) {
+      LiveInterval *li = liItr->second;
+
+      const TargetRegisterClass *liTRC;
+
+      if (TargetRegisterInfo::isPhysicalRegister(li->reg))
+        continue;
+
+      liTRC = mri->getRegClass(li->reg);
+
+
+      // For all ranges in the current interval.
+      for (LiveInterval::iterator lrItr = li->begin(),
+             lrEnd = li->end();
+           lrItr != lrEnd; ++lrItr) {
+        LiveRange *lr = &*lrItr;
+
+        // For all slots in the current range.
+        for (SlotIndex i = lr->start; i != lr->end; i = i.getNextSlot()) {
+
+          // Record increased pressure at index for all overlapping classes.
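// --- Illustrative aside (editor's note, not part of the patch) ---
// The loop that follows implements a worst-case pressure model. For a slot
// index i and a register class C it accumulates
//
//   pressure(i, C) = sum over intervals li live at i of worst(reg(li), C)
//
// where worst(r, C) is the largest number of registers of C that r (or any
// register of r's class, for virtual registers) can overlap, as computed by
// initWorst() above. getPressureAtSlot() then compares this value against
// capacity(C), the size of C's allocation order, and
// classOverCapacityAtSlot() flags slots where pressure(i, C) > capacity(C).
// --- end aside ---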
+          for (TargetRegisterInfo::regclass_iterator
+                 rcItr = tri->regclass_begin(),
+                 rcEnd = tri->regclass_end();
+               rcItr != rcEnd; ++rcItr) {
+            const TargetRegisterClass *trc = *rcItr;
+
+            if (trc->allocation_order_begin(*mf) ==
+                trc->allocation_order_end(*mf))
+              continue;
+
+            unsigned worstAtI = getWorst(li->reg, trc);
+
+            if (worstAtI != 0) {
+              pressureMap[i][trc] += worstAtI;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  unsigned TargetRegisterExtraInfo::getPressureAtSlot(
+                                                 const TargetRegisterClass *trc,
+                                                 SlotIndex i) const {
+    PressureMap::const_iterator pmItr = pressureMap.find(i);
+    if (pmItr == pressureMap.end())
+      return 0;
+    const PressureMapLine &pmLine = pmItr->second;
+    PressureMapLine::const_iterator pmlItr = pmLine.find(trc);
+    if (pmlItr == pmLine.end())
+      return 0;
+    return pmlItr->second;
+  }
+
+  bool TargetRegisterExtraInfo::classOverCapacityAtSlot(
+                                                 const TargetRegisterClass *trc,
+                                                 SlotIndex i) const {
+    return (getPressureAtSlot(trc, i) > getCapacity(trc));
+  }
+
+  // ---------- MachineFunctionRenderer implementation ----------
+
+  void RenderMachineFunction::Spacer::print(raw_ostream &os) const {
+    if (!prettyHTML)
+      return;
+    for (unsigned i = 0; i < ns; ++i) {
+      os << " ";
+    }
+  }
+
+  RenderMachineFunction::Spacer RenderMachineFunction::s(unsigned ns) const {
+    return Spacer(ns);
+  }
+
+  raw_ostream& operator<<(raw_ostream &os, const RenderMachineFunction::Spacer &s) {
+    s.print(os);
+    return os;
+  }
+
+  template <typename Iterator>
+  std::string RenderMachineFunction::escapeChars(Iterator sBegin, Iterator sEnd) const {
+    std::string r;
+
+    for (Iterator sItr = sBegin; sItr != sEnd; ++sItr) {
+      char c = *sItr;
+
+      switch (c) {
+        case '<': r.append("&lt;"); break;
+        case '>': r.append("&gt;"); break;
+        case '&': r.append("&amp;"); break;
+        case ' ': r.append("&nbsp;"); break;
+        case '\"': r.append("&quot;"); break;
+        default: r.push_back(c); break;
+      }
+    }
+
+    return r;
+  }
+
+  RenderMachineFunction::LiveState
+  RenderMachineFunction::getLiveStateAt(const LiveInterval *li,
+                                        SlotIndex i) const {
+    const MachineInstr *mi = sis->getInstructionFromIndex(i);
+
+    // For uses/defs recorded use/def indexes override current liveness and
+    // instruction operands (Only for the interval which records the indexes).
+    if (i.isUse() || i.isDef()) {
+      UseDefs::const_iterator udItr = useDefs.find(li);
+      if (udItr != useDefs.end()) {
+        const SlotSet &slotSet = udItr->second;
+        if (slotSet.count(i)) {
+          if (i.isUse()) {
+            return Used;
+          }
+          // else
+          return Defined;
+        }
+      }
+    }
+
+    // If the slot is a load/store, or there's no info in the use/def set then
+    // use liveness and instruction operand info.
+    if (li->liveAt(i)) {
+
+      if (mi == 0) {
+        if (vrm == 0 ||
+            (vrm->getStackSlot(li->reg) == VirtRegMap::NO_STACK_SLOT)) {
+          return AliveReg;
+        } else {
+          return AliveStack;
+        }
+      } else {
+        if (i.isDef() && mi->definesRegister(li->reg, tri)) {
+          return Defined;
+        } else if (i.isUse() && mi->readsRegister(li->reg)) {
+          return Used;
+        } else {
+          if (vrm == 0 ||
+              (vrm->getStackSlot(li->reg) == VirtRegMap::NO_STACK_SLOT)) {
+            return AliveReg;
+          } else {
+            return AliveStack;
+          }
+        }
+      }
+    }
+    return Dead;
+  }
+
+  RenderMachineFunction::PressureState
+  RenderMachineFunction::getPressureStateAt(const TargetRegisterClass *trc,
+                                            SlotIndex i) const {
+    if (trei.getPressureAtSlot(trc, i) == 0) {
+      return Zero;
+    } else if (trei.classOverCapacityAtSlot(trc, i)){
+      return High;
+    }
+    return Low;
+  }
+
+  /// \brief Render a machine instruction.
+ void RenderMachineFunction::renderMachineInstr(raw_ostream &os, + const MachineInstr *mi) const { + std::string s; + raw_string_ostream oss(s); + oss << *mi; + + os << escapeChars(oss.str()); + } + + template <typename T> + void RenderMachineFunction::renderVertical(const Spacer &indent, + raw_ostream &os, + const T &t) const { + if (ro.fancyVerticals()) { + os << indent << "<object\n" + << indent + s(2) << "class=\"obj\"\n" + << indent + s(2) << "type=\"image/svg+xml\"\n" + << indent + s(2) << "width=\"14px\"\n" + << indent + s(2) << "height=\"55px\"\n" + << indent + s(2) << "data=\"data:image/svg+xml,\n" + << indent + s(4) << "<svg xmlns='http://www.w3.org/2000/svg'>\n" + << indent + s(6) << "<text x='-55' y='10' " + "font-family='Courier' font-size='12' " + "transform='rotate(-90)' " + "text-rendering='optimizeSpeed' " + "fill='#000'>" << t << "</text>\n" + << indent + s(4) << "</svg>\">\n" + << indent << "</object>\n"; + } else { + std::ostringstream oss; + oss << t; + std::string tStr(oss.str()); + + os << indent; + for (std::string::iterator tStrItr = tStr.begin(), tStrEnd = tStr.end(); + tStrItr != tStrEnd; ++tStrItr) { + os << *tStrItr << "<br/>"; + } + os << "\n"; + } + } + + void RenderMachineFunction::insertCSS(const Spacer &indent, + raw_ostream &os) const { + os << indent << "<style type=\"text/css\">\n" + << indent + s(2) << "body { font-color: black; }\n" + << indent + s(2) << "table.code td { font-family: monospace; " + "border-width: 0px; border-style: solid; " + "border-bottom: 1px solid #dddddd; white-space: nowrap; }\n" + << indent + s(2) << "table.code td.p-z { background-color: #000000; }\n" + << indent + s(2) << "table.code td.p-l { background-color: #00ff00; }\n" + << indent + s(2) << "table.code td.p-h { background-color: #ff0000; }\n" + << indent + s(2) << "table.code td.l-n { background-color: #ffffff; }\n" + << indent + s(2) << "table.code td.l-d { background-color: #ff0000; }\n" + << indent + s(2) << "table.code td.l-u { background-color: #ffff00; }\n" + << indent + s(2) << "table.code td.l-r { background-color: #000000; }\n" + << indent + s(2) << "table.code td.l-s { background-color: #770000; }\n" + << indent + s(2) << "table.code th { border-width: 0px; " + "border-style: solid; }\n" + << indent << "</style>\n"; + } + + void RenderMachineFunction::renderFunctionSummary( + const Spacer &indent, raw_ostream &os, + const char * const renderContextStr) const { + os << indent << "<h1>Function: " << mf->getFunction()->getName() + << "</h1>\n" + << indent << "<h2>Rendering context: " << renderContextStr << "</h2>\n"; + } + + + void RenderMachineFunction::renderPressureTableLegend( + const Spacer &indent, + raw_ostream &os) const { + os << indent << "<h2>Rendering Pressure Legend:</h2>\n" + << indent << "<table class=\"code\">\n" + << indent + s(2) << "<tr>\n" + << indent + s(4) << "<th>Pressure</th><th>Description</th>" + "<th>Appearance</th>\n" + << indent + s(2) << "</tr>\n" + << indent + s(2) << "<tr>\n" + << indent + s(4) << "<td>No Pressure</td>" + "<td>No physical registers of this class requested.</td>" + "<td class=\"p-z\"> </td>\n" + << indent + s(2) << "</tr>\n" + << indent + s(2) << "<tr>\n" + << indent + s(4) << "<td>Low Pressure</td>" + "<td>Sufficient physical registers to meet demand.</td>" + "<td class=\"p-l\"> </td>\n" + << indent + s(2) << "</tr>\n" + << indent + s(2) << "<tr>\n" + << indent + s(4) << "<td>High Pressure</td>" + "<td>Potentially insufficient physical registers to meet demand.</td>" + "<td class=\"p-h\"> </td>\n" + << indent + 
s(2) << "</tr>\n" + << indent << "</table>\n"; + } + + template <typename CellType> + void RenderMachineFunction::renderCellsWithRLE( + const Spacer &indent, raw_ostream &os, + const std::pair<CellType, unsigned> &rleAccumulator, + const std::map<CellType, std::string> &cellTypeStrs) const { + + if (rleAccumulator.second == 0) + return; + + typename std::map<CellType, std::string>::const_iterator ctsItr = + cellTypeStrs.find(rleAccumulator.first); + + assert(ctsItr != cellTypeStrs.end() && "No string for given cell type."); + + os << indent + s(4) << "<td class=\"" << ctsItr->second << "\""; + if (rleAccumulator.second > 1) + os << " colspan=" << rleAccumulator.second; + os << "></td>\n"; + } + + + void RenderMachineFunction::renderCodeTablePlusPI(const Spacer &indent, + raw_ostream &os) const { + + std::map<LiveState, std::string> lsStrs; + lsStrs[Dead] = "l-n"; + lsStrs[Defined] = "l-d"; + lsStrs[Used] = "l-u"; + lsStrs[AliveReg] = "l-r"; + lsStrs[AliveStack] = "l-s"; + + std::map<PressureState, std::string> psStrs; + psStrs[Zero] = "p-z"; + psStrs[Low] = "p-l"; + psStrs[High] = "p-h"; + + // Open the table... + + os << indent << "<table cellpadding=0 cellspacing=0 class=\"code\">\n" + << indent + s(2) << "<tr>\n"; + + // Render the header row... + + os << indent + s(4) << "<th>index</th>\n" + << indent + s(4) << "<th>instr</th>\n"; + + // Render class names if necessary... + if (!ro.regClasses().empty()) { + for (MFRenderingOptions::RegClassSet::const_iterator + rcItr = ro.regClasses().begin(), + rcEnd = ro.regClasses().end(); + rcItr != rcEnd; ++rcItr) { + const TargetRegisterClass *trc = *rcItr; + os << indent + s(4) << "<th>\n"; + renderVertical(indent + s(6), os, trc->getName()); + os << indent + s(4) << "</th>\n"; + } + } + + // FIXME: Is there a nicer way to insert space between columns in HTML? + if (!ro.regClasses().empty() && !ro.intervals().empty()) + os << indent + s(4) << "<th> </th>\n"; + + // Render interval numbers if necessary... + if (!ro.intervals().empty()) { + for (MFRenderingOptions::IntervalSet::const_iterator + liItr = ro.intervals().begin(), + liEnd = ro.intervals().end(); + liItr != liEnd; ++liItr) { + + const LiveInterval *li = *liItr; + os << indent + s(4) << "<th>\n"; + renderVertical(indent + s(6), os, li->reg); + os << indent + s(4) << "</th>\n"; + } + } + + os << indent + s(2) << "</tr>\n"; + + // End header row, start with the data rows... + + MachineInstr *mi = 0; + + // Data rows: + for (SlotIndex i = sis->getZeroIndex(); i != sis->getLastIndex(); + i = i.getNextSlot()) { + + // Render the slot column. + os << indent + s(2) << "<tr height=6ex>\n"; + + // Render the code column. + if (i.isLoad()) { + MachineBasicBlock *mbb = sis->getMBBFromIndex(i); + mi = sis->getInstructionFromIndex(i); + + if (i == sis->getMBBStartIdx(mbb) || mi != 0 || + ro.renderEmptyIndexes()) { + os << indent + s(4) << "<td rowspan=4>" << i << " </td>\n" + << indent + s(4) << "<td rowspan=4>\n"; + + if (i == sis->getMBBStartIdx(mbb)) { + os << indent + s(6) << "BB#" << mbb->getNumber() << ": \n"; + } else if (mi != 0) { + os << indent + s(6) << " "; + renderMachineInstr(os, mi); + } else { + // Empty interval - leave blank. + } + os << indent + s(4) << "</td>\n"; + } else { + i = i.getStoreIndex(); // <- Will be incremented to the next index. + continue; + } + } + + // Render the class columns. 
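// --- Illustrative aside (editor's sketch, not part of the patch) ---
// renderCellsWithRLE(), defined above, keeps the generated HTML small:
// rather than one <td> per slot, the caller loops below collapse runs of
// identical cells into a single cell with a colspan. A generic restatement
// of that accumulator pattern, assuming <vector>, <string> and <utility>:
static void emitRunLengthEncodedRow(const std::vector<std::string> &cellClasses,
                                    raw_ostream &os) {
  std::pair<std::string, unsigned> run("", 0u);   // (css class, run length)
  for (unsigned i = 0, e = cellClasses.size(); i <= e; ++i) {
    if (i != e && run.second != 0 && cellClasses[i] == run.first) {
      ++run.second;                               // extend the current run
      continue;
    }
    if (run.second != 0) {                        // flush the previous run
      os << "<td class=\"" << run.first << "\"";
      if (run.second > 1)
        os << " colspan=" << run.second;
      os << "></td>\n";
    }
    if (i != e)
      run = std::make_pair(cellClasses[i], 1u);   // start a new run
  }
}
// --- end aside ---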
+ if (!ro.regClasses().empty()) {
+ std::pair<PressureState, unsigned> psRLEAccumulator(Zero, 0);
+ for (MFRenderingOptions::RegClassSet::const_iterator
+ rcItr = ro.regClasses().begin(),
+ rcEnd = ro.regClasses().end();
+ rcItr != rcEnd; ++rcItr) {
+ const TargetRegisterClass *trc = *rcItr;
+ PressureState newPressure = getPressureStateAt(trc, i);
+
+ if (newPressure == psRLEAccumulator.first) {
+ ++psRLEAccumulator.second;
+ } else {
+ renderCellsWithRLE(indent + s(4), os, psRLEAccumulator, psStrs);
+ psRLEAccumulator.first = newPressure;
+ psRLEAccumulator.second = 1;
+ }
+ }
+ renderCellsWithRLE(indent + s(4), os, psRLEAccumulator, psStrs);
+ }
+
+ // FIXME: Is there a nicer way to insert space between columns in HTML?
+ if (!ro.regClasses().empty() && !ro.intervals().empty())
+ os << indent + s(4) << "<td width=2em></td>\n";
+
+ if (!ro.intervals().empty()) {
+ std::pair<LiveState, unsigned> lsRLEAccumulator(Dead, 0);
+ for (MFRenderingOptions::IntervalSet::const_iterator
+ liItr = ro.intervals().begin(),
+ liEnd = ro.intervals().end();
+ liItr != liEnd; ++liItr) {
+ const LiveInterval *li = *liItr;
+ LiveState newLiveness = getLiveStateAt(li, i);
+
+ if (newLiveness == lsRLEAccumulator.first) {
+ ++lsRLEAccumulator.second;
+ } else {
+ renderCellsWithRLE(indent + s(4), os, lsRLEAccumulator, lsStrs);
+ lsRLEAccumulator.first = newLiveness;
+ lsRLEAccumulator.second = 1;
+ }
+ }
+ renderCellsWithRLE(indent + s(4), os, lsRLEAccumulator, lsStrs);
+ }
+ os << indent + s(2) << "</tr>\n";
+ }
+
+ os << indent << "</table>\n";
+
+ if (!ro.regClasses().empty())
+ renderPressureTableLegend(indent, os);
+ }
+
+ void RenderMachineFunction::renderFunctionPage(
+ raw_ostream &os,
+ const char * const renderContextStr) const {
+ os << "<html>\n"
+ << s(2) << "<head>\n"
+ << s(4) << "<title>" << fqn << "</title>\n";
+
+ insertCSS(s(4), os);
+
+ os << s(2) << "</head>\n"
+ << s(2) << "<body>\n";
+
+ renderFunctionSummary(s(4), os, renderContextStr);
+
+ os << s(4) << "<br/><br/><br/>\n";
+
+ //renderLiveIntervalInfoTable(" ", os);
+
+ os << s(4) << "<br/><br/><br/>\n";
+
+ renderCodeTablePlusPI(s(4), os);
+
+ os << s(2) << "</body>\n"
+ << "</html>\n";
+ }
+
+ void RenderMachineFunction::getAnalysisUsage(AnalysisUsage &au) const {
+ au.addRequired<SlotIndexes>();
+ au.addRequired<LiveIntervals>();
+ au.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(au);
+ }
+
+ bool RenderMachineFunction::runOnMachineFunction(MachineFunction &fn) {
+
+ mf = &fn;
+ mri = &mf->getRegInfo();
+ tri = mf->getTarget().getRegisterInfo();
+ lis = &getAnalysis<LiveIntervals>();
+ sis = &getAnalysis<SlotIndexes>();
+
+ trei.setup(mf, mri, tri, lis);
+ ro.setup(mf, tri, lis, this);
+ spillIntervals.clear();
+ spillFor.clear();
+ useDefs.clear();
+
+ fqn = mf->getFunction()->getParent()->getModuleIdentifier() + "." 
+ + mf->getFunction()->getName().str(); + + return false; + } + + void RenderMachineFunction::releaseMemory() { + trei.clear(); + ro.clear(); + spillIntervals.clear(); + spillFor.clear(); + useDefs.clear(); + } + + void RenderMachineFunction::rememberUseDefs(const LiveInterval *li) { + + if (!ro.shouldRenderCurrentMachineFunction()) + return; + + for (MachineRegisterInfo::reg_iterator rItr = mri->reg_begin(li->reg), + rEnd = mri->reg_end(); + rItr != rEnd; ++rItr) { + const MachineInstr *mi = &*rItr; + if (mi->readsRegister(li->reg)) { + useDefs[li].insert(lis->getInstructionIndex(mi).getUseIndex()); + } + if (mi->definesRegister(li->reg)) { + useDefs[li].insert(lis->getInstructionIndex(mi).getDefIndex()); + } + } + } + + void RenderMachineFunction::rememberSpills( + const LiveInterval *li, + const std::vector<LiveInterval*> &spills) { + + if (!ro.shouldRenderCurrentMachineFunction()) + return; + + for (std::vector<LiveInterval*>::const_iterator siItr = spills.begin(), + siEnd = spills.end(); + siItr != siEnd; ++siItr) { + const LiveInterval *spill = *siItr; + spillIntervals[li].insert(spill); + spillFor[spill] = li; + } + } + + bool RenderMachineFunction::isSpill(const LiveInterval *li) const { + SpillForMap::const_iterator sfItr = spillFor.find(li); + if (sfItr == spillFor.end()) + return false; + return true; + } + + void RenderMachineFunction::renderMachineFunction( + const char *renderContextStr, + const VirtRegMap *vrm, + const char *renderSuffix) { + if (!ro.shouldRenderCurrentMachineFunction()) + return; + + this->vrm = vrm; + trei.reset(); + + std::string rpFileName(mf->getFunction()->getName().str() + + (renderSuffix ? renderSuffix : "") + + outputFileSuffix); + + std::string errMsg; + raw_fd_ostream outFile(rpFileName.c_str(), errMsg, raw_fd_ostream::F_Binary); + + renderFunctionPage(outFile, renderContextStr); + + ro.resetRenderSpecificOptions(); + } + + std::string RenderMachineFunction::escapeChars(const std::string &s) const { + return escapeChars(s.begin(), s.end()); + } + +} diff --git a/lib/CodeGen/RenderMachineFunction.h b/lib/CodeGen/RenderMachineFunction.h new file mode 100644 index 0000000000000..8d56a8292ac59 --- /dev/null +++ b/lib/CodeGen/RenderMachineFunction.h @@ -0,0 +1,336 @@ +//===-- llvm/CodeGen/RenderMachineFunction.h - MF->HTML -*- C++ -*---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_RENDERMACHINEFUNCTION_H +#define LLVM_CODEGEN_RENDERMACHINEFUNCTION_H + +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#include <algorithm> +#include <map> +#include <set> +#include <string> + +namespace llvm { + + class LiveInterval; + class LiveIntervals; + class MachineInstr; + class MachineRegisterInfo; + class RenderMachineFunction; + class TargetRegisterClass; + class TargetRegisterInfo; + class VirtRegMap; + class raw_ostream; + + /// \brief Helper class to process rendering options. Tries to be as lazy as + /// possible. 
+ class MFRenderingOptions { + public: + + struct RegClassComp { + bool operator()(const TargetRegisterClass *trc1, + const TargetRegisterClass *trc2) const { + std::string trc1Name(trc1->getName()), trc2Name(trc2->getName()); + return std::lexicographical_compare(trc1Name.begin(), trc1Name.end(), + trc2Name.begin(), trc2Name.end()); + } + }; + + typedef std::set<const TargetRegisterClass*, RegClassComp> RegClassSet; + + struct IntervalComp { + bool operator()(const LiveInterval *li1, const LiveInterval *li2) const { + return li1->reg < li2->reg; + } + }; + + typedef std::set<const LiveInterval*, IntervalComp> IntervalSet; + + /// Initialise the rendering options. + void setup(MachineFunction *mf, const TargetRegisterInfo *tri, + LiveIntervals *lis, const RenderMachineFunction *rmf); + + /// Clear translations of options to the current function. + void clear(); + + /// Reset any options computed for this specific rendering. + void resetRenderSpecificOptions(); + + /// Should we render the current function. + bool shouldRenderCurrentMachineFunction() const; + + /// Return the set of register classes to render pressure for. + const RegClassSet& regClasses() const; + + /// Return the set of live intervals to render liveness for. + const IntervalSet& intervals() const; + + /// Render indexes which are not associated with instructions / MBB starts. + bool renderEmptyIndexes() const; + + /// Return whether or not to render using SVG for fancy vertical text. + bool fancyVerticals() const; + + private: + + static bool renderingOptionsProcessed; + static std::set<std::string> mfNamesToRender; + static bool renderAllMFs; + + static std::set<std::string> classNamesToRender; + static bool renderAllClasses; + + + static std::set<std::pair<unsigned, unsigned> > intervalNumsToRender; + typedef enum { ExplicitOnly = 0, + AllPhys = 1, + VirtNoSpills = 2, + VirtSpills = 4, + AllVirt = 6, + All = 7 } + IntervalTypesToRender; + static unsigned intervalTypesToRender; + + template <typename OutputItr> + static void splitComaSeperatedList(const std::string &s, OutputItr outItr); + + static void processOptions(); + + static void processFuncNames(); + static void processRegClassNames(); + static void processIntervalNumbers(); + + static void processIntervalRange(const std::string &intervalRangeStr); + + MachineFunction *mf; + const TargetRegisterInfo *tri; + LiveIntervals *lis; + const RenderMachineFunction *rmf; + + mutable bool regClassesTranslatedToCurrentFunction; + mutable RegClassSet regClassSet; + + mutable bool intervalsTranslatedToCurrentFunction; + mutable IntervalSet intervalSet; + + void translateRegClassNamesToCurrentFunction() const; + + void translateIntervalNumbersToCurrentFunction() const; + }; + + /// \brief Provide extra information about the physical and virtual registers + /// in the function being compiled. + class TargetRegisterExtraInfo { + public: + TargetRegisterExtraInfo(); + + /// \brief Set up TargetRegisterExtraInfo with pointers to necessary + /// sources of information. + void setup(MachineFunction *mf, MachineRegisterInfo *mri, + const TargetRegisterInfo *tri, LiveIntervals *lis); + + /// \brief Recompute tables for changed function. + void reset(); + + /// \brief Free all tables in TargetRegisterExtraInfo. + void clear(); + + /// \brief Maximum number of registers from trc which alias reg. + unsigned getWorst(unsigned reg, const TargetRegisterClass *trc) const; + + /// \brief Returns the number of allocable registers in trc. 
+ unsigned getCapacity(const TargetRegisterClass *trc) const; + + /// \brief Return the number of registers of class trc that may be + /// needed at slot i. + unsigned getPressureAtSlot(const TargetRegisterClass *trc, + SlotIndex i) const; + + /// \brief Return true if the number of registers of type trc that may be + /// needed at slot i is greater than the capacity of trc. + bool classOverCapacityAtSlot(const TargetRegisterClass *trc, + SlotIndex i) const; + + private: + + MachineFunction *mf; + MachineRegisterInfo *mri; + const TargetRegisterInfo *tri; + LiveIntervals *lis; + + typedef std::map<const TargetRegisterClass*, unsigned> WorstMapLine; + typedef std::map<const TargetRegisterClass*, WorstMapLine> VRWorstMap; + VRWorstMap vrWorst; + + typedef std::map<unsigned, WorstMapLine> PRWorstMap; + PRWorstMap prWorst; + + typedef std::map<const TargetRegisterClass*, unsigned> CapacityMap; + CapacityMap capacityMap; + + typedef std::map<const TargetRegisterClass*, unsigned> PressureMapLine; + typedef std::map<SlotIndex, PressureMapLine> PressureMap; + PressureMap pressureMap; + + bool mapsPopulated; + + /// \brief Initialise the 'worst' table. + void initWorst(); + + /// \brief Initialise the 'capacity' table. + void initCapacity(); + + /// \brief Initialise/Reset the 'pressure' and live states tables. + void resetPressureAndLiveStates(); + }; + + /// \brief Render MachineFunction objects and related information to a HTML + /// page. + class RenderMachineFunction : public MachineFunctionPass { + public: + static char ID; + + RenderMachineFunction() : MachineFunctionPass(ID) {} + + virtual void getAnalysisUsage(AnalysisUsage &au) const; + + virtual bool runOnMachineFunction(MachineFunction &fn); + + virtual void releaseMemory(); + + void rememberUseDefs(const LiveInterval *li); + + void rememberSpills(const LiveInterval *li, + const std::vector<LiveInterval*> &spills); + + bool isSpill(const LiveInterval *li) const; + + /// \brief Render this machine function to HTML. + /// + /// @param renderContextStr This parameter will be included in the top of + /// the html file to explain where (in the + /// codegen pipeline) this function was rendered + /// from. Set it to something like + /// "Pre-register-allocation". + /// @param vrm If non-null the VRM will be queried to determine + /// whether a virtual register was allocated to a + /// physical register or spilled. + /// @param renderFilePrefix This string will be appended to the function + /// name (before the output file suffix) to enable + /// multiple renderings from the same function. + void renderMachineFunction(const char *renderContextStr, + const VirtRegMap *vrm = 0, + const char *renderSuffix = 0); + + private: + class Spacer; + friend raw_ostream& operator<<(raw_ostream &os, const Spacer &s); + + std::string fqn; + + MachineFunction *mf; + MachineRegisterInfo *mri; + const TargetRegisterInfo *tri; + LiveIntervals *lis; + SlotIndexes *sis; + const VirtRegMap *vrm; + + TargetRegisterExtraInfo trei; + MFRenderingOptions ro; + + + + // Utilities. 
+ typedef enum { Dead, Defined, Used, AliveReg, AliveStack } LiveState; + LiveState getLiveStateAt(const LiveInterval *li, SlotIndex i) const; + + typedef enum { Zero, Low, High } PressureState; + PressureState getPressureStateAt(const TargetRegisterClass *trc, + SlotIndex i) const; + + typedef std::map<const LiveInterval*, std::set<const LiveInterval*> > + SpillIntervals; + SpillIntervals spillIntervals; + + typedef std::map<const LiveInterval*, const LiveInterval*> SpillForMap; + SpillForMap spillFor; + + typedef std::set<SlotIndex> SlotSet; + typedef std::map<const LiveInterval*, SlotSet> UseDefs; + UseDefs useDefs; + + // ---------- Rendering methods ---------- + + /// For inserting spaces when pretty printing. + class Spacer { + public: + explicit Spacer(unsigned numSpaces) : ns(numSpaces) {} + Spacer operator+(const Spacer &o) const { return Spacer(ns + o.ns); } + void print(raw_ostream &os) const; + private: + unsigned ns; + }; + + Spacer s(unsigned ns) const; + + template <typename Iterator> + std::string escapeChars(Iterator sBegin, Iterator sEnd) const; + + /// \brief Render a machine instruction. + void renderMachineInstr(raw_ostream &os, + const MachineInstr *mi) const; + + /// \brief Render vertical text. + template <typename T> + void renderVertical(const Spacer &indent, + raw_ostream &os, + const T &t) const; + + /// \brief Insert CSS layout info. + void insertCSS(const Spacer &indent, + raw_ostream &os) const; + + /// \brief Render a brief summary of the function (including rendering + /// context). + void renderFunctionSummary(const Spacer &indent, + raw_ostream &os, + const char * const renderContextStr) const; + + /// \brief Render a legend for the pressure table. + void renderPressureTableLegend(const Spacer &indent, + raw_ostream &os) const; + + /// \brief Render a consecutive set of HTML cells of the same class using + /// the colspan attribute for run-length encoding. + template <typename CellType> + void renderCellsWithRLE( + const Spacer &indent, raw_ostream &os, + const std::pair<CellType, unsigned> &rleAccumulator, + const std::map<CellType, std::string> &cellTypeStrs) const; + + /// \brief Render code listing, potentially with register pressure + /// and live intervals shown alongside. + void renderCodeTablePlusPI(const Spacer &indent, + raw_ostream &os) const; + + /// \brief Render the HTML page representing the MachineFunction. + void renderFunctionPage(raw_ostream &os, + const char * const renderContextStr) const; + + std::string escapeChars(const std::string &s) const; + }; +} + +#endif /* LLVM_CODEGEN_RENDERMACHINEFUNCTION_H */ diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 09202f84cb29d..ea93dd5c66633 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -32,7 +32,8 @@ using namespace llvm; ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, const MachineLoopInfo &mli, const MachineDominatorTree &mdt) - : ScheduleDAG(mf), MLI(mli), MDT(mdt), LoopRegs(MLI, MDT) { + : ScheduleDAG(mf), MLI(mli), MDT(mdt), Defs(TRI->getNumRegs()), + Uses(TRI->getNumRegs()), LoopRegs(MLI, MDT) { MFI = mf.getFrameInfo(); DbgValueVec.clear(); } @@ -159,8 +160,9 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) { std::map<const Value *, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses; // Keep track of dangling debug references to registers. 
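// Editorial aside, not part of the patch: the constructor change above and the
// hunk below share one pattern -- a C array sized by the compile-time constant
// TargetRegisterInfo::FirstVirtualRegister becomes a std::vector sized from
// the target's actual register count, which is also what makes dropping the
// later memset safe, because vector elements are value-initialized. A sketch
// of the pattern with invented names:

#include <utility>
#include <vector>

class MachineInstr; // stand-in forward declaration

typedef std::pair<MachineInstr *, unsigned> DanglingEntry;

std::vector<DanglingEntry> makeDanglingTable(unsigned numRegs) {
  // Every slot starts out as (null instruction, 0); no memset required,
  // unlike the fixed-size array this replaces.
  return std::vector<DanglingEntry>(
      numRegs, std::make_pair(static_cast<MachineInstr *>(0), 0));
}

// The actual change follows.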
- std::pair<MachineInstr*, unsigned> - DanglingDebugValue[TargetRegisterInfo::FirstVirtualRegister]; + std::vector<std::pair<MachineInstr*, unsigned> > + DanglingDebugValue(TRI->getNumRegs(), + std::make_pair(static_cast<MachineInstr*>(0), 0)); // Check to see if the scheduler cares about latencies. bool UnitLatencies = ForceUnitLatencies(); @@ -172,7 +174,6 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) { // Remove any stale debug info; sometimes BuildSchedGraph is called again // without emitting the info from the previous call. DbgValueVec.clear(); - std::memset(DanglingDebugValue, 0, sizeof(DanglingDebugValue)); // Walk the list of instructions, from bottom moving up. for (MachineBasicBlock::iterator MII = InsertPos, MIE = Begin; diff --git a/lib/CodeGen/ScheduleDAGInstrs.h b/lib/CodeGen/ScheduleDAGInstrs.h index d90659bb163ef..c8f543f7146dd 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.h +++ b/lib/CodeGen/ScheduleDAGInstrs.h @@ -106,8 +106,8 @@ namespace llvm { /// are as we iterate upward through the instructions. This is allocated /// here instead of inside BuildSchedGraph to avoid the need for it to be /// initialized and destructed for each block. - std::vector<SUnit *> Defs[TargetRegisterInfo::FirstVirtualRegister]; - std::vector<SUnit *> Uses[TargetRegisterInfo::FirstVirtualRegister]; + std::vector<std::vector<SUnit *> > Defs; + std::vector<std::vector<SUnit *> > Uses; /// DbgValueVec - Remember DBG_VALUEs that refer to a particular /// register. diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index e671752464572..c9c4d91e97361 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4489,6 +4489,16 @@ ConstantFoldBIT_CONVERTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { // If this is a conversion of N elements of one type to N elements of another // type, convert each element. This handles FP<->INT cases. if (SrcBitSize == DstBitSize) { + EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, + BV->getValueType(0).getVectorNumElements()); + + // Due to the FP element handling below calling this routine recursively, + // we can end up with a scalar-to-vector node here. 
+ if (BV->getOpcode() == ISD::SCALAR_TO_VECTOR) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, BV->getDebugLoc(), VT, + DAG.getNode(ISD::BIT_CONVERT, BV->getDebugLoc(), + DstEltVT, BV->getOperand(0))); + SmallVector<SDValue, 8> Ops; for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) { SDValue Op = BV->getOperand(i); @@ -4500,8 +4510,6 @@ ConstantFoldBIT_CONVERTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { DstEltVT, Op)); AddToWorkList(Ops.back().getNode()); } - EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, - BV->getValueType(0).getVectorNumElements()); return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT, &Ops[0], Ops.size()); } @@ -5790,7 +5798,8 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { return SDValue(); SDValue N0 = Value.getOperand(0); - if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) { + if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && + Chain == SDValue(N0.getNode(), 1)) { LoadSDNode *LD = cast<LoadSDNode>(N0); if (LD->getBasePtr() != Ptr) return SDValue(); diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index decaa769e99fe..a4eed71e65c01 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -94,7 +94,7 @@ bool FastISel::hasTrivialKill(const Value *V) const { !(I->getOpcode() == Instruction::BitCast || I->getOpcode() == Instruction::PtrToInt || I->getOpcode() == Instruction::IntToPtr) && - cast<Instruction>(I->use_begin())->getParent() == I->getParent(); + cast<Instruction>(*I->use_begin())->getParent() == I->getParent(); } unsigned FastISel::getRegForValue(const Value *V) { @@ -146,7 +146,7 @@ unsigned FastISel::getRegForValue(const Value *V) { return Reg; } -/// materializeRegForValue - Helper for getRegForVale. This function is +/// materializeRegForValue - Helper for getRegForValue. This function is /// called when the value isn't already available in a register and must /// be materialized with new instructions. unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) { @@ -276,6 +276,7 @@ std::pair<unsigned, bool> FastISel::getRegForGEPIndex(const Value *Idx) { void FastISel::recomputeInsertPt() { if (getLastLocalValue()) { FuncInfo.InsertPt = getLastLocalValue(); + FuncInfo.MBB = FuncInfo.InsertPt->getParent(); ++FuncInfo.InsertPt; } else FuncInfo.InsertPt = FuncInfo.MBB->getFirstNonPHI(); @@ -472,17 +473,7 @@ bool FastISel::SelectCall(const User *I) { return true; const AllocaInst *AI = dyn_cast<AllocaInst>(Address); // Don't handle byval struct arguments or VLAs, for example. - // Note that if we have a byval struct argument, fast ISel is turned off; - // those are handled in SelectionDAGBuilder. - if (AI) { - DenseMap<const AllocaInst*, int>::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - if (SI == FuncInfo.StaticAllocaMap.end()) break; // VLAs. - int FI = SI->second; - if (!DI->getDebugLoc().isUnknown()) - FuncInfo.MF->getMMI().setVariableDbgInfo(DI->getVariable(), - FI, DI->getDebugLoc()); - } else + if (!AI) // Building the map above is target independent. Generating DBG_VALUE // inline is target dependent; do this now. 
(void)TargetSelectInstruction(cast<Instruction>(I)); diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 928e1ecd4cf4e..5ef6404ee5d6b 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -20,6 +20,7 @@ #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" +#include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -111,17 +112,56 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) { TySize *= CUI->getZExtValue(); // Get total allocated size. if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects. + + // The object may need to be placed onto the stack near the stack + // protector if one exists. Determine here if this object is a suitable + // candidate. I.e., it would trigger the creation of a stack protector. + bool MayNeedSP = + (AI->isArrayAllocation() || + (TySize > 8 && isa<ArrayType>(Ty) && + cast<ArrayType>(Ty)->getElementType()->isIntegerTy(8))); StaticAllocaMap[AI] = - MF->getFrameInfo()->CreateStackObject(TySize, Align, false); + MF->getFrameInfo()->CreateStackObject(TySize, Align, false, MayNeedSP); } for (; BB != EB; ++BB) - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) + for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + // Mark values used outside their block as exported, by allocating + // a virtual register for them. if (isUsedOutsideOfDefiningBlock(I)) if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(I))) InitializeRegForValue(I); + // Collect llvm.dbg.declare information. This is done now instead of + // during the initial isel pass through the IR so that it is done + // in a predictable order. + if (const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(I)) { + MachineModuleInfo &MMI = MF->getMMI(); + if (MMI.hasDebugInfo() && + DIVariable(DI->getVariable()).Verify() && + !DI->getDebugLoc().isUnknown()) { + // Don't handle byval struct arguments or VLAs, for example. + // Non-byval arguments are handled here (they refer to the stack + // temporary alloca at this point). + const Value *Address = DI->getAddress(); + if (Address) { + if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address)) + Address = BCI->getOperand(0); + if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) { + DenseMap<const AllocaInst *, int>::iterator SI = + StaticAllocaMap.find(AI); + if (SI != StaticAllocaMap.end()) { // Check for VLAs. + int FI = SI->second; + MMI.setVariableDbgInfo(DI->getVariable(), + FI, DI->getDebugLoc()); + } + } + } + } + } + } + // Create an initial MachineBasicBlock for each LLVM BasicBlock in F. This // also creates the initial PHI MachineInstrs, though none of the input // operands are populated. @@ -181,6 +221,7 @@ void FunctionLoweringInfo::clear() { #endif LiveOutRegInfo.clear(); ArgDbgValues.clear(); + ByValArgFrameIndexMap.clear(); RegFixups.clear(); } @@ -214,6 +255,28 @@ unsigned FunctionLoweringInfo::CreateRegs(const Type *Ty) { return FirstReg; } +/// setByValArgumentFrameIndex - Record frame index for the byval +/// argument. This overrides previous frame index entry for this argument, +/// if any. 
+void FunctionLoweringInfo::setByValArgumentFrameIndex(const Argument *A, + int FI) { + assert (A->hasByValAttr() && "Argument does not have byval attribute!"); + ByValArgFrameIndexMap[A] = FI; +} + +/// getByValArgumentFrameIndex - Get frame index for the byval argument. +/// If the argument does not have any assigned frame index then 0 is +/// returned. +int FunctionLoweringInfo::getByValArgumentFrameIndex(const Argument *A) { + assert (A->hasByValAttr() && "Argument does not have byval attribute!"); + DenseMap<const Argument *, int>::iterator I = + ByValArgFrameIndexMap.find(A); + if (I != ByValArgFrameIndexMap.end()) + return I->second; + DEBUG(dbgs() << "Argument does not have assigned frame index!"); + return 0; +} + /// AddCatchInfo - Extract the personality and type infos from an eh.selector /// call, and add them to the specified machine basic block. void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI, diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 7a47da4ec52ef..2981cd3f1cabd 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -100,8 +100,7 @@ public: /// it is already legal or we need to expand it into multiple registers of /// smaller integer type, or we need to promote it to a larger type. LegalizeAction getTypeAction(EVT VT) const { - return - (LegalizeAction)ValueTypeActions.getTypeAction(*DAG.getContext(), VT); + return (LegalizeAction)ValueTypeActions.getTypeAction(VT); } /// isTypeLegal - Return true if this type is legal on this target. @@ -1314,21 +1313,30 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { } break; case TargetLowering::Expand: - // f64 = EXTLOAD f32 should expand to LOAD, FP_EXTEND - // f128 = EXTLOAD {f32,f64} too - if ((SrcVT == MVT::f32 && (Node->getValueType(0) == MVT::f64 || - Node->getValueType(0) == MVT::f128)) || - (SrcVT == MVT::f64 && Node->getValueType(0) == MVT::f128)) { + if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && isTypeLegal(SrcVT)) { SDValue Load = DAG.getLoad(SrcVT, dl, Tmp1, Tmp2, LD->getSrcValue(), LD->getSrcValueOffset(), LD->isVolatile(), LD->isNonTemporal(), LD->getAlignment()); - Result = DAG.getNode(ISD::FP_EXTEND, dl, - Node->getValueType(0), Load); + unsigned ExtendOp; + switch (ExtType) { + case ISD::EXTLOAD: + ExtendOp = (SrcVT.isFloatingPoint() ? + ISD::FP_EXTEND : ISD::ANY_EXTEND); + break; + case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break; + case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break; + default: llvm_unreachable("Unexpected extend load type!"); + } + Result = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load); Tmp1 = LegalizeOp(Result); // Relegalize new nodes. Tmp2 = LegalizeOp(Load.getValue(1)); break; } + // FIXME: This does not work for vectors on most targets. Sign- and + // zero-extend operations are currently folded into extending loads, + // whether they are legal or not, and then we end up here without any + // support for legalizing them. 
assert(ExtType != ISD::EXTLOAD && "EXTLOAD should always be supported!"); // Turn the unsupported load into an EXTLOAD followed by an explicit diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index b94ea9a3a9afb..f8c5890719219 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -234,8 +234,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) { // The pair element type may be legal, or may not promote to the same type as // the result, for example i14 = BUILD_PAIR (i7, i7). Handle all cases. return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), - TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)), - JoinIntegers(N->getOperand(0), N->getOperand(1))); + TLI.getTypeToTransformTo(*DAG.getContext(), + N->getValueType(0)), JoinIntegers(N->getOperand(0), + N->getOperand(1))); } SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) { @@ -245,7 +246,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) { // Zero extend things like i1, sign extend everything else. It shouldn't // matter in theory which one we pick, but this tends to give better code? unsigned Opc = VT.isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - SDValue Result = DAG.getNode(Opc, dl, TLI.getTypeToTransformTo(*DAG.getContext(), VT), + SDValue Result = DAG.getNode(Opc, dl, + TLI.getTypeToTransformTo(*DAG.getContext(), VT), SDValue(N, 0)); assert(isa<ConstantSDNode>(Result) && "Didn't constant fold ext?"); return Result; @@ -310,8 +312,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) { // If we're promoting a UINT to a larger size and the larger FP_TO_UINT is // not Legal, check to see if we can use FP_TO_SINT instead. (If both UINT - // and SINT conversions are Custom, there is no way to tell which is preferable. - // We choose SINT because that's the right thing on PPC.) + // and SINT conversions are Custom, there is no way to tell which is + // preferable. We choose SINT because that's the right thing on PPC.) if (N->getOpcode() == ISD::FP_TO_UINT && !TLI.isOperationLegal(ISD::FP_TO_UINT, NVT) && TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT)) @@ -1030,7 +1032,7 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt, Hi = InL; } else if (Amt == 1 && TLI.isOperationLegalOrCustom(ISD::ADDC, - TLI.getTypeToExpandTo(*DAG.getContext(), NVT))) { + TLI.getTypeToExpandTo(*DAG.getContext(), NVT))) { // Emit this X << 1 as X+X. 
SDVTList VTList = DAG.getVTList(NVT, MVT::Flag); SDValue LoOps[2] = { InL, InL }; @@ -1926,7 +1928,8 @@ ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) { unsigned ExcessBits = EVT.getSizeInBits() - Lo.getValueType().getSizeInBits(); Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi, - DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), ExcessBits))); + DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), + ExcessBits))); } } @@ -2046,7 +2049,8 @@ void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N, unsigned ExcessBits = Op.getValueType().getSizeInBits() - NVT.getSizeInBits(); Hi = DAG.getZeroExtendInReg(Hi, dl, - EVT::getIntegerVT(*DAG.getContext(), ExcessBits)); + EVT::getIntegerVT(*DAG.getContext(), + ExcessBits)); } } diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index bd86694446d6a..d56029208e61a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -75,7 +75,7 @@ private: /// getTypeAction - Return how we should legalize values of this type. LegalizeAction getTypeAction(EVT VT) const { - switch (ValueTypeActions.getTypeAction(*DAG.getContext(), VT)) { + switch (ValueTypeActions.getTypeAction(VT)) { default: assert(false && "Unknown legalize action!"); case TargetLowering::Legal: @@ -86,8 +86,7 @@ private: // 2) For vectors, use a wider vector type (e.g. v3i32 -> v4i32). if (!VT.isVector()) return PromoteInteger; - else - return WidenVector; + return WidenVector; case TargetLowering::Expand: // Expand can mean // 1) split scalar in half, 2) convert a float to an integer, @@ -95,23 +94,21 @@ private: if (!VT.isVector()) { if (VT.isInteger()) return ExpandInteger; - else if (VT.getSizeInBits() == - TLI.getTypeToTransformTo(*DAG.getContext(), VT).getSizeInBits()) + if (VT.getSizeInBits() == + TLI.getTypeToTransformTo(*DAG.getContext(), VT).getSizeInBits()) return SoftenFloat; - else - return ExpandFloat; - } else if (VT.getVectorNumElements() == 1) { - return ScalarizeVector; - } else { - return SplitVector; + return ExpandFloat; } + + if (VT.getVectorNumElements() == 1) + return ScalarizeVector; + return SplitVector; } } /// isTypeLegal - Return true if this type is legal on this target. bool isTypeLegal(EVT VT) const { - return (ValueTypeActions.getTypeAction(*DAG.getContext(), VT) == - TargetLowering::Legal); + return ValueTypeActions.getTypeAction(VT) == TargetLowering::Legal; } /// IgnoreNodeResults - Pretend all of this node's results are legal. 
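The getTypeAction cleanup just above flattens nested else-chains into early returns without changing the decision ladder. As a rough standalone sketch of that ladder (simplified: the real code consults ValueTypeActions and TLI, while here the type's properties arrive as plain flags; only the enum names follow the hunk):

enum LegalizeAction {
  Legal, PromoteInteger, ExpandInteger, SoftenFloat,
  ExpandFloat, ScalarizeVector, SplitVector, WidenVector
};

// Hypothetical stand-in for DAGTypeLegalizer::getTypeAction.
LegalizeAction classify(bool targetSaysPromote, bool targetSaysExpand,
                        bool isVector, bool isInteger,
                        bool sameSizeTransform, unsigned numElts) {
  if (!targetSaysPromote && !targetSaysExpand)
    return Legal;
  if (targetSaysPromote)
    // Promote widens scalars (e.g. i8 -> i32) or vectors (v3i32 -> v4i32).
    return isVector ? WidenVector : PromoteInteger;
  // Expand: split an integer in half, convert a float to a same-sized
  // integer (soften) or a smaller representation, or break up a vector.
  if (!isVector) {
    if (isInteger)
      return ExpandInteger;
    return sameSizeTransform ? SoftenFloat : ExpandFloat;
  }
  return numElts == 1 ? ScalarizeVector : SplitVector;
}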
@@ -584,6 +581,7 @@ private:
 SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
 SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
 SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
 //===--------------------------------------------------------------------===//
 // Vector Widening Support: LegalizeVectorTypes.cpp
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 93aeff5c1e6cf..93bc2d04928e7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -983,6 +983,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
 case ISD::BIT_CONVERT: Res = SplitVecOp_BIT_CONVERT(N); break;
 case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
 case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::CONCAT_VECTORS: Res = SplitVecOp_CONCAT_VECTORS(N); break;
 case ISD::STORE:
 Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
 break;
@@ -1091,8 +1092,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
 return SDValue(DAG.UpdateNodeOperands(N, Lo, Idx), 0);
 return SDValue(DAG.UpdateNodeOperands(N, Hi,
 DAG.getConstant(IdxVal - LoElts,
- Idx.getValueType())),
- 0);
+ Idx.getValueType())), 0);
 }
 // Store the vector to the stack.
@@ -1113,7 +1113,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
 SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
 assert(N->isUnindexed() && "Indexed store of vector?");
 assert(OpNo == 1 && "Can only split the stored value");
- DebugLoc dl = N->getDebugLoc();
+ DebugLoc DL = N->getDebugLoc();
 bool isTruncating = N->isTruncatingStore();
 SDValue Ch = N->getChain();
@@ -1132,25 +1132,49 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
 unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
 if (isTruncating)
- Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getSrcValue(), SVOffset,
+ Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getSrcValue(), SVOffset,
 LoMemVT, isVol, isNT, Alignment);
 else
- Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getSrcValue(), SVOffset,
+ Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getSrcValue(), SVOffset,
 isVol, isNT, Alignment);
 // Increment the pointer to the other half.
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
 DAG.getIntPtrConstant(IncrementSize));
 SVOffset += IncrementSize;
 if (isTruncating)
- Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getSrcValue(), SVOffset,
+ Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr, N->getSrcValue(), SVOffset,
 HiMemVT, isVol, isNT, Alignment);
 else
- Hi = DAG.getStore(Ch, dl, Hi, Ptr, N->getSrcValue(), SVOffset,
+ Hi = DAG.getStore(Ch, DL, Hi, Ptr, N->getSrcValue(), SVOffset,
 isVol, isNT, Alignment);
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
+ DebugLoc DL = N->getDebugLoc();
+
+ // The input operands must all have the same type, and we know the result
+ // type is valid. Convert this to a buildvector which extracts all the
+ // input elements.
+ // TODO: If the input elements are power-two vectors, we could convert this to
+ // a new CONCAT_VECTORS node with elements that are half-wide. 
+ SmallVector<SDValue, 32> Elts; + EVT EltVT = N->getValueType(0).getVectorElementType(); + for (unsigned op = 0, e = N->getNumOperands(); op != e; ++op) { + SDValue Op = N->getOperand(op); + for (unsigned i = 0, e = Op.getValueType().getVectorNumElements(); + i != e; ++i) { + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, + Op, DAG.getIntPtrConstant(i))); + + } + } + + return DAG.getNode(ISD::BUILD_VECTOR, DL, N->getValueType(0), + &Elts[0], Elts.size()); } @@ -1274,8 +1298,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { EVT VT = WidenVT; unsigned NumElts = VT.getVectorNumElements(); while (!TLI.isTypeSynthesizable(VT) && NumElts != 1) { - NumElts = NumElts / 2; - VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); + NumElts = NumElts / 2; + VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); } if (NumElts != 1 && !TLI.canOpTrap(N->getOpcode(), VT)) { @@ -1283,124 +1307,123 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2); - } else if (NumElts == 1) { - // No legal vector version so unroll the vector operation and then widen. + } + + // No legal vector version so unroll the vector operation and then widen. + if (NumElts == 1) return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()); - } else { - // Since the operation can trap, apply operation on the original vector. - EVT MaxVT = VT; - SDValue InOp1 = GetWidenedVector(N->getOperand(0)); - SDValue InOp2 = GetWidenedVector(N->getOperand(1)); - unsigned CurNumElts = N->getValueType(0).getVectorNumElements(); - - SmallVector<SDValue, 16> ConcatOps(CurNumElts); - unsigned ConcatEnd = 0; // Current ConcatOps index. - int Idx = 0; // Current Idx into input vectors. - - // NumElts := greatest synthesizable vector size (at most WidenVT) - // while (orig. vector has unhandled elements) { - // take munches of size NumElts from the beginning and add to ConcatOps - // NumElts := next smaller supported vector size or 1 - // } - while (CurNumElts != 0) { - while (CurNumElts >= NumElts) { - SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1, - DAG.getIntPtrConstant(Idx)); - SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2, - DAG.getIntPtrConstant(Idx)); - ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2); - Idx += NumElts; - CurNumElts -= NumElts; - } - do { - NumElts = NumElts / 2; - VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); - } while (!TLI.isTypeSynthesizable(VT) && NumElts != 1); - - if (NumElts == 1) { - for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) { - SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, - InOp1, DAG.getIntPtrConstant(Idx)); - SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, - InOp2, DAG.getIntPtrConstant(Idx)); - ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT, - EOp1, EOp2); - } - CurNumElts = 0; + + // Since the operation can trap, apply operation on the original vector. + EVT MaxVT = VT; + SDValue InOp1 = GetWidenedVector(N->getOperand(0)); + SDValue InOp2 = GetWidenedVector(N->getOperand(1)); + unsigned CurNumElts = N->getValueType(0).getVectorNumElements(); + + SmallVector<SDValue, 16> ConcatOps(CurNumElts); + unsigned ConcatEnd = 0; // Current ConcatOps index. + int Idx = 0; // Current Idx into input vectors. 
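// Editorial aside, not part of the patch: the loop described by the pseudocode
// just below carves the original element count into progressively smaller
// power-of-two "munches". A self-contained illustration of the arithmetic
// (hypothetical sizes; the real code asks TLI.isTypeSynthesizable before
// settling on each munch size):

#include <iostream>

int main() {
  unsigned CurNumElts = 7; // elements still unhandled
  unsigned NumElts = 4;    // widest chunk assumed synthesizable
  while (CurNumElts != 0) {
    while (CurNumElts >= NumElts) {
      std::cout << "emit op on " << NumElts << " elements\n";
      CurNumElts -= NumElts;
    }
    if (NumElts > 1)
      NumElts /= 2;        // fall back to the next smaller vector size
  }
  // Prints chunks of 4, 2 and 1; the pieces are concatenated back into the
  // widened vector type afterwards, exactly as the loop below does.
  return 0;
}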
+ + // NumElts := greatest synthesizable vector size (at most WidenVT) + // while (orig. vector has unhandled elements) { + // take munches of size NumElts from the beginning and add to ConcatOps + // NumElts := next smaller supported vector size or 1 + // } + while (CurNumElts != 0) { + while (CurNumElts >= NumElts) { + SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1, + DAG.getIntPtrConstant(Idx)); + SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2, + DAG.getIntPtrConstant(Idx)); + ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2); + Idx += NumElts; + CurNumElts -= NumElts; + } + do { + NumElts = NumElts / 2; + VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); + } while (!TLI.isTypeSynthesizable(VT) && NumElts != 1); + + if (NumElts == 1) { + for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) { + SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, + InOp1, DAG.getIntPtrConstant(Idx)); + SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, + InOp2, DAG.getIntPtrConstant(Idx)); + ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT, + EOp1, EOp2); } + CurNumElts = 0; } + } - // Check to see if we have a single operation with the widen type. - if (ConcatEnd == 1) { - VT = ConcatOps[0].getValueType(); - if (VT == WidenVT) - return ConcatOps[0]; - } + // Check to see if we have a single operation with the widen type. + if (ConcatEnd == 1) { + VT = ConcatOps[0].getValueType(); + if (VT == WidenVT) + return ConcatOps[0]; + } - // while (Some element of ConcatOps is not of type MaxVT) { - // From the end of ConcatOps, collect elements of the same type and put - // them into an op of the next larger supported type - // } - while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) { - Idx = ConcatEnd - 1; - VT = ConcatOps[Idx--].getValueType(); - while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT) - Idx--; - - int NextSize = VT.isVector() ? 
VT.getVectorNumElements() : 1; - EVT NextVT; - do { - NextSize *= 2; - NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize); - } while (!TLI.isTypeSynthesizable(NextVT)); - - if (!VT.isVector()) { - // Scalar type, create an INSERT_VECTOR_ELEMENT of type NextVT - SDValue VecOp = DAG.getUNDEF(NextVT); - unsigned NumToInsert = ConcatEnd - Idx - 1; - for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) { - VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp, - ConcatOps[OpIdx], DAG.getIntPtrConstant(i)); - } - ConcatOps[Idx+1] = VecOp; - ConcatEnd = Idx + 2; - } - else { - // Vector type, create a CONCAT_VECTORS of type NextVT - SDValue undefVec = DAG.getUNDEF(VT); - unsigned OpsToConcat = NextSize/VT.getVectorNumElements(); - SmallVector<SDValue, 16> SubConcatOps(OpsToConcat); - unsigned RealVals = ConcatEnd - Idx - 1; - unsigned SubConcatEnd = 0; - unsigned SubConcatIdx = Idx + 1; - while (SubConcatEnd < RealVals) - SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx]; - while (SubConcatEnd < OpsToConcat) - SubConcatOps[SubConcatEnd++] = undefVec; - ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl, - NextVT, &SubConcatOps[0], - OpsToConcat); - ConcatEnd = SubConcatIdx + 1; + // while (Some element of ConcatOps is not of type MaxVT) { + // From the end of ConcatOps, collect elements of the same type and put + // them into an op of the next larger supported type + // } + while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) { + Idx = ConcatEnd - 1; + VT = ConcatOps[Idx--].getValueType(); + while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT) + Idx--; + + int NextSize = VT.isVector() ? VT.getVectorNumElements() : 1; + EVT NextVT; + do { + NextSize *= 2; + NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize); + } while (!TLI.isTypeSynthesizable(NextVT)); + + if (!VT.isVector()) { + // Scalar type, create an INSERT_VECTOR_ELEMENT of type NextVT + SDValue VecOp = DAG.getUNDEF(NextVT); + unsigned NumToInsert = ConcatEnd - Idx - 1; + for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) { + VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp, + ConcatOps[OpIdx], DAG.getIntPtrConstant(i)); } + ConcatOps[Idx+1] = VecOp; + ConcatEnd = Idx + 2; + } else { + // Vector type, create a CONCAT_VECTORS of type NextVT + SDValue undefVec = DAG.getUNDEF(VT); + unsigned OpsToConcat = NextSize/VT.getVectorNumElements(); + SmallVector<SDValue, 16> SubConcatOps(OpsToConcat); + unsigned RealVals = ConcatEnd - Idx - 1; + unsigned SubConcatEnd = 0; + unsigned SubConcatIdx = Idx + 1; + while (SubConcatEnd < RealVals) + SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx]; + while (SubConcatEnd < OpsToConcat) + SubConcatOps[SubConcatEnd++] = undefVec; + ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl, + NextVT, &SubConcatOps[0], + OpsToConcat); + ConcatEnd = SubConcatIdx + 1; } + } - // Check to see if we have a single operation with the widen type. - if (ConcatEnd == 1) { - VT = ConcatOps[0].getValueType(); - if (VT == WidenVT) - return ConcatOps[0]; - } - - // add undefs of size MaxVT until ConcatOps grows to length of WidenVT - unsigned NumOps = - WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements(); - if (NumOps != ConcatEnd ) { - SDValue UndefVal = DAG.getUNDEF(MaxVT); - for (unsigned j = ConcatEnd; j < NumOps; ++j) - ConcatOps[j] = UndefVal; - } - return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &ConcatOps[0], NumOps); + // Check to see if we have a single operation with the widen type. 
+ if (ConcatEnd == 1) { + VT = ConcatOps[0].getValueType(); + if (VT == WidenVT) + return ConcatOps[0]; } + + // add undefs of size MaxVT until ConcatOps grows to length of WidenVT + unsigned NumOps = WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements(); + if (NumOps != ConcatEnd ) { + SDValue UndefVal = DAG.getUNDEF(MaxVT); + for (unsigned j = ConcatEnd; j < NumOps; ++j) + ConcatOps[j] = UndefVal; + } + return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &ConcatOps[0], NumOps); } SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { @@ -1561,8 +1584,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_BIT_CONVERT(SDNode *N) { unsigned NewNumElts = WidenSize / InSize; if (InVT.isVector()) { EVT InEltVT = InVT.getVectorElementType(); - NewInVT= EVT::getVectorVT(*DAG.getContext(), InEltVT, - WidenSize / InEltVT.getSizeInBits()); + NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, + WidenSize / InEltVT.getSizeInBits()); } else { NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts); } @@ -1686,8 +1709,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { SDValue RndOp = N->getOperand(3); SDValue SatOp = N->getOperand(4); - EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), - N->getValueType(0)); + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned WidenNumElts = WidenVT.getVectorNumElements(); EVT InVT = InOp.getValueType(); @@ -1720,9 +1742,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { SmallVector<SDValue, 16> Ops(NumConcat); Ops[0] = InOp; SDValue UndefVal = DAG.getUNDEF(InVT); - for (unsigned i = 1; i != NumConcat; ++i) { + for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = UndefVal; - } + InOp = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWidenVT, &Ops[0],NumConcat); return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, SatOp, CvtCode); @@ -2225,25 +2247,24 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain, // Check if we can load the element with one instruction if (LdWidth <= NewVTWidth) { - if (NewVT.isVector()) { - if (NewVT != WidenVT) { - assert(WidenWidth % NewVTWidth == 0); - unsigned NumConcat = WidenWidth / NewVTWidth; - SmallVector<SDValue, 16> ConcatOps(NumConcat); - SDValue UndefVal = DAG.getUNDEF(NewVT); - ConcatOps[0] = LdOp; - for (unsigned i = 1; i != NumConcat; ++i) - ConcatOps[i] = UndefVal; - return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &ConcatOps[0], - NumConcat); - } else - return LdOp; - } else { + if (!NewVT.isVector()) { unsigned NumElts = WidenWidth / NewVTWidth; EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts); SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp); return DAG.getNode(ISD::BIT_CONVERT, dl, WidenVT, VecOp); } + if (NewVT == WidenVT) + return LdOp; + + assert(WidenWidth % NewVTWidth == 0); + unsigned NumConcat = WidenWidth / NewVTWidth; + SmallVector<SDValue, 16> ConcatOps(NumConcat); + SDValue UndefVal = DAG.getUNDEF(NewVT); + ConcatOps[0] = LdOp; + for (unsigned i = 1; i != NumConcat; ++i) + ConcatOps[i] = UndefVal; + return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &ConcatOps[0], + NumConcat); } // Load vector by using multiple loads from largest vector to scalar @@ -2276,52 +2297,55 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain, // Build the vector from the loads operations unsigned End = LdOps.size(); - if (LdOps[0].getValueType().isVector()) { - // If the load contains vectors, build the vector using concat vector. 
- // All of the vectors used to loads are power of 2 and the scalars load - // can be combined to make a power of 2 vector. - SmallVector<SDValue, 16> ConcatOps(End); - int i = End - 1; - int Idx = End; - EVT LdTy = LdOps[i].getValueType(); - // First combine the scalar loads to a vector - if (!LdTy.isVector()) { - for (--i; i >= 0; --i) { - LdTy = LdOps[i].getValueType(); - if (LdTy.isVector()) - break; - } - ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i+1, End); - } - ConcatOps[--Idx] = LdOps[i]; + if (!LdOps[0].getValueType().isVector()) + // All the loads are scalar loads. + return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End); + + // If the load contains vectors, build the vector using concat vector. + // All of the vectors used to loads are power of 2 and the scalars load + // can be combined to make a power of 2 vector. + SmallVector<SDValue, 16> ConcatOps(End); + int i = End - 1; + int Idx = End; + EVT LdTy = LdOps[i].getValueType(); + // First combine the scalar loads to a vector + if (!LdTy.isVector()) { for (--i; i >= 0; --i) { - EVT NewLdTy = LdOps[i].getValueType(); - if (NewLdTy != LdTy) { - // Create a larger vector - ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy, - &ConcatOps[Idx], End - Idx); - Idx = End - 1; - LdTy = NewLdTy; - } - ConcatOps[--Idx] = LdOps[i]; + LdTy = LdOps[i].getValueType(); + if (LdTy.isVector()) + break; } + ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i+1, End); + } + ConcatOps[--Idx] = LdOps[i]; + for (--i; i >= 0; --i) { + EVT NewLdTy = LdOps[i].getValueType(); + if (NewLdTy != LdTy) { + // Create a larger vector + ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy, + &ConcatOps[Idx], End - Idx); + Idx = End - 1; + LdTy = NewLdTy; + } + ConcatOps[--Idx] = LdOps[i]; + } - if (WidenWidth != LdTy.getSizeInBits()*(End - Idx)) { - // We need to fill the rest with undefs to build the vector - unsigned NumOps = WidenWidth / LdTy.getSizeInBits(); - SmallVector<SDValue, 16> WidenOps(NumOps); - SDValue UndefVal = DAG.getUNDEF(LdTy); - unsigned i = 0; - for (; i != End-Idx; ++i) - WidenOps[i] = ConcatOps[Idx+i]; - for (; i != NumOps; ++i) - WidenOps[i] = UndefVal; - return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &WidenOps[0],NumOps); - } else - return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, - &ConcatOps[Idx], End - Idx); - } else // All the loads are scalar loads. 
- return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End); + if (WidenWidth == LdTy.getSizeInBits()*(End - Idx)) + return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, + &ConcatOps[Idx], End - Idx); + + // We need to fill the rest with undefs to build the vector + unsigned NumOps = WidenWidth / LdTy.getSizeInBits(); + SmallVector<SDValue, 16> WidenOps(NumOps); + SDValue UndefVal = DAG.getUNDEF(LdTy); + { + unsigned i = 0; + for (; i != End-Idx; ++i) + WidenOps[i] = ConcatOps[Idx+i]; + for (; i != NumOps; ++i) + WidenOps[i] = UndefVal; + } + return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &WidenOps[0],NumOps); } SDValue diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 3b86c3286585f..fae27294e3646 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -13,6 +13,7 @@ #define DEBUG_TYPE "pre-RA-sched" #include "ScheduleDAGSDNodes.h" +#include "llvm/InlineAsm.h" #include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -432,6 +433,30 @@ static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, return N->getValueType(NumRes); } +/// CheckForLiveRegDef - Return true and update live register vector if the +/// specified register def of the specified SUnit clobbers any "live" registers. +static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg, + std::vector<SUnit*> &LiveRegDefs, + SmallSet<unsigned, 4> &RegAdded, + SmallVector<unsigned, 4> &LRegs, + const TargetRegisterInfo *TRI) { + bool Added = false; + if (LiveRegDefs[Reg] && LiveRegDefs[Reg] != SU) { + if (RegAdded.insert(Reg)) { + LRegs.push_back(Reg); + Added = true; + } + } + for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) + if (LiveRegDefs[*Alias] && LiveRegDefs[*Alias] != SU) { + if (RegAdded.insert(*Alias)) { + LRegs.push_back(*Alias); + Added = true; + } + } + return Added; +} + /// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay /// scheduling of the given node to satisfy live physical register dependencies. /// If the specific node is the last one that's available to schedule, do @@ -446,37 +471,44 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU, for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { if (I->isAssignedRegDep()) { - unsigned Reg = I->getReg(); - if (LiveRegDefs[Reg] && LiveRegDefs[Reg] != I->getSUnit()) { - if (RegAdded.insert(Reg)) - LRegs.push_back(Reg); - } - for (const unsigned *Alias = TRI->getAliasSet(Reg); - *Alias; ++Alias) - if (LiveRegDefs[*Alias] && LiveRegDefs[*Alias] != I->getSUnit()) { - if (RegAdded.insert(*Alias)) - LRegs.push_back(*Alias); - } + CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs, + RegAdded, LRegs, TRI); } } for (SDNode *Node = SU->getNode(); Node; Node = Node->getFlaggedNode()) { + if (Node->getOpcode() == ISD::INLINEASM) { + // Inline asm can clobber physical defs. + unsigned NumOps = Node->getNumOperands(); + if (Node->getOperand(NumOps-1).getValueType() == MVT::Flag) + --NumOps; // Ignore the flag operand. + + for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { + unsigned Flags = + cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue(); + unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); + + ++i; // Skip the ID value. + if (InlineAsm::isRegDefKind(Flags) || + InlineAsm::isRegDefEarlyClobberKind(Flags)) { + // Check for def of register or earlyclobber register. 
+ for (; NumVals; --NumVals, ++i) { + unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI); + } + } else + i += NumVals; + } + continue; + } if (!Node->isMachineOpcode()) continue; const TargetInstrDesc &TID = TII->get(Node->getMachineOpcode()); if (!TID.ImplicitDefs) continue; for (const unsigned *Reg = TID.ImplicitDefs; *Reg; ++Reg) { - if (LiveRegDefs[*Reg] && LiveRegDefs[*Reg] != SU) { - if (RegAdded.insert(*Reg)) - LRegs.push_back(*Reg); - } - for (const unsigned *Alias = TRI->getAliasSet(*Reg); - *Alias; ++Alias) - if (LiveRegDefs[*Alias] && LiveRegDefs[*Alias] != SU) { - if (RegAdded.insert(*Alias)) - LRegs.push_back(*Alias); - } + CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI); } } return !LRegs.empty(); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 3ef521c398e11..4c3e4e3b07680 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -24,6 +24,7 @@ #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" @@ -54,10 +55,16 @@ static RegisterScheduler static RegisterScheduler hybridListDAGScheduler("list-hybrid", - "Bottom-up rr list scheduling which avoid stalls for " - "long latency instructions", + "Bottom-up register pressure aware list scheduling " + "which tries to balance latency and register pressure", createHybridListDAGScheduler); +static RegisterScheduler + ILPListDAGScheduler("list-ilp", + "Bottom-up register pressure aware list scheduling " + "which tries to balance ILP and register pressure", + createILPListDAGScheduler); + namespace { //===----------------------------------------------------------------------===// /// ScheduleDAGRRList - The actual register reduction list scheduler @@ -181,7 +188,9 @@ private: /// Schedule - Schedule the DAG using list scheduling. void ScheduleDAGRRList::Schedule() { - DEBUG(dbgs() << "********** List Scheduling **********\n"); + DEBUG(dbgs() + << "********** List Scheduling BB#" << BB->getNumber() + << " **********\n"); NumLiveRegs = 0; LiveRegDefs.resize(TRI->getNumRegs(), NULL); @@ -273,6 +282,8 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) { SU->setHeightToAtLeast(CurCycle); Sequence.push_back(SU); + AvailableQueue->ScheduledNode(SU); + ReleasePredecessors(SU, CurCycle); // Release all the implicit physical register defs that are live. @@ -291,7 +302,6 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) { } SU->isScheduled = true; - AvailableQueue->ScheduledNode(SU); } /// CapturePred - This does the opposite of ReleasePred. 
Since SU is being @@ -315,8 +325,6 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { DEBUG(dbgs() << "*** Unscheduling [" << SU->getHeight() << "]: "); DEBUG(SU->dump(this)); - AvailableQueue->UnscheduledNode(SU); - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { CapturePred(&*I); @@ -346,6 +354,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { SU->isScheduled = false; SU->isAvailable = true; AvailableQueue->push(SU); + AvailableQueue->UnscheduledNode(SU); } /// BacktrackBottomUp - Backtrack scheduling to a previous cycle specified in @@ -956,7 +965,8 @@ namespace { template<class SF> class RegReductionPriorityQueue; - /// Sorting functions for the Available queue. + /// bu_ls_rr_sort - Priority function for bottom up register pressure + // reduction scheduler. struct bu_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> { RegReductionPriorityQueue<bu_ls_rr_sort> *SPQ; bu_ls_rr_sort(RegReductionPriorityQueue<bu_ls_rr_sort> *spq) : SPQ(spq) {} @@ -965,6 +975,8 @@ namespace { bool operator()(const SUnit* left, const SUnit* right) const; }; + // td_ls_rr_sort - Priority function for top down register pressure reduction + // scheduler. struct td_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> { RegReductionPriorityQueue<td_ls_rr_sort> *SPQ; td_ls_rr_sort(RegReductionPriorityQueue<td_ls_rr_sort> *spq) : SPQ(spq) {} @@ -973,6 +985,7 @@ namespace { bool operator()(const SUnit* left, const SUnit* right) const; }; + // src_ls_rr_sort - Priority function for source order scheduler. struct src_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> { RegReductionPriorityQueue<src_ls_rr_sort> *SPQ; src_ls_rr_sort(RegReductionPriorityQueue<src_ls_rr_sort> *spq) @@ -983,13 +996,26 @@ namespace { bool operator()(const SUnit* left, const SUnit* right) const; }; + // hybrid_ls_rr_sort - Priority function for hybrid scheduler. struct hybrid_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> { RegReductionPriorityQueue<hybrid_ls_rr_sort> *SPQ; hybrid_ls_rr_sort(RegReductionPriorityQueue<hybrid_ls_rr_sort> *spq) : SPQ(spq) {} hybrid_ls_rr_sort(const hybrid_ls_rr_sort &RHS) : SPQ(RHS.SPQ) {} - + + bool operator()(const SUnit* left, const SUnit* right) const; + }; + + // ilp_ls_rr_sort - Priority function for ILP (instruction level parallelism) + // scheduler. + struct ilp_ls_rr_sort : public std::binary_function<SUnit*, SUnit*, bool> { + RegReductionPriorityQueue<ilp_ls_rr_sort> *SPQ; + ilp_ls_rr_sort(RegReductionPriorityQueue<ilp_ls_rr_sort> *spq) + : SPQ(spq) {} + ilp_ls_rr_sort(const ilp_ls_rr_sort &RHS) + : SPQ(RHS.SPQ) {} + bool operator()(const SUnit* left, const SUnit* right) const; }; } // end anonymous namespace @@ -1029,23 +1055,48 @@ namespace { std::vector<SUnit*> Queue; SF Picker; unsigned CurQueueId; + bool TracksRegPressure; protected: // SUnits - The SUnits for the current graph. std::vector<SUnit> *SUnits; - + + MachineFunction &MF; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; + const TargetLowering *TLI; ScheduleDAGRRList *scheduleDAG; // SethiUllmanNumbers - The SethiUllman number for each node. std::vector<unsigned> SethiUllmanNumbers; + /// RegPressure - Tracking current reg pressure per register class. + /// + std::vector<unsigned> RegPressure; + + /// RegLimit - Tracking the number of allocatable registers per register + /// class. 
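+    /// Both vectors are indexed by register class ID; RegLimit is seeded
+    /// from TLI->getRegPressureLimit() in the constructor below.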
+ std::vector<unsigned> RegLimit; + public: - RegReductionPriorityQueue(const TargetInstrInfo *tii, - const TargetRegisterInfo *tri) - : Picker(this), CurQueueId(0), - TII(tii), TRI(tri), scheduleDAG(NULL) {} + RegReductionPriorityQueue(MachineFunction &mf, + bool tracksrp, + const TargetInstrInfo *tii, + const TargetRegisterInfo *tri, + const TargetLowering *tli) + : Picker(this), CurQueueId(0), TracksRegPressure(tracksrp), + MF(mf), TII(tii), TRI(tri), TLI(tli), scheduleDAG(NULL) { + if (TracksRegPressure) { + unsigned NumRC = TRI->getNumRegClasses(); + RegLimit.resize(NumRC); + RegPressure.resize(NumRC); + std::fill(RegLimit.begin(), RegLimit.end(), 0); + std::fill(RegPressure.begin(), RegPressure.end(), 0); + for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), + E = TRI->regclass_end(); I != E; ++I) + RegLimit[(*I)->getID()] = tli->getRegPressureLimit(*I, MF); + } + } void initNodes(std::vector<SUnit> &sunits) { SUnits = &sunits; @@ -1072,6 +1123,7 @@ namespace { void releaseState() { SUnits = 0; SethiUllmanNumbers.clear(); + std::fill(RegPressure.begin(), RegPressure.end(), 0); } unsigned getNodePriority(const SUnit *SU) const { @@ -1139,10 +1191,244 @@ namespace { SU->NodeQueueId = 0; } + bool HighRegPressure(const SUnit *SU) const { + if (!TLI) + return false; + + for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end(); + I != E; ++I) { + if (I->isCtrl()) + continue; + SUnit *PredSU = I->getSUnit(); + const SDNode *PN = PredSU->getNode(); + if (!PN->isMachineOpcode()) { + if (PN->getOpcode() == ISD::CopyFromReg) { + EVT VT = PN->getValueType(0); + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + unsigned Cost = TLI->getRepRegClassCostFor(VT); + if ((RegPressure[RCId] + Cost) >= RegLimit[RCId]) + return true; + } + continue; + } + unsigned POpc = PN->getMachineOpcode(); + if (POpc == TargetOpcode::IMPLICIT_DEF) + continue; + if (POpc == TargetOpcode::EXTRACT_SUBREG) { + EVT VT = PN->getOperand(0).getValueType(); + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + unsigned Cost = TLI->getRepRegClassCostFor(VT); + // Check if this increases register pressure of the specific register + // class to the point where it would cause spills. + if ((RegPressure[RCId] + Cost) >= RegLimit[RCId]) + return true; + continue; + } else if (POpc == TargetOpcode::INSERT_SUBREG || + POpc == TargetOpcode::SUBREG_TO_REG) { + EVT VT = PN->getValueType(0); + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + unsigned Cost = TLI->getRepRegClassCostFor(VT); + // Check if this increases register pressure of the specific register + // class to the point where it would cause spills. + if ((RegPressure[RCId] + Cost) >= RegLimit[RCId]) + return true; + continue; + } + unsigned NumDefs = TII->get(PN->getMachineOpcode()).getNumDefs(); + for (unsigned i = 0; i != NumDefs; ++i) { + EVT VT = PN->getValueType(i); + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + if (RegPressure[RCId] >= RegLimit[RCId]) + return true; // Reg pressure already high. + unsigned Cost = TLI->getRepRegClassCostFor(VT); + if (!PN->hasAnyUseOfValue(i)) + continue; + // Check if this increases register pressure of the specific register + // class to the point where it would cause spills. 
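+        // e.g. with a limit of 8 registers, pressure 7 and a cost of 2
+        // gives 7 + 2 >= 8, so the node is treated as high pressure.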
+ if ((RegPressure[RCId] + Cost) >= RegLimit[RCId]) + return true; + } + } + + return false; + } + + void ScheduledNode(SUnit *SU) { + if (!TracksRegPressure) + return; + + const SDNode *N = SU->getNode(); + if (!N->isMachineOpcode()) { + if (N->getOpcode() != ISD::CopyToReg) + return; + } else { + unsigned Opc = N->getMachineOpcode(); + if (Opc == TargetOpcode::EXTRACT_SUBREG || + Opc == TargetOpcode::INSERT_SUBREG || + Opc == TargetOpcode::SUBREG_TO_REG || + Opc == TargetOpcode::REG_SEQUENCE || + Opc == TargetOpcode::IMPLICIT_DEF) + return; + } + + for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) { + if (I->isCtrl()) + continue; + SUnit *PredSU = I->getSUnit(); + if (PredSU->NumSuccsLeft != PredSU->NumSuccs) + continue; + const SDNode *PN = PredSU->getNode(); + if (!PN->isMachineOpcode()) { + if (PN->getOpcode() == ISD::CopyFromReg) { + EVT VT = PN->getValueType(0); + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + RegPressure[RCId] += TLI->getRepRegClassCostFor(VT); + } + continue; + } + unsigned POpc = PN->getMachineOpcode(); + if (POpc == TargetOpcode::IMPLICIT_DEF) + continue; + if (POpc == TargetOpcode::EXTRACT_SUBREG) { + EVT VT = PN->getOperand(0).getValueType(); + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + RegPressure[RCId] += TLI->getRepRegClassCostFor(VT); + continue; + } else if (POpc == TargetOpcode::INSERT_SUBREG || + POpc == TargetOpcode::SUBREG_TO_REG) { + EVT VT = PN->getValueType(0); + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + RegPressure[RCId] += TLI->getRepRegClassCostFor(VT); + continue; + } + unsigned NumDefs = TII->get(PN->getMachineOpcode()).getNumDefs(); + for (unsigned i = 0; i != NumDefs; ++i) { + EVT VT = PN->getValueType(i); + if (!PN->hasAnyUseOfValue(i)) + continue; + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + RegPressure[RCId] += TLI->getRepRegClassCostFor(VT); + } + } + + // Check for isMachineOpcode() as PrescheduleNodesWithMultipleUses() + // may transfer data dependencies to CopyToReg. + if (SU->NumSuccs && N->isMachineOpcode()) { + unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs(); + for (unsigned i = 0; i != NumDefs; ++i) { + EVT VT = N->getValueType(i); + if (!N->hasAnyUseOfValue(i)) + continue; + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + if (RegPressure[RCId] < TLI->getRepRegClassCostFor(VT)) + // Register pressure tracking is imprecise. This can happen. 
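+          // Clamp at zero rather than letting the unsigned subtraction
+          // wrap around to a huge pressure value.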
+ RegPressure[RCId] = 0; + else + RegPressure[RCId] -= TLI->getRepRegClassCostFor(VT); + } + } + + dumpRegPressure(); + } + + void UnscheduledNode(SUnit *SU) { + if (!TracksRegPressure) + return; + + const SDNode *N = SU->getNode(); + if (!N->isMachineOpcode()) { + if (N->getOpcode() != ISD::CopyToReg) + return; + } else { + unsigned Opc = N->getMachineOpcode(); + if (Opc == TargetOpcode::EXTRACT_SUBREG || + Opc == TargetOpcode::INSERT_SUBREG || + Opc == TargetOpcode::SUBREG_TO_REG || + Opc == TargetOpcode::REG_SEQUENCE || + Opc == TargetOpcode::IMPLICIT_DEF) + return; + } + + for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) { + if (I->isCtrl()) + continue; + SUnit *PredSU = I->getSUnit(); + if (PredSU->NumSuccsLeft != PredSU->NumSuccs) + continue; + const SDNode *PN = PredSU->getNode(); + if (!PN->isMachineOpcode()) { + if (PN->getOpcode() == ISD::CopyFromReg) { + EVT VT = PN->getValueType(0); + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + RegPressure[RCId] += TLI->getRepRegClassCostFor(VT); + } + continue; + } + unsigned POpc = PN->getMachineOpcode(); + if (POpc == TargetOpcode::IMPLICIT_DEF) + continue; + if (POpc == TargetOpcode::EXTRACT_SUBREG) { + EVT VT = PN->getOperand(0).getValueType(); + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + RegPressure[RCId] += TLI->getRepRegClassCostFor(VT); + continue; + } else if (POpc == TargetOpcode::INSERT_SUBREG || + POpc == TargetOpcode::SUBREG_TO_REG) { + EVT VT = PN->getValueType(0); + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + RegPressure[RCId] += TLI->getRepRegClassCostFor(VT); + continue; + } + unsigned NumDefs = TII->get(PN->getMachineOpcode()).getNumDefs(); + for (unsigned i = 0; i != NumDefs; ++i) { + EVT VT = PN->getValueType(i); + if (!PN->hasAnyUseOfValue(i)) + continue; + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + if (RegPressure[RCId] < TLI->getRepRegClassCostFor(VT)) + // Register pressure tracking is imprecise. This can happen. + RegPressure[RCId] = 0; + else + RegPressure[RCId] -= TLI->getRepRegClassCostFor(VT); + } + } + + // Check for isMachineOpcode() as PrescheduleNodesWithMultipleUses() + // may transfer data dependencies to CopyToReg. 
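+    // Re-add the pressure for the values this node defines, undoing the
+    // decrement that ScheduledNode performed when SU was scheduled.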
+ if (SU->NumSuccs && N->isMachineOpcode()) { + unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs(); + for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) { + EVT VT = N->getValueType(i); + if (VT == MVT::Flag || VT == MVT::Other) + continue; + if (!N->hasAnyUseOfValue(i)) + continue; + unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); + RegPressure[RCId] += TLI->getRepRegClassCostFor(VT); + } + } + + dumpRegPressure(); + } + void setScheduleDAG(ScheduleDAGRRList *scheduleDag) { scheduleDAG = scheduleDag; } + void dumpRegPressure() const { + for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), + E = TRI->regclass_end(); I != E; ++I) { + const TargetRegisterClass *RC = *I; + unsigned Id = RC->getID(); + unsigned RP = RegPressure[Id]; + if (!RP) continue; + DEBUG(dbgs() << RC->getName() << ": " << RP << " / " << RegLimit[Id] + << '\n'); + } + } + protected: bool canClobber(const SUnit *SU, const SUnit *Op); void AddPseudoTwoAddrDeps(); @@ -1161,6 +1447,9 @@ namespace { typedef RegReductionPriorityQueue<hybrid_ls_rr_sort> HybridBURRPriorityQueue; + + typedef RegReductionPriorityQueue<ilp_ls_rr_sort> + ILPBURRPriorityQueue; } /// closestSucc - Returns the scheduled cycle of the successor which is @@ -1260,30 +1549,63 @@ bool src_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const { } bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{ - bool LStall = left->SchedulingPref == Sched::Latency && - SPQ->getCurCycle() < left->getHeight(); - bool RStall = right->SchedulingPref == Sched::Latency && - SPQ->getCurCycle() < right->getHeight(); - // If scheduling one of the node will cause a pipeline stall, delay it. - // If scheduling either one of the node will cause a pipeline stall, sort them - // according to their height. - // If neither will cause a pipeline stall, try to reduce register pressure. - if (LStall) { - if (!RStall) - return true; - if (left->getHeight() != right->getHeight()) - return left->getHeight() > right->getHeight(); - } else if (RStall) + bool LHigh = SPQ->HighRegPressure(left); + bool RHigh = SPQ->HighRegPressure(right); + // Avoid causing spills. If register pressure is high, schedule for + // register pressure reduction. + if (LHigh && !RHigh) + return true; + else if (!LHigh && RHigh) + return false; + else if (!LHigh && !RHigh) { + // Low register pressure situation, schedule for latency if possible. + bool LStall = left->SchedulingPref == Sched::Latency && + SPQ->getCurCycle() < left->getHeight(); + bool RStall = right->SchedulingPref == Sched::Latency && + SPQ->getCurCycle() < right->getHeight(); + // If scheduling one of the node will cause a pipeline stall, delay it. + // If scheduling either one of the node will cause a pipeline stall, sort + // them according to their height. + // If neither will cause a pipeline stall, try to reduce register pressure. + if (LStall) { + if (!RStall) + return true; + if (left->getHeight() != right->getHeight()) + return left->getHeight() > right->getHeight(); + } else if (RStall) return false; - // If either node is scheduling for latency, sort them by height and latency - // first. - if (left->SchedulingPref == Sched::Latency || - right->SchedulingPref == Sched::Latency) { - if (left->getHeight() != right->getHeight()) - return left->getHeight() > right->getHeight(); - if (left->Latency != right->Latency) - return left->Latency > right->Latency; + // If either node is scheduling for latency, sort them by height and latency + // first. 
+    if (left->SchedulingPref == Sched::Latency ||
+        right->SchedulingPref == Sched::Latency) {
+      if (left->getHeight() != right->getHeight())
+        return left->getHeight() > right->getHeight();
+      if (left->Latency != right->Latency)
+        return left->Latency > right->Latency;
+    }
+  }
+
+  return BURRSort(left, right, SPQ);
+}
+
+bool ilp_ls_rr_sort::operator()(const SUnit *left,
+                                const SUnit *right) const {
+  bool LHigh = SPQ->HighRegPressure(left);
+  bool RHigh = SPQ->HighRegPressure(right);
+  // Avoid causing spills. If register pressure is high, schedule for
+  // register pressure reduction.
+  if (LHigh && !RHigh)
+    return true;
+  else if (!LHigh && RHigh)
+    return false;
+  else if (!LHigh && !RHigh) {
+    // Low register pressure situation, schedule to maximize instruction level
+    // parallelism.
+    if (left->NumPreds > right->NumPreds)
+      return false;
+    else if (left->NumPreds < right->NumPreds)
+      return true;
   }
 
   return BURRSort(left, right, SPQ);
@@ -1635,8 +1957,8 @@ llvm::createBURRListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
   const TargetInstrInfo *TII = TM.getInstrInfo();
   const TargetRegisterInfo *TRI = TM.getRegisterInfo();
 
-  BURegReductionPriorityQueue *PQ = new BURegReductionPriorityQueue(TII, TRI);
-
+  BURegReductionPriorityQueue *PQ =
+    new BURegReductionPriorityQueue(*IS->MF, false, TII, TRI, 0);
   ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, false, PQ);
   PQ->setScheduleDAG(SD);
   return SD;
@@ -1648,8 +1970,8 @@ llvm::createTDRRListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
   const TargetInstrInfo *TII = TM.getInstrInfo();
   const TargetRegisterInfo *TRI = TM.getRegisterInfo();
 
-  TDRegReductionPriorityQueue *PQ = new TDRegReductionPriorityQueue(TII, TRI);
-
+  TDRegReductionPriorityQueue *PQ =
+    new TDRegReductionPriorityQueue(*IS->MF, false, TII, TRI, 0);
   ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, false, PQ);
   PQ->setScheduleDAG(SD);
   return SD;
@@ -1661,8 +1983,8 @@ llvm::createSourceListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
   const TargetInstrInfo *TII = TM.getInstrInfo();
   const TargetRegisterInfo *TRI = TM.getRegisterInfo();
 
-  SrcRegReductionPriorityQueue *PQ = new SrcRegReductionPriorityQueue(TII, TRI);
-
+  SrcRegReductionPriorityQueue *PQ =
+    new SrcRegReductionPriorityQueue(*IS->MF, false, TII, TRI, 0);
   ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, false, PQ);
   PQ->setScheduleDAG(SD);
   return SD;
@@ -1673,9 +1995,24 @@ llvm::createHybridListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
   const TargetMachine &TM = IS->TM;
   const TargetInstrInfo *TII = TM.getInstrInfo();
   const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetLowering *TLI = &IS->getTargetLowering();
 
-  HybridBURRPriorityQueue *PQ = new HybridBURRPriorityQueue(TII, TRI);
+  HybridBURRPriorityQueue *PQ =
+    new HybridBURRPriorityQueue(*IS->MF, true, TII, TRI, TLI);
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, true, PQ);
+  PQ->setScheduleDAG(SD);
+  return SD;
+}
+llvm::ScheduleDAGSDNodes *
+llvm::createILPListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+  const TargetMachine &TM = IS->TM;
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetLowering *TLI = &IS->getTargetLowering();
+
+  ILPBURRPriorityQueue *PQ =
+    new ILPBURRPriorityQueue(*IS->MF, true, TII, TRI, TLI);
   ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, true, PQ);
   PQ->setScheduleDAG(SD);
   return SD;
diff --git
a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 06cf053087550..f1bf82ab145a4 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -59,8 +59,9 @@ SUnit *ScheduleDAGSDNodes::NewSUnit(SDNode *N) { SUnits.back().OrigNode = &SUnits.back(); SUnit *SU = &SUnits.back(); const TargetLowering &TLI = DAG->getTargetLoweringInfo(); - if (N->isMachineOpcode() && - N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) + if (!N || + (N->isMachineOpcode() && + N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF)) SU->SchedulingPref = Sched::None; else SU->SchedulingPref = TLI.getSchedulingPreference(N); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index e83a0346b5351..ad06ebda5b007 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2236,7 +2236,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ bool SelectionDAG::isKnownNeverNaN(SDValue Op) const { // If we're told that NaNs won't happen, assume they won't. - if (FiniteOnlyFPMath()) + if (NoNaNsFPMath) return true; // If the value is a constant, we can obviously see if it is a NaN or not. @@ -2281,35 +2281,6 @@ bool SelectionDAG::isVerifiedDebugInfoDesc(SDValue Op) const { } -/// getShuffleScalarElt - Returns the scalar element that will make up the ith -/// element of the result of the vector shuffle. -SDValue SelectionDAG::getShuffleScalarElt(const ShuffleVectorSDNode *N, - unsigned i) { - EVT VT = N->getValueType(0); - if (N->getMaskElt(i) < 0) - return getUNDEF(VT.getVectorElementType()); - unsigned Index = N->getMaskElt(i); - unsigned NumElems = VT.getVectorNumElements(); - SDValue V = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1); - Index %= NumElems; - - if (V.getOpcode() == ISD::BIT_CONVERT) { - V = V.getOperand(0); - EVT VVT = V.getValueType(); - if (!VVT.isVector() || VVT.getVectorNumElements() != (unsigned)NumElems) - return SDValue(); - } - if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) - return (Index == 0) ? V.getOperand(0) - : getUNDEF(VT.getVectorElementType()); - if (V.getOpcode() == ISD::BUILD_VECTOR) - return V.getOperand(Index); - if (const ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(V)) - return getShuffleScalarElt(SVN, Index); - return SDValue(); -} - - /// getNode - Gets or creates the specified node. /// SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT) { @@ -2624,7 +2595,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, // one big BUILD_VECTOR. 
if (N1.getOpcode() == ISD::BUILD_VECTOR && N2.getOpcode() == ISD::BUILD_VECTOR) { - SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(), N1.getNode()->op_end()); + SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(), + N1.getNode()->op_end()); Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end()); return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size()); } @@ -3021,7 +2993,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, if (N1.getOpcode() == ISD::BUILD_VECTOR && N2.getOpcode() == ISD::BUILD_VECTOR && N3.getOpcode() == ISD::BUILD_VECTOR) { - SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(), N1.getNode()->op_end()); + SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(), + N1.getNode()->op_end()); Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end()); Elts.append(N3.getNode()->op_begin(), N3.getNode()->op_end()); return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size()); @@ -5872,6 +5845,7 @@ std::string ISD::ArgFlagsTy::getArgFlagsString() { void SDNode::dump() const { dump(0); } void SDNode::dump(const SelectionDAG *G) const { print(dbgs(), G); + dbgs() << '\n'; } void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const { @@ -5895,7 +5869,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { for (MachineSDNode::mmo_iterator i = MN->memoperands_begin(), e = MN->memoperands_end(); i != e; ++i) { OS << **i; - if (next(i) != e) + if (llvm::next(i) != e) OS << " "; } OS << ">"; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 458e865a6b3c8..e65744592c8bb 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -70,22 +70,29 @@ LimitFPPrecision("limit-float-precision", cl::location(LimitFloatPrecision), cl::init(0)); +static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL, + const SDValue *Parts, unsigned NumParts, + EVT PartVT, EVT ValueVT); + /// getCopyFromParts - Create a value that contains the specified legal parts /// combined into the value they represent. If the parts combine to a type /// larger then ValueVT then AssertOp can be used to specify whether the extra /// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT /// (ISD::AssertSext). -static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc dl, +static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc DL, const SDValue *Parts, unsigned NumParts, EVT PartVT, EVT ValueVT, ISD::NodeType AssertOp = ISD::DELETED_NODE) { + if (ValueVT.isVector()) + return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT); + assert(NumParts > 0 && "No parts to assemble!"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Val = Parts[0]; if (NumParts > 1) { // Assemble the value from multiple parts. 
- if (!ValueVT.isVector() && ValueVT.isInteger()) { + if (ValueVT.isInteger()) { unsigned PartBits = PartVT.getSizeInBits(); unsigned ValueBits = ValueVT.getSizeInBits(); @@ -100,25 +107,25 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc dl, EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2); if (RoundParts > 2) { - Lo = getCopyFromParts(DAG, dl, Parts, RoundParts / 2, + Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2, PartVT, HalfVT); - Hi = getCopyFromParts(DAG, dl, Parts + RoundParts / 2, + Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2, RoundParts / 2, PartVT, HalfVT); } else { - Lo = DAG.getNode(ISD::BIT_CONVERT, dl, HalfVT, Parts[0]); - Hi = DAG.getNode(ISD::BIT_CONVERT, dl, HalfVT, Parts[1]); + Lo = DAG.getNode(ISD::BIT_CONVERT, DL, HalfVT, Parts[0]); + Hi = DAG.getNode(ISD::BIT_CONVERT, DL, HalfVT, Parts[1]); } if (TLI.isBigEndian()) std::swap(Lo, Hi); - Val = DAG.getNode(ISD::BUILD_PAIR, dl, RoundVT, Lo, Hi); + Val = DAG.getNode(ISD::BUILD_PAIR, DL, RoundVT, Lo, Hi); if (RoundParts < NumParts) { // Assemble the trailing non-power-of-2 part. unsigned OddParts = NumParts - RoundParts; EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits); - Hi = getCopyFromParts(DAG, dl, + Hi = getCopyFromParts(DAG, DL, Parts + RoundParts, OddParts, PartVT, OddVT); // Combine the round and odd parts. @@ -126,68 +133,29 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc dl, if (TLI.isBigEndian()) std::swap(Lo, Hi); EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); - Hi = DAG.getNode(ISD::ANY_EXTEND, dl, TotalVT, Hi); - Hi = DAG.getNode(ISD::SHL, dl, TotalVT, Hi, + Hi = DAG.getNode(ISD::ANY_EXTEND, DL, TotalVT, Hi); + Hi = DAG.getNode(ISD::SHL, DL, TotalVT, Hi, DAG.getConstant(Lo.getValueType().getSizeInBits(), TLI.getPointerTy())); - Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, TotalVT, Lo); - Val = DAG.getNode(ISD::OR, dl, TotalVT, Lo, Hi); - } - } else if (ValueVT.isVector()) { - // Handle a multi-element vector. - EVT IntermediateVT, RegisterVT; - unsigned NumIntermediates; - unsigned NumRegs = - TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT, - NumIntermediates, RegisterVT); - assert(NumRegs == NumParts - && "Part count doesn't match vector breakdown!"); - NumParts = NumRegs; // Silence a compiler warning. - assert(RegisterVT == PartVT - && "Part type doesn't match vector breakdown!"); - assert(RegisterVT == Parts[0].getValueType() && - "Part type doesn't match part!"); - - // Assemble the parts into intermediate operands. - SmallVector<SDValue, 8> Ops(NumIntermediates); - if (NumIntermediates == NumParts) { - // If the register was not expanded, truncate or copy the value, - // as appropriate. - for (unsigned i = 0; i != NumParts; ++i) - Ops[i] = getCopyFromParts(DAG, dl, &Parts[i], 1, - PartVT, IntermediateVT); - } else if (NumParts > 0) { - // If the intermediate type was expanded, build the intermediate - // operands from the parts. - assert(NumParts % NumIntermediates == 0 && - "Must expand into a divisible number of parts!"); - unsigned Factor = NumParts / NumIntermediates; - for (unsigned i = 0; i != NumIntermediates; ++i) - Ops[i] = getCopyFromParts(DAG, dl, &Parts[i * Factor], Factor, - PartVT, IntermediateVT); + Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, TotalVT, Lo); + Val = DAG.getNode(ISD::OR, DL, TotalVT, Lo, Hi); } - - // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the - // intermediate operands. - Val = DAG.getNode(IntermediateVT.isVector() ? 
-                     ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, dl,
-                     ValueVT, &Ops[0], NumIntermediates);
   } else if (PartVT.isFloatingPoint()) {
     // FP split into multiple FP parts (for ppcf128)
     assert(ValueVT == EVT(MVT::ppcf128) && PartVT == EVT(MVT::f64) &&
            "Unexpected split");
     SDValue Lo, Hi;
-    Lo = DAG.getNode(ISD::BIT_CONVERT, dl, EVT(MVT::f64), Parts[0]);
-    Hi = DAG.getNode(ISD::BIT_CONVERT, dl, EVT(MVT::f64), Parts[1]);
+    Lo = DAG.getNode(ISD::BIT_CONVERT, DL, EVT(MVT::f64), Parts[0]);
+    Hi = DAG.getNode(ISD::BIT_CONVERT, DL, EVT(MVT::f64), Parts[1]);
     if (TLI.isBigEndian())
       std::swap(Lo, Hi);
-    Val = DAG.getNode(ISD::BUILD_PAIR, dl, ValueVT, Lo, Hi);
+    Val = DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Lo, Hi);
   } else {
     // FP split into integer parts (soft fp)
     assert(ValueVT.isFloatingPoint() && PartVT.isInteger() &&
            !PartVT.isVector() && "Unexpected split");
     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
-    Val = getCopyFromParts(DAG, dl, Parts, NumParts, PartVT, IntVT);
+    Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT);
   }
 }
@@ -197,219 +165,315 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc dl,
   if (PartVT == ValueVT)
     return Val;
 
-  if (PartVT.isVector()) {
-    assert(ValueVT.isVector() && "Unknown vector conversion!");
-    return DAG.getNode(ISD::BIT_CONVERT, dl, ValueVT, Val);
-  }
-
-  if (ValueVT.isVector()) {
-    assert(ValueVT.getVectorElementType() == PartVT &&
-           ValueVT.getVectorNumElements() == 1 &&
-           "Only trivial scalar-to-vector conversions should get here!");
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, ValueVT, Val);
-  }
-
-  if (PartVT.isInteger() &&
-      ValueVT.isInteger()) {
+  if (PartVT.isInteger() && ValueVT.isInteger()) {
     if (ValueVT.bitsLT(PartVT)) {
       // For a truncate, see if we have any information to
       // indicate whether the truncated bits will always be
       // zero or sign-extension.
       if (AssertOp != ISD::DELETED_NODE)
-        Val = DAG.getNode(AssertOp, dl, PartVT, Val,
+        Val = DAG.getNode(AssertOp, DL, PartVT, Val,
                           DAG.getValueType(ValueVT));
-      return DAG.getNode(ISD::TRUNCATE, dl, ValueVT, Val);
-    } else {
-      return DAG.getNode(ISD::ANY_EXTEND, dl, ValueVT, Val);
+      return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
     }
+    return DAG.getNode(ISD::ANY_EXTEND, DL, ValueVT, Val);
   }
 
   if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
-    if (ValueVT.bitsLT(Val.getValueType())) {
-      // FP_ROUND's are always exact here.
-      return DAG.getNode(ISD::FP_ROUND, dl, ValueVT, Val,
+    // FP_ROUND's are always exact here.
+    if (ValueVT.bitsLT(Val.getValueType()))
+      return DAG.getNode(ISD::FP_ROUND, DL, ValueVT, Val,
                          DAG.getIntPtrConstant(1));
-    }
-    return DAG.getNode(ISD::FP_EXTEND, dl, ValueVT, Val);
+    return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val);
   }
 
   if (PartVT.getSizeInBits() == ValueVT.getSizeInBits())
-    return DAG.getNode(ISD::BIT_CONVERT, dl, ValueVT, Val);
+    return DAG.getNode(ISD::BIT_CONVERT, DL, ValueVT, Val);
 
   llvm_unreachable("Unknown mismatch!");
   return SDValue();
 }
 
+/// getCopyFromPartsVector - Create a value that contains the specified legal
+/// parts combined into the value they represent.  This handles the vector
+/// case: the parts are assembled with BUILD_VECTOR or CONCAT_VECTORS and the
+/// result is then corrected to match ValueVT.
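+/// For example, four i32 parts may be built into a single v4i32, or two
+/// v2f32 parts concatenated into a v4f32.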
+static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL, + const SDValue *Parts, unsigned NumParts, + EVT PartVT, EVT ValueVT) { + assert(ValueVT.isVector() && "Not a vector value"); + assert(NumParts > 0 && "No parts to assemble!"); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Val = Parts[0]; + + // Handle a multi-element vector. + if (NumParts > 1) { + EVT IntermediateVT, RegisterVT; + unsigned NumIntermediates; + unsigned NumRegs = + TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT, + NumIntermediates, RegisterVT); + assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!"); + NumParts = NumRegs; // Silence a compiler warning. + assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!"); + assert(RegisterVT == Parts[0].getValueType() && + "Part type doesn't match part!"); + + // Assemble the parts into intermediate operands. + SmallVector<SDValue, 8> Ops(NumIntermediates); + if (NumIntermediates == NumParts) { + // If the register was not expanded, truncate or copy the value, + // as appropriate. + for (unsigned i = 0; i != NumParts; ++i) + Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1, + PartVT, IntermediateVT); + } else if (NumParts > 0) { + // If the intermediate type was expanded, build the intermediate + // operands from the parts. + assert(NumParts % NumIntermediates == 0 && + "Must expand into a divisible number of parts!"); + unsigned Factor = NumParts / NumIntermediates; + for (unsigned i = 0; i != NumIntermediates; ++i) + Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor, + PartVT, IntermediateVT); + } + + // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the + // intermediate operands. + Val = DAG.getNode(IntermediateVT.isVector() ? + ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, DL, + ValueVT, &Ops[0], NumIntermediates); + } + + // There is now one part, held in Val. Correct it to match ValueVT. + PartVT = Val.getValueType(); + + if (PartVT == ValueVT) + return Val; + + if (PartVT.isVector()) { + // If the element type of the source/dest vectors are the same, but the + // parts vector has more elements than the value vector, then we have a + // vector widening case (e.g. <2 x float> -> <4 x float>). Extract the + // elements we want. + if (PartVT.getVectorElementType() == ValueVT.getVectorElementType()) { + assert(PartVT.getVectorNumElements() > ValueVT.getVectorNumElements() && + "Cannot narrow, it would be a lossy transformation"); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val, + DAG.getIntPtrConstant(0)); + } + + // Vector/Vector bitcast. + return DAG.getNode(ISD::BIT_CONVERT, DL, ValueVT, Val); + } + + assert(ValueVT.getVectorElementType() == PartVT && + ValueVT.getVectorNumElements() == 1 && + "Only trivial scalar-to-vector conversions should get here!"); + return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val); +} + + + + +static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc dl, + SDValue Val, SDValue *Parts, unsigned NumParts, + EVT PartVT); + /// getCopyToParts - Create a series of nodes that contain the specified value /// split into legal parts. If the parts contain more bits than Val, then, for /// integers, ExtendKind can be used to specify how to generate the extra bits. 
-static void getCopyToParts(SelectionDAG &DAG, DebugLoc dl, +static void getCopyToParts(SelectionDAG &DAG, DebugLoc DL, SDValue Val, SDValue *Parts, unsigned NumParts, EVT PartVT, ISD::NodeType ExtendKind = ISD::ANY_EXTEND) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT PtrVT = TLI.getPointerTy(); EVT ValueVT = Val.getValueType(); + + // Handle the vector case separately. + if (ValueVT.isVector()) + return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned PartBits = PartVT.getSizeInBits(); unsigned OrigNumParts = NumParts; assert(TLI.isTypeLegal(PartVT) && "Copying to an illegal type!"); - if (!NumParts) + if (NumParts == 0) return; - if (!ValueVT.isVector()) { - if (PartVT == ValueVT) { - assert(NumParts == 1 && "No-op copy with multiple parts!"); - Parts[0] = Val; - return; - } - - if (NumParts * PartBits > ValueVT.getSizeInBits()) { - // If the parts cover more bits than the value has, promote the value. - if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) { - assert(NumParts == 1 && "Do not know what to promote to!"); - Val = DAG.getNode(ISD::FP_EXTEND, dl, PartVT, Val); - } else if (PartVT.isInteger() && ValueVT.isInteger()) { - ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); - Val = DAG.getNode(ExtendKind, dl, ValueVT, Val); - } else { - llvm_unreachable("Unknown mismatch!"); - } - } else if (PartBits == ValueVT.getSizeInBits()) { - // Different types of the same size. - assert(NumParts == 1 && PartVT != ValueVT); - Val = DAG.getNode(ISD::BIT_CONVERT, dl, PartVT, Val); - } else if (NumParts * PartBits < ValueVT.getSizeInBits()) { - // If the parts cover less bits than value has, truncate the value. - if (PartVT.isInteger() && ValueVT.isInteger()) { - ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); - Val = DAG.getNode(ISD::TRUNCATE, dl, ValueVT, Val); - } else { - llvm_unreachable("Unknown mismatch!"); - } - } - - // The value may have changed - recompute ValueVT. - ValueVT = Val.getValueType(); - assert(NumParts * PartBits == ValueVT.getSizeInBits() && - "Failed to tile the value with PartVT!"); - - if (NumParts == 1) { - assert(PartVT == ValueVT && "Type conversion failed!"); - Parts[0] = Val; - return; - } + assert(!ValueVT.isVector() && "Vector case handled elsewhere"); + if (PartVT == ValueVT) { + assert(NumParts == 1 && "No-op copy with multiple parts!"); + Parts[0] = Val; + return; + } - // Expand the value into multiple parts. - if (NumParts & (NumParts - 1)) { - // The number of parts is not a power of 2. Split off and copy the tail. + if (NumParts * PartBits > ValueVT.getSizeInBits()) { + // If the parts cover more bits than the value has, promote the value. + if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) { + assert(NumParts == 1 && "Do not know what to promote to!"); + Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val); + } else { assert(PartVT.isInteger() && ValueVT.isInteger() && - "Do not know what to expand to!"); - unsigned RoundParts = 1 << Log2_32(NumParts); - unsigned RoundBits = RoundParts * PartBits; - unsigned OddParts = NumParts - RoundParts; - SDValue OddVal = DAG.getNode(ISD::SRL, dl, ValueVT, Val, - DAG.getConstant(RoundBits, - TLI.getPointerTy())); - getCopyToParts(DAG, dl, OddVal, Parts + RoundParts, - OddParts, PartVT); - - if (TLI.isBigEndian()) - // The odd parts were reversed by getCopyToParts - unreverse them. 
- std::reverse(Parts + RoundParts, Parts + NumParts); - - NumParts = RoundParts; + "Unknown mismatch!"); ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); - Val = DAG.getNode(ISD::TRUNCATE, dl, ValueVT, Val); + Val = DAG.getNode(ExtendKind, DL, ValueVT, Val); } + } else if (PartBits == ValueVT.getSizeInBits()) { + // Different types of the same size. + assert(NumParts == 1 && PartVT != ValueVT); + Val = DAG.getNode(ISD::BIT_CONVERT, DL, PartVT, Val); + } else if (NumParts * PartBits < ValueVT.getSizeInBits()) { + // If the parts cover less bits than value has, truncate the value. + assert(PartVT.isInteger() && ValueVT.isInteger() && + "Unknown mismatch!"); + ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val); + } + + // The value may have changed - recompute ValueVT. + ValueVT = Val.getValueType(); + assert(NumParts * PartBits == ValueVT.getSizeInBits() && + "Failed to tile the value with PartVT!"); - // The number of parts is a power of 2. Repeatedly bisect the value using - // EXTRACT_ELEMENT. - Parts[0] = DAG.getNode(ISD::BIT_CONVERT, dl, - EVT::getIntegerVT(*DAG.getContext(), - ValueVT.getSizeInBits()), - Val); - - for (unsigned StepSize = NumParts; StepSize > 1; StepSize /= 2) { - for (unsigned i = 0; i < NumParts; i += StepSize) { - unsigned ThisBits = StepSize * PartBits / 2; - EVT ThisVT = EVT::getIntegerVT(*DAG.getContext(), ThisBits); - SDValue &Part0 = Parts[i]; - SDValue &Part1 = Parts[i+StepSize/2]; - - Part1 = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, - ThisVT, Part0, - DAG.getConstant(1, PtrVT)); - Part0 = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, - ThisVT, Part0, - DAG.getConstant(0, PtrVT)); - - if (ThisBits == PartBits && ThisVT != PartVT) { - Part0 = DAG.getNode(ISD::BIT_CONVERT, dl, - PartVT, Part0); - Part1 = DAG.getNode(ISD::BIT_CONVERT, dl, - PartVT, Part1); - } + if (NumParts == 1) { + assert(PartVT == ValueVT && "Type conversion failed!"); + Parts[0] = Val; + return; + } + + // Expand the value into multiple parts. + if (NumParts & (NumParts - 1)) { + // The number of parts is not a power of 2. Split off and copy the tail. + assert(PartVT.isInteger() && ValueVT.isInteger() && + "Do not know what to expand to!"); + unsigned RoundParts = 1 << Log2_32(NumParts); + unsigned RoundBits = RoundParts * PartBits; + unsigned OddParts = NumParts - RoundParts; + SDValue OddVal = DAG.getNode(ISD::SRL, DL, ValueVT, Val, + DAG.getIntPtrConstant(RoundBits)); + getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT); + + if (TLI.isBigEndian()) + // The odd parts were reversed by getCopyToParts - unreverse them. + std::reverse(Parts + RoundParts, Parts + NumParts); + + NumParts = RoundParts; + ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val); + } + + // The number of parts is a power of 2. Repeatedly bisect the value using + // EXTRACT_ELEMENT. 
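+  // e.g. an i64 value copied into four i16 parts is first bisected into two
+  // i32 halves, then each half into two i16 quarters.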
+ Parts[0] = DAG.getNode(ISD::BIT_CONVERT, DL, + EVT::getIntegerVT(*DAG.getContext(), + ValueVT.getSizeInBits()), + Val); + + for (unsigned StepSize = NumParts; StepSize > 1; StepSize /= 2) { + for (unsigned i = 0; i < NumParts; i += StepSize) { + unsigned ThisBits = StepSize * PartBits / 2; + EVT ThisVT = EVT::getIntegerVT(*DAG.getContext(), ThisBits); + SDValue &Part0 = Parts[i]; + SDValue &Part1 = Parts[i+StepSize/2]; + + Part1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, + ThisVT, Part0, DAG.getIntPtrConstant(1)); + Part0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, + ThisVT, Part0, DAG.getIntPtrConstant(0)); + + if (ThisBits == PartBits && ThisVT != PartVT) { + Part0 = DAG.getNode(ISD::BIT_CONVERT, DL, PartVT, Part0); + Part1 = DAG.getNode(ISD::BIT_CONVERT, DL, PartVT, Part1); } } + } - if (TLI.isBigEndian()) - std::reverse(Parts, Parts + OrigNumParts); + if (TLI.isBigEndian()) + std::reverse(Parts, Parts + OrigNumParts); +} - return; - } - // Vector ValueVT. +/// getCopyToPartsVector - Create a series of nodes that contain the specified +/// value split into legal parts. +static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL, + SDValue Val, SDValue *Parts, unsigned NumParts, + EVT PartVT) { + EVT ValueVT = Val.getValueType(); + assert(ValueVT.isVector() && "Not a vector"); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (NumParts == 1) { - if (PartVT != ValueVT) { - if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) { - Val = DAG.getNode(ISD::BIT_CONVERT, dl, PartVT, Val); - } else { - assert(ValueVT.getVectorElementType() == PartVT && - ValueVT.getVectorNumElements() == 1 && - "Only trivial vector-to-scalar conversions should get here!"); - Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - PartVT, Val, - DAG.getConstant(0, PtrVT)); - } - } + if (PartVT == ValueVT) { + // Nothing to do. + } else if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) { + // Bitconvert vector->vector case. + Val = DAG.getNode(ISD::BIT_CONVERT, DL, PartVT, Val); + } else if (PartVT.isVector() && + PartVT.getVectorElementType() == ValueVT.getVectorElementType()&& + PartVT.getVectorNumElements() > ValueVT.getVectorNumElements()) { + EVT ElementVT = PartVT.getVectorElementType(); + // Vector widening case, e.g. <2 x float> -> <4 x float>. Shuffle in + // undef elements. + SmallVector<SDValue, 16> Ops; + for (unsigned i = 0, e = ValueVT.getVectorNumElements(); i != e; ++i) + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + ElementVT, Val, DAG.getIntPtrConstant(i))); + + for (unsigned i = ValueVT.getVectorNumElements(), + e = PartVT.getVectorNumElements(); i != e; ++i) + Ops.push_back(DAG.getUNDEF(ElementVT)); + + Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT, &Ops[0], Ops.size()); + // FIXME: Use CONCAT for 2x -> 4x. + + //SDValue UndefElts = DAG.getUNDEF(VectorTy); + //Val = DAG.getNode(ISD::CONCAT_VECTORS, DL, PartVT, Val, UndefElts); + } else { + // Vector -> scalar conversion. + assert(ValueVT.getVectorElementType() == PartVT && + ValueVT.getVectorNumElements() == 1 && + "Only trivial vector-to-scalar conversions should get here!"); + Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + PartVT, Val, DAG.getIntPtrConstant(0)); + } + Parts[0] = Val; return; } - + // Handle a multi-element vector. 
EVT IntermediateVT, RegisterVT; unsigned NumIntermediates; unsigned NumRegs = TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, - IntermediateVT, NumIntermediates, RegisterVT); + IntermediateVT, + NumIntermediates, RegisterVT); unsigned NumElements = ValueVT.getVectorNumElements(); - + assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!"); NumParts = NumRegs; // Silence a compiler warning. assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!"); - + // Split the vector into intermediate operands. SmallVector<SDValue, 8> Ops(NumIntermediates); for (unsigned i = 0; i != NumIntermediates; ++i) { if (IntermediateVT.isVector()) - Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, + Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val, - DAG.getConstant(i * (NumElements / NumIntermediates), - PtrVT)); + DAG.getIntPtrConstant(i * (NumElements / NumIntermediates))); else - Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - IntermediateVT, Val, - DAG.getConstant(i, PtrVT)); + Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + IntermediateVT, Val, DAG.getIntPtrConstant(i)); } - + // Split the intermediate operands into legal parts. if (NumParts == NumIntermediates) { // If the register was not expanded, promote or copy the value, // as appropriate. for (unsigned i = 0; i != NumParts; ++i) - getCopyToParts(DAG, dl, Ops[i], &Parts[i], 1, PartVT); + getCopyToParts(DAG, DL, Ops[i], &Parts[i], 1, PartVT); } else if (NumParts > 0) { // If the intermediate type was expanded, split each the value into // legal parts. @@ -417,10 +481,13 @@ static void getCopyToParts(SelectionDAG &DAG, DebugLoc dl, "Must expand into a divisible number of parts!"); unsigned Factor = NumParts / NumIntermediates; for (unsigned i = 0; i != NumIntermediates; ++i) - getCopyToParts(DAG, dl, Ops[i], &Parts[i*Factor], Factor, PartVT); + getCopyToParts(DAG, DL, Ops[i], &Parts[i*Factor], Factor, PartVT); } } + + + namespace { /// RegsForValue - This struct represents the registers (physical or virtual) /// that a particular set of values is assigned, and the type information @@ -460,11 +527,6 @@ namespace { EVT regvt, EVT valuevt) : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {} - RegsForValue(const SmallVector<unsigned, 4> ®s, - const SmallVector<EVT, 4> ®vts, - const SmallVector<EVT, 4> &valuevts) - : ValueVTs(valuevts), RegVTs(regvts), Regs(regs) {} - RegsForValue(LLVMContext &Context, const TargetLowering &tli, unsigned Reg, const Type *Ty) { ComputeValueVTs(tli, Ty, ValueVTs); @@ -530,6 +592,10 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, DebugLoc dl, SDValue &Chain, SDValue *Flag) const { + // A Value with type {} or [0 x %t] needs no registers. + if (ValueVTs.empty()) + return SDValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Assemble the legal parts into the final values. 
@@ -623,8 +689,7 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl, unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), ValueVT); EVT RegisterVT = RegVTs[Value]; - getCopyToParts(DAG, dl, - Val.getValue(Val.getResNo() + Value), + getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value), &Parts[Part], NumParts, RegisterVT); Part += NumParts; } @@ -701,6 +766,7 @@ void SelectionDAGBuilder::clear() { UnusedArgNodeMap.clear(); PendingLoads.clear(); PendingExports.clear(); + DanglingDebugInfoMap.clear(); CurDebugLoc = DebugLoc(); HasTailCall = false; } @@ -805,6 +871,33 @@ void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) { } } +// resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V, +// generate the debug data structures now that we've seen its definition. +void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V, + SDValue Val) { + DanglingDebugInfo &DDI = DanglingDebugInfoMap[V]; + if (DDI.getDI()) { + const DbgValueInst *DI = DDI.getDI(); + DebugLoc dl = DDI.getdl(); + unsigned DbgSDNodeOrder = DDI.getSDNodeOrder(); + MDNode *Variable = DI->getVariable(); + uint64_t Offset = DI->getOffset(); + SDDbgValue *SDV; + if (Val.getNode()) { + if (!EmitFuncArgumentDbgValue(V, Variable, Offset, Val)) { + SDV = DAG.getDbgValue(Variable, Val.getNode(), + Val.getResNo(), Offset, dl, DbgSDNodeOrder); + DAG.AddDbgValue(SDV, Val.getNode(), false); + } + } else { + SDV = DAG.getDbgValue(Variable, UndefValue::get(V->getType()), + Offset, dl, SDNodeOrder); + DAG.AddDbgValue(SDV, 0, false); + } + DanglingDebugInfoMap[V] = DanglingDebugInfo(); + } +} + // getValue - Return an SDValue for the given Value. SDValue SelectionDAGBuilder::getValue(const Value *V) { // If we already have an SDValue for this value, use it. It's important @@ -826,6 +919,7 @@ SDValue SelectionDAGBuilder::getValue(const Value *V) { // Otherwise create a new SDValue and remember it. SDValue Val = getValueImpl(V); NodeMap[V] = Val; + resolveDanglingDebugInfo(V, Val); return Val; } @@ -839,10 +933,11 @@ SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) { // Otherwise create a new SDValue and remember it. SDValue Val = getValueImpl(V); NodeMap[V] = Val; + resolveDanglingDebugInfo(V, Val); return Val; } -/// getValueImpl - Helper function for getValue and getMaterializedValue. +/// getValueImpl - Helper function for getValue and getNonRegisterValue. /// Create an SDValue for the given value. 
SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (const Constant *C = dyn_cast<Constant>(V)) { @@ -986,10 +1081,10 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { unsigned NumValues = ValueVTs.size(); SmallVector<SDValue, 4> Chains(NumValues); - EVT PtrVT = PtrValueVTs[0]; for (unsigned i = 0; i != NumValues; ++i) { - SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT, RetPtr, - DAG.getConstant(Offsets[i], PtrVT)); + SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), + RetPtr.getValueType(), RetPtr, + DAG.getIntPtrConstant(Offsets[i])); Chains[i] = DAG.getStore(Chain, getCurDebugLoc(), SDValue(RetOp.getNode(), RetOp.getResNo() + i), @@ -2709,11 +2804,6 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { } Ty = StTy->getElementType(Field); - } else if (const UnionType *UnTy = dyn_cast<UnionType>(Ty)) { - unsigned Field = cast<ConstantInt>(Idx)->getZExtValue(); - - // Offset canonically 0 for unions, but type changes - Ty = UnTy->getElementType(Field); } else { Ty = cast<SequentialType>(Ty)->getElementType(); @@ -2818,7 +2908,7 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { // Inform the Frame Information that we have just allocated a variable-sized // object. - FuncInfo.MF->getFrameInfo()->CreateVariableSizedObject(); + FuncInfo.MF->getFrameInfo()->CreateVariableSizedObject(Align ? Align : 1); } void SelectionDAGBuilder::visitLoad(const LoadInst &I) { @@ -3824,11 +3914,11 @@ static SDValue ExpandPowI(DebugLoc DL, SDValue LHS, SDValue RHS, /// argument, create the corresponding DBG_VALUE machine instruction for it now. /// At the end of instruction selection, they will be inserted to the entry BB. bool -SelectionDAGBuilder::EmitFuncArgumentDbgValue(const DbgValueInst &DI, - const Value *V, MDNode *Variable, - uint64_t Offset, +SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, + int64_t Offset, const SDValue &N) { - if (!isa<Argument>(V)) + const Argument *Arg = dyn_cast<Argument>(V); + if (!Arg) return false; MachineFunction &MF = DAG.getMachineFunction(); @@ -3842,7 +3932,15 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const DbgValueInst &DI, return false; unsigned Reg = 0; - if (N.getOpcode() == ISD::CopyFromReg) { + if (Arg->hasByValAttr()) { + // Byval arguments' frame index is recorded during argument lowering. + // Use this info directly. + const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo(); + Reg = TRI->getFrameRegister(MF); + Offset = FuncInfo.getByValArgumentFrameIndex(Arg); + } + + if (N.getNode() && N.getOpcode() == ISD::CopyFromReg) { Reg = cast<RegisterSDNode>(N.getOperand(1))->getReg(); if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) { MachineRegisterInfo &RegInfo = MF.getRegInfo(); @@ -3966,42 +4064,40 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::dbg_declare: { const DbgDeclareInst &DI = cast<DbgDeclareInst>(I); - if (!DIVariable(DI.getVariable()).Verify()) - return 0; - MDNode *Variable = DI.getVariable(); - // Parameters are handled specially. - bool isParameter = - DIVariable(Variable).getTag() == dwarf::DW_TAG_arg_variable; const Value *Address = DI.getAddress(); - if (!Address) + if (!Address || !DIVariable(DI.getVariable()).Verify()) return 0; - if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address)) - Address = BCI->getOperand(0); - const AllocaInst *AI = dyn_cast<AllocaInst>(Address); - if (AI) { - // Don't handle byval arguments or VLAs, for example. 
-      // Non-byval arguments are handled here (they refer to the stack temporary
-      // alloca at this point).
-      DenseMap<const AllocaInst*, int>::iterator SI =
-        FuncInfo.StaticAllocaMap.find(AI);
-      if (SI == FuncInfo.StaticAllocaMap.end())
-        return 0; // VLAs.
-      int FI = SI->second;
-
-      MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
-      if (!DI.getDebugLoc().isUnknown() && MMI.hasDebugInfo())
-        MMI.setVariableDbgInfo(Variable, FI, DI.getDebugLoc());
-    }
 
     // Build an entry in DbgOrdering.  Debug info input nodes get an SDNodeOrder
     // but do not always have a corresponding SDNode built.  The SDNodeOrder
     // absolute, but not relative, values are different depending on whether
    // debug info exists.
     ++SDNodeOrder;
+
+    // Check if address has undef value.
+    if (isa<UndefValue>(Address) ||
+        (Address->use_empty() && !isa<Argument>(Address))) {
+      SDDbgValue *SDV =
+        DAG.getDbgValue(Variable, UndefValue::get(Address->getType()),
+                        0, dl, SDNodeOrder);
+      DAG.AddDbgValue(SDV, 0, false);
+      return 0;
+    }
+
     SDValue &N = NodeMap[Address];
+    if (!N.getNode() && isa<Argument>(Address))
+      // Check unused arguments map.
+      N = UnusedArgNodeMap[Address];
     SDDbgValue *SDV;
     if (N.getNode()) {
+      // Parameters are handled specially.
+      bool isParameter =
+        DIVariable(Variable).getTag() == dwarf::DW_TAG_arg_variable;
+      if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
+        Address = BCI->getOperand(0);
+      const AllocaInst *AI = dyn_cast<AllocaInst>(Address);
+
       if (isParameter && !AI) {
         FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N.getNode());
         if (FINode)
@@ -4020,10 +4116,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
         return 0;
       DAG.AddDbgValue(SDV, N.getNode(), isParameter);
     } else {
-      // This isn't useful, but it shows what we're missing.
-      SDV = DAG.getDbgValue(Variable, UndefValue::get(Address->getType()),
-                            0, dl, SDNodeOrder);
-      DAG.AddDbgValue(SDV, 0, isParameter);
+      // If Address is an argument then try to emit its dbg value using
+      // virtual register info from the FuncInfo.ValueMap. Otherwise add undef
+      // to help track missing debug info.
+      if (!EmitFuncArgumentDbgValue(Address, Variable, 0, N)) {
+        SDV = DAG.getDbgValue(Variable, UndefValue::get(Address->getType()),
+                              0, dl, SDNodeOrder);
+        DAG.AddDbgValue(SDV, 0, false);
+      }
     }
     return 0;
   }
@@ -4048,31 +4148,24 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
       SDV = DAG.getDbgValue(Variable, V, Offset, dl, SDNodeOrder);
       DAG.AddDbgValue(SDV, 0, false);
     } else {
-      bool createUndef = false;
-      // FIXME : Why not use getValue() directly ?
+      // Do not use getValue() in here; we don't want to generate code at
+      // this point if it hasn't been done yet.
      SDValue N = NodeMap[V];
       if (!N.getNode() && isa<Argument>(V))
         // Check unused arguments map.
N = UnusedArgNodeMap[V]; if (N.getNode()) { - if (!EmitFuncArgumentDbgValue(DI, V, Variable, Offset, N)) { + if (!EmitFuncArgumentDbgValue(V, Variable, Offset, N)) { SDV = DAG.getDbgValue(Variable, N.getNode(), N.getResNo(), Offset, dl, SDNodeOrder); DAG.AddDbgValue(SDV, N.getNode(), false); } - } else if (isa<PHINode>(V) && !V->use_empty()) { - SDValue N = getValue(V); - if (N.getNode()) { - if (!EmitFuncArgumentDbgValue(DI, V, Variable, Offset, N)) { - SDV = DAG.getDbgValue(Variable, N.getNode(), - N.getResNo(), Offset, dl, SDNodeOrder); - DAG.AddDbgValue(SDV, N.getNode(), false); - } - } else - createUndef = true; - } else - createUndef = true; - if (createUndef) { + } else if (isa<PHINode>(V) && !V->use_empty() ) { + // Do not call getValue(V) yet, as we don't want to generate code. + // Remember it for later. + DanglingDebugInfo DDI(&DI, dl, SDNodeOrder); + DanglingDebugInfoMap[V] = DDI; + } else { // We may expand this to cover more cases. One case where we have no // data available is an unreferenced parameter; we need this fallback. SDV = DAG.getDbgValue(Variable, UndefValue::get(V->getType()), @@ -4572,6 +4665,11 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, !isInTailCallPosition(CS, CS.getAttributes().getRetAttributes(), TLI)) isTailCall = false; + // If there's a possibility that fast-isel has already selected some amount + // of the current basic block, don't emit a tail call. + if (isTailCall && EnableFastISel) + isTailCall = false; + std::pair<SDValue,SDValue> Result = TLI.LowerCallTo(getRoot(), RetTy, CS.paramHasAttr(0, Attribute::SExt), @@ -6054,6 +6152,12 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { i += NumParts; } + // Note down frame index for byval arguments. + if (I->hasByValAttr() && !ArgValues.empty()) + if (FrameIndexSDNode *FI = + dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) + FuncInfo->setByValArgumentFrameIndex(I, FI->getIndex()); + if (!I->use_empty()) { SDValue Res; if (!ArgValues.empty()) diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 46733d6db1241..5f400e9c83acb 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -18,9 +18,6 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" -#ifndef NDEBUG -#include "llvm/ADT/SmallSet.h" -#endif #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/Support/CallSite.h" @@ -64,6 +61,7 @@ class PHINode; class PtrToIntInst; class ReturnInst; class SDISelAsmOperandInfo; +class SDDbgValue; class SExtInst; class SelectInst; class ShuffleVectorInst; @@ -93,6 +91,24 @@ class SelectionDAGBuilder { /// to preserve debug information for incoming arguments. DenseMap<const Value*, SDValue> UnusedArgNodeMap; + /// DanglingDebugInfo - Helper type for DanglingDebugInfoMap. + class DanglingDebugInfo { + const DbgValueInst* DI; + DebugLoc dl; + unsigned SDNodeOrder; + public: + DanglingDebugInfo() : DI(0), dl(DebugLoc()), SDNodeOrder(0) { } + DanglingDebugInfo(const DbgValueInst *di, DebugLoc DL, unsigned SDNO) : + DI(di), dl(DL), SDNodeOrder(SDNO) { } + const DbgValueInst* getDI() { return DI; } + DebugLoc getdl() { return dl; } + unsigned getSDNodeOrder() { return SDNodeOrder; } + }; + + /// DanglingDebugInfoMap - Keeps track of dbg_values for which we have not + /// yet seen the referent. We defer handling these until we do see it. 
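+  /// A typical case is a dbg_value of a PHI whose SDValue has not been
+  /// created yet; resolveDanglingDebugInfo() emits the deferred SDDbgValue
+  /// and clears the entry once getValue() sees the definition.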
+  DenseMap<const Value*, DanglingDebugInfo> DanglingDebugInfoMap;
+
public:
  /// PendingLoads - Loads are not emitted to the program immediately. We bunch
  /// them up and then emit token factor nodes when possible. This allows us to
@@ -345,6 +361,9 @@ public:
  void visit(unsigned Opcode, const User &I);
+  // resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V,
+  // generate the debug data structures now that we've seen its definition.
+  void resolveDanglingDebugInfo(const Value *V, SDValue Val);
  SDValue getValue(const Value *V);
  SDValue getNonRegisterValue(const Value *V);
  SDValue getValueImpl(const Value *V);
@@ -506,13 +525,11 @@ private:
  void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
-  /// EmitFuncArgumentDbgValue - If the DbgValueInst is a dbg_value of a
-  /// function argument, create the corresponding DBG_VALUE machine instruction
-  /// for it now. At the end of instruction selection, they will be inserted to
-  /// the entry BB.
-  bool EmitFuncArgumentDbgValue(const DbgValueInst &DI,
-                                const Value *V, MDNode *Variable,
-                                uint64_t Offset, const SDValue &N);
+  /// EmitFuncArgumentDbgValue - If V is a function argument then create the
+  /// corresponding DBG_VALUE machine instruction for it now. At the end of
+  /// instruction selection, they will be inserted to the entry BB.
+  bool EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
+                                int64_t Offset, const SDValue &N);
};
} // end namespace llvm
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 08ba5482f7d22..66cb5ceb09e53 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -132,14 +132,16 @@ namespace llvm {
    const TargetLowering &TLI = IS->getTargetLowering();
    if (OptLevel == CodeGenOpt::None)
-      return createFastDAGScheduler(IS, OptLevel);
+      return createSourceListDAGScheduler(IS, OptLevel);
    if (TLI.getSchedulingPreference() == Sched::Latency)
      return createTDListDAGScheduler(IS, OptLevel);
    if (TLI.getSchedulingPreference() == Sched::RegPressure)
      return createBURRListDAGScheduler(IS, OptLevel);
-    assert(TLI.getSchedulingPreference() == Sched::Hybrid &&
+    if (TLI.getSchedulingPreference() == Sched::Hybrid)
+      return createHybridListDAGScheduler(IS, OptLevel);
+    assert(TLI.getSchedulingPreference() == Sched::ILP &&
           "Unknown sched type!");
-    return createHybridListDAGScheduler(IS, OptLevel);
+    return createILPListDAGScheduler(IS, OptLevel);
  }
}
@@ -169,7 +171,7 @@ TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
//===----------------------------------------------------------------------===//
SelectionDAGISel::SelectionDAGISel(const TargetMachine &tm, CodeGenOpt::Level OL) :
-  MachineFunctionPass(&ID), TM(tm), TLI(*tm.getTargetLowering()),
+  MachineFunctionPass(ID), TM(tm), TLI(*tm.getTargetLowering()),
  FuncInfo(new FunctionLoweringInfo(TLI)),
  CurDAG(new SelectionDAG(tm)),
  SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)),
@@ -216,7 +218,7 @@ static bool FunctionCallsSetJmp(const Function *F) {
      for (Value::const_use_iterator
             I = Callee->use_begin(), E = Callee->use_end(); I != E; ++I)
-        if (const CallInst *CI = dyn_cast<CallInst>(I))
+        if (const CallInst *CI = dyn_cast<CallInst>(*I))
          if (CI->getParent()->getParent() == F)
            return true;
    }
@@ -362,38 +364,6 @@ SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
  CodeGenAndEmitDAG();
}
-namespace {
-/// WorkListRemover - This class is a DAGUpdateListener that removes any deleted
-/// nodes from the
worklist. -class SDOPsWorkListRemover : public SelectionDAG::DAGUpdateListener { - SmallVector<SDNode*, 128> &Worklist; - SmallPtrSet<SDNode*, 128> &InWorklist; -public: - SDOPsWorkListRemover(SmallVector<SDNode*, 128> &wl, - SmallPtrSet<SDNode*, 128> &inwl) - : Worklist(wl), InWorklist(inwl) {} - - void RemoveFromWorklist(SDNode *N) { - if (!InWorklist.erase(N)) return; - - SmallVector<SDNode*, 128>::iterator I = - std::find(Worklist.begin(), Worklist.end(), N); - assert(I != Worklist.end() && "Not in worklist"); - - *I = Worklist.back(); - Worklist.pop_back(); - } - - virtual void NodeDeleted(SDNode *N, SDNode *E) { - RemoveFromWorklist(N); - } - - virtual void NodeUpdated(SDNode *N) { - // Ignore updates. - } -}; -} - void SelectionDAGISel::ComputeLiveOutVRegInfo() { SmallPtrSet<SDNode*, 128> VisitedNodes; SmallVector<SDNode*, 128> Worklist; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp index 6cae804422ce6..8313de5e32bba 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp @@ -199,7 +199,7 @@ const std::string SelectionDAG::getGraphAttrs(const SDNode *N) const { #else errs() << "SelectionDAG::getGraphAttrs is only available in debug builds" << " on systems with Graphviz or gv!\n"; - return std::string(""); + return std::string(); #endif } diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 4f3866956cac5..b74f600cfa2db 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -651,6 +651,53 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, return NumVectorRegs; } +/// isLegalRC - Return true if the value types that can be represented by the +/// specified register class are all legal. +bool TargetLowering::isLegalRC(const TargetRegisterClass *RC) const { + for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end(); + I != E; ++I) { + if (isTypeLegal(*I)) + return true; + } + return false; +} + +/// hasLegalSuperRegRegClasses - Return true if the specified register class +/// has one or more super-reg register classes that are legal. +bool +TargetLowering::hasLegalSuperRegRegClasses(const TargetRegisterClass *RC) const{ + if (*RC->superregclasses_begin() == 0) + return false; + for (TargetRegisterInfo::regclass_iterator I = RC->superregclasses_begin(), + E = RC->superregclasses_end(); I != E; ++I) { + const TargetRegisterClass *RRC = *I; + if (isLegalRC(RRC)) + return true; + } + return false; +} + +/// findRepresentativeClass - Return the largest legal super-reg register class +/// of the register class for the specified type and its associated "cost". +std::pair<const TargetRegisterClass*, uint8_t> +TargetLowering::findRepresentativeClass(EVT VT) const { + const TargetRegisterClass *RC = RegClassForVT[VT.getSimpleVT().SimpleTy]; + if (!RC) + return std::make_pair(RC, 0); + const TargetRegisterClass *BestRC = RC; + for (TargetRegisterInfo::regclass_iterator I = RC->superregclasses_begin(), + E = RC->superregclasses_end(); I != E; ++I) { + const TargetRegisterClass *RRC = *I; + if (RRC->isASubClass() || !isLegalRC(RRC)) + continue; + if (!hasLegalSuperRegRegClasses(RRC)) + return std::make_pair(RRC, 1); + BestRC = RRC; + } + return std::make_pair(BestRC, 1); +} + + /// computeRegisterProperties - Once all of the register classes are added, /// this allows us to compute derived properties we expose. 
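The isLegalRC / hasLegalSuperRegRegClasses / findRepresentativeClass trio above walks a register class's super-class chain and settles on the largest legal class. Roughly, the search looks like this standalone sketch (illustrative types, not the TargetRegisterInfo API; sub-class filtering and the cost byte from the real code are omitted):

#include <vector>

struct RegClass {
  bool legal;                                 // models isLegalRC()
  std::vector<const RegClass*> superClasses;  // ordered smaller -> larger
};

// Keep climbing while a legal super-class exists; the last legal class on
// the chain is the representative for this group of value types.
const RegClass *representative(const RegClass *RC) {
  const RegClass *best = RC;
  for (size_t i = 0; i != RC->superClasses.size(); ++i)
    if (RC->superClasses[i]->legal)
      best = RC->superClasses[i];
  return best;
}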
void TargetLowering::computeRegisterProperties() {
@@ -736,6 +783,28 @@ void TargetLowering::computeRegisterProperties() {
    MVT VT = (MVT::SimpleValueType)i;
    if (isTypeLegal(VT)) continue;
+    // Determine if there is a legal wider type. If so, we should promote to
+    // that wider vector type.
+    EVT EltVT = VT.getVectorElementType();
+    unsigned NElts = VT.getVectorNumElements();
+    if (NElts != 1) {
+      bool IsLegalWiderType = false;
+      for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+        EVT SVT = (MVT::SimpleValueType)nVT;
+        if (SVT.getVectorElementType() == EltVT &&
+            SVT.getVectorNumElements() > NElts &&
+            isTypeSynthesizable(SVT)) {
+          TransformToType[i] = SVT;
+          RegisterTypeForVT[i] = SVT;
+          NumRegistersForVT[i] = 1;
+          ValueTypeActions.setTypeAction(VT, Promote);
+          IsLegalWiderType = true;
+          break;
+        }
+      }
+      if (IsLegalWiderType) continue;
+    }
+
    MVT IntermediateVT;
    EVT RegisterVT;
    unsigned NumIntermediates;
@@ -744,32 +813,29 @@ void TargetLowering::computeRegisterProperties() {
                                  RegisterVT, this);
    RegisterTypeForVT[i] = RegisterVT;
-    // Determine if there is a legal wider type.
-    bool IsLegalWiderType = false;
-    EVT EltVT = VT.getVectorElementType();
-    unsigned NElts = VT.getVectorNumElements();
-    for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
-      EVT SVT = (MVT::SimpleValueType)nVT;
-      if (isTypeSynthesizable(SVT) && SVT.getVectorElementType() == EltVT &&
-          SVT.getVectorNumElements() > NElts && NElts != 1) {
-        TransformToType[i] = SVT;
-        ValueTypeActions.setTypeAction(VT, Promote);
-        IsLegalWiderType = true;
-        break;
-      }
-    }
-    if (!IsLegalWiderType) {
-      EVT NVT = VT.getPow2VectorType();
-      if (NVT == VT) {
-        // Type is already a power of 2. The default action is to split.
-        TransformToType[i] = MVT::Other;
-        ValueTypeActions.setTypeAction(VT, Expand);
-      } else {
-        TransformToType[i] = NVT;
-        ValueTypeActions.setTypeAction(VT, Promote);
-      }
+    EVT NVT = VT.getPow2VectorType();
+    if (NVT == VT) {
+      // Type is already a power of 2. The default action is to split.
+      TransformToType[i] = MVT::Other;
+      ValueTypeActions.setTypeAction(VT, Expand);
+    } else {
+      TransformToType[i] = NVT;
+      ValueTypeActions.setTypeAction(VT, Promote);
    }
  }
+
+  // Determine the 'representative' register class for each value type.
+  // A representative register class is the largest (meaning one which is
+  // not a sub-register class) legal register class for a group of value
+  // types. For example, on i386 the representative class for i8, i16, and
+  // i32 would be GR32; on x86_64 it is GR64.
+  for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
+    const TargetRegisterClass* RRC;
+    uint8_t Cost;
+    tie(RRC, Cost) = findRepresentativeClass((MVT::SimpleValueType)i);
+    RepRegClassForVT[i] = RRC;
+    RepRegClassCostForVT[i] = Cost;
+  }
}

const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -798,8 +864,21 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
                                                EVT &IntermediateVT,
                                                unsigned &NumIntermediates,
                                                EVT &RegisterVT) const {
-  // Figure out the right, legal destination reg to copy into.
  unsigned NumElts = VT.getVectorNumElements();
+
+  // If there is a wider vector type with the same element type as this one,
+  // we should widen to that legal vector type. This handles things like
+  // <2 x float> -> <4 x float>.
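The promotion scan that this comment describes can be read in isolation: among strictly wider vector types with the same element type, take the first legal one; if none exists, fall back to power-of-2 widening or splitting. A standalone sketch of the scan (simplified types, not MVT/EVT):

#include <vector>

struct VecTy { unsigned eltBits; unsigned numElts; bool legal; };

// Returns a pointer to the first legal strictly-wider type with the same
// element type, or 0 if the caller must split/widen by other means.
// E.g. <2 x float> would map to <4 x float> when the latter is legal.
const VecTy *widerLegalType(const VecTy &VT,
                            const std::vector<VecTy> &candidates) {
  for (size_t i = 0; i != candidates.size(); ++i) {
    const VecTy &SVT = candidates[i];
    if (SVT.eltBits == VT.eltBits && SVT.numElts > VT.numElts && SVT.legal)
      return &candidates[i];
  }
  return 0;
}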
+ if (NumElts != 1 && getTypeAction(VT) == Promote) { + RegisterVT = getTypeToTransformTo(Context, VT); + if (isTypeLegal(RegisterVT)) { + IntermediateVT = RegisterVT; + NumIntermediates = 1; + return 1; + } + } + + // Figure out the right, legal destination reg to copy into. EVT EltTy = VT.getVectorElementType(); unsigned NumVectorRegs = 1; @@ -828,16 +907,12 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT, EVT DestVT = getRegisterType(Context, NewVT); RegisterVT = DestVT; - if (DestVT.bitsLT(NewVT)) { - // Value is expanded, e.g. i64 -> i16. + if (DestVT.bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16. return NumVectorRegs*(NewVT.getSizeInBits()/DestVT.getSizeInBits()); - } else { - // Otherwise, promotion or legal types use the same number of registers as - // the vector decimated to the appropriate level. - return NumVectorRegs; - } - return 1; + // Otherwise, promotion or legal types use the same number of registers as + // the vector decimated to the appropriate level. + return NumVectorRegs; } /// Get the EVTs and ArgFlags collections that represent the legalized return @@ -1308,9 +1383,32 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } } - if (SimplifyDemandedBits(Op.getOperand(0), NewMask.lshr(ShAmt), + if (SimplifyDemandedBits(InOp, NewMask.lshr(ShAmt), KnownZero, KnownOne, TLO, Depth+1)) return true; + + // Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits + // are not demanded. This will likely allow the anyext to be folded away. + if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) { + SDValue InnerOp = InOp.getNode()->getOperand(0); + EVT InnerVT = InnerOp.getValueType(); + if ((APInt::getHighBitsSet(BitWidth, + BitWidth - InnerVT.getSizeInBits()) & + DemandedMask) == 0 && + isTypeDesirableForOp(ISD::SHL, InnerVT)) { + EVT ShTy = getShiftAmountTy(); + if (!APInt(BitWidth, ShAmt).isIntN(ShTy.getSizeInBits())) + ShTy = InnerVT; + SDValue NarrowShl = + TLO.DAG.getNode(ISD::SHL, dl, InnerVT, InnerOp, + TLO.DAG.getConstant(ShAmt, ShTy)); + return + TLO.CombineTo(Op, + TLO.DAG.getNode(ISD::ANY_EXTEND, dl, Op.getValueType(), + NarrowShl)); + } + } + KnownZero <<= SA->getZExtValue(); KnownOne <<= SA->getZExtValue(); // low bits known zero. @@ -1415,11 +1513,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // present in the input. APInt NewBits = APInt::getHighBitsSet(BitWidth, - BitWidth - EVT.getScalarType().getSizeInBits()) & - NewMask; + BitWidth - EVT.getScalarType().getSizeInBits()); // If none of the extended bits are demanded, eliminate the sextinreg. - if (NewBits == 0) + if ((NewBits & NewMask) == 0) return TLO.CombineTo(Op, Op.getOperand(0)); APInt InSignBit = APInt::getSignBit(EVT.getScalarType().getSizeInBits()); @@ -1886,12 +1983,9 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, EVT ExtDstTy = N0.getValueType(); unsigned ExtDstTyBits = ExtDstTy.getSizeInBits(); - // If the extended part has any inconsistent bits, it cannot ever - // compare equal. In other words, they have to be all ones or all - // zeros. - APInt ExtBits = - APInt::getHighBitsSet(ExtDstTyBits, ExtDstTyBits - ExtSrcTyBits); - if ((C1 & ExtBits) != 0 && (C1 & ExtBits) != ExtBits) + // If the constant doesn't fit into the number of bits for the source of + // the sign extension, it is impossible for both sides to be equal. 
+ if (C1.getMinSignedBits() > ExtSrcTyBits) return DAG.getConstant(Cond == ISD::SETNE, VT); SDValue ZextOp; @@ -2476,7 +2570,7 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op, int64_t Offs = GA->getOffset(); if (C) Offs += C->getZExtValue(); Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), - C->getDebugLoc(), + C ? C->getDebugLoc() : DebugLoc(), Op.getValueType(), Offs)); return; } diff --git a/lib/CodeGen/SimpleRegisterCoalescing.cpp b/lib/CodeGen/SimpleRegisterCoalescing.cpp index e69d3e4fa78aa..b29ea19835bc9 100644 --- a/lib/CodeGen/SimpleRegisterCoalescing.cpp +++ b/lib/CodeGen/SimpleRegisterCoalescing.cpp @@ -59,13 +59,16 @@ DisableCrossClassJoin("disable-cross-class-join", cl::desc("Avoid coalescing cross register class copies"), cl::init(false), cl::Hidden); -static RegisterPass<SimpleRegisterCoalescing> -X("simple-register-coalescing", "Simple Register Coalescing"); +static cl::opt<bool> +DisablePhysicalJoin("disable-physical-join", + cl::desc("Avoid coalescing physical register copies"), + cl::init(false), cl::Hidden); -// Declare that we implement the RegisterCoalescer interface -static RegisterAnalysisGroup<RegisterCoalescer, true/*The Default*/> V(X); +INITIALIZE_AG_PASS(SimpleRegisterCoalescing, RegisterCoalescer, + "simple-register-coalescing", "Simple Register Coalescing", + false, false, true); -const PassInfo *const llvm::SimpleRegisterCoalescingID = &X; +char &llvm::SimpleRegisterCoalescingID = SimpleRegisterCoalescing::ID; void SimpleRegisterCoalescing::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -386,16 +389,12 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP, if (HasOtherReachingDefs(IntA, IntB, AValNo, BValNo)) return false; - bool BHasSubRegs = false; - if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) - BHasSubRegs = *tri_->getSubRegisters(IntB.reg); - - // Abort if the subregisters of IntB.reg have values that are not simply the + // Abort if the aliases of IntB.reg have values that are not simply the // clobbers from the superreg. - if (BHasSubRegs) - for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) - if (li_->hasInterval(*SR) && - HasOtherReachingDefs(IntA, li_->getInterval(*SR), AValNo, 0)) + if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) + for (const unsigned *AS = tri_->getAliasSet(IntB.reg); *AS; ++AS) + if (li_->hasInterval(*AS) && + HasOtherReachingDefs(IntA, li_->getInterval(*AS), AValNo, 0)) return false; // If some of the uses of IntA.reg is already coalesced away, return false. @@ -412,6 +411,8 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP, return false; } + DEBUG(dbgs() << "\tRemoveCopyByCommutingDef: " << *DefMI); + // At this point we have decided that it is legal to do this // transformation. Start by commuting the instruction. MachineBasicBlock *MBB = DefMI->getParent(); @@ -470,16 +471,12 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP, if (Extended) UseMO.setIsKill(false); } - unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (UseMI->isCopy()) { - if (UseMI->getOperand(0).getReg() != IntB.reg || - UseMI->getOperand(0).getSubReg()) - continue; - } else if (tii_->isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)){ - if (DstReg != IntB.reg || DstSubIdx) - continue; - } else + if (!UseMI->isCopy()) continue; + if (UseMI->getOperand(0).getReg() != IntB.reg || + UseMI->getOperand(0).getSubReg()) + continue; + // This copy will become a noop. 
If it's defining a new val#, // remove that val# as well. However this live range is being // extended to the end of the existing live range defined by the copy. @@ -504,13 +501,13 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP, // Remove val#'s defined by copies that will be coalesced away. for (unsigned i = 0, e = BDeadValNos.size(); i != e; ++i) { VNInfo *DeadVNI = BDeadValNos[i]; - if (BHasSubRegs) { - for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) { - if (!li_->hasInterval(*SR)) + if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) { + for (const unsigned *AS = tri_->getAliasSet(IntB.reg); *AS; ++AS) { + if (!li_->hasInterval(*AS)) continue; - LiveInterval &SRLI = li_->getInterval(*SR); - if (const LiveRange *SRLR = SRLI.getLiveRangeContaining(DeadVNI->def)) - SRLI.removeValNo(SRLR->valno); + LiveInterval &ASLI = li_->getInterval(*AS); + if (const LiveRange *ASLR = ASLI.getLiveRangeContaining(DeadVNI->def)) + ASLI.removeValNo(ASLR->valno); } } IntB.removeValNo(BDeadValNos[i]); @@ -628,14 +625,6 @@ SimpleRegisterCoalescing::TrimLiveIntervalToLastUse(SlotIndex CopyIdx, if (DefMO.getReg() == li.reg && !DefMO.getSubReg()) DefMO.setIsDead(); } - unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (tii_->isMoveInstr(*LastUseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) && - DstReg == li.reg && DstSubIdx == 0) { - // Last use is itself an identity code. - int DeadIdx = LastUseMI->findRegisterDefOperandIdx(li.reg, - false, false, tri_); - LastUseMI->getOperand(DeadIdx).setIsDead(); - } return true; } @@ -772,16 +761,6 @@ SimpleRegisterCoalescing::UpdateRegDefsUses(const CoalescerPair &CP) { // A PhysReg copy that won't be coalesced can perhaps be rematerialized // instead. if (DstIsPhys) { - unsigned CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx; - if (tii_->isMoveInstr(*UseMI, CopySrcReg, CopyDstReg, - CopySrcSubIdx, CopyDstSubIdx) && - CopySrcSubIdx == 0 && CopyDstSubIdx == 0 && - CopySrcReg != CopyDstReg && CopySrcReg == SrcReg && - CopyDstReg != DstReg && !JoinedCopies.count(UseMI) && - ReMaterializeTrivialDef(li_->getInterval(SrcReg), CopyDstReg, 0, - UseMI)) - continue; - if (UseMI->isCopy() && !UseMI->getOperand(1).getSubReg() && !UseMI->getOperand(0).getSubReg() && @@ -834,28 +813,6 @@ SimpleRegisterCoalescing::UpdateRegDefsUses(const CoalescerPair &CP) { dbgs() << li_->getInstructionIndex(UseMI) << "\t"; dbgs() << *UseMI; }); - - - // After updating the operand, check if the machine instruction has - // become a copy. If so, update its val# information. - const TargetInstrDesc &TID = UseMI->getDesc(); - if (DstIsPhys || TID.getNumDefs() != 1 || TID.getNumOperands() <= 2) - continue; - - unsigned CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx; - if (tii_->isMoveInstr(*UseMI, CopySrcReg, CopyDstReg, - CopySrcSubIdx, CopyDstSubIdx) && - CopySrcReg != CopyDstReg && - (TargetRegisterInfo::isVirtualRegister(CopyDstReg) || - allocatableRegs_[CopyDstReg])) { - LiveInterval &LI = li_->getInterval(CopyDstReg); - SlotIndex DefIdx = - li_->getInstructionIndex(UseMI).getDefIndex(); - if (const LiveRange *DLR = LI.getLiveRangeContaining(DefIdx)) { - if (DLR->valno->def == DefIdx) - DLR->valno->setCopy(UseMI); - } - } } } @@ -1082,13 +1039,18 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { return false; // Not coalescable. 
} + if (DisablePhysicalJoin && CP.isPhys()) { + DEBUG(dbgs() << "\tPhysical joins disabled.\n"); + return false; + } + DEBUG(dbgs() << "\tConsidering merging %reg" << CP.getSrcReg()); // Enforce policies. if (CP.isPhys()) { DEBUG(dbgs() <<" with physreg %" << tri_->getName(CP.getDstReg()) << "\n"); // Only coalesce to allocatable physreg. - if (!allocatableRegs_[CP.getDstReg()]) { + if (!li_->isAllocatable(CP.getDstReg())) { DEBUG(dbgs() << "\tRegister is an unallocatable physreg.\n"); return false; // Not coalescable. } @@ -1137,7 +1099,6 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { // happens. if (li_->hasInterval(CP.getDstReg()) && li_->getInterval(CP.getDstReg()).ranges.size() > 1000) { - mri_->setRegAllocationHint(CP.getSrcReg(), 0, CP.getDstReg()); ++numAborts; DEBUG(dbgs() << "\tPhysical register live interval too complicated, abort!\n"); @@ -1156,7 +1117,6 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { ReMaterializeTrivialDef(JoinVInt, CP.getDstReg(), 0, CopyMI)) return true; - mri_->setRegAllocationHint(CP.getSrcReg(), 0, CP.getDstReg()); ++numAborts; DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n"); Again = true; // May be possible to coalesce later. @@ -1543,21 +1503,19 @@ void SimpleRegisterCoalescing::CopyCoalesceInMBB(MachineBasicBlock *MBB, MachineInstr *Inst = MII++; // If this isn't a copy nor a extract_subreg, we can't join intervals. - unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - bool isInsUndef = false; + unsigned SrcReg, DstReg; if (Inst->isCopy()) { DstReg = Inst->getOperand(0).getReg(); SrcReg = Inst->getOperand(1).getReg(); } else if (Inst->isSubregToReg()) { DstReg = Inst->getOperand(0).getReg(); SrcReg = Inst->getOperand(2).getReg(); - } else if (!tii_->isMoveInstr(*Inst, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) + } else continue; bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg); bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); - if (isInsUndef || - (li_->hasInterval(SrcReg) && li_->getInterval(SrcReg).empty())) + if (li_->hasInterval(SrcReg) && li_->getInterval(SrcReg).empty()) ImpDefCopies.push_back(CopyRec(Inst, 0)); else if (SrcIsPhys || DstIsPhys) PhysCopies.push_back(CopyRec(Inst, 0)); @@ -1679,11 +1637,6 @@ SimpleRegisterCoalescing::lastRegisterUse(SlotIndex Start, MachineInstr *UseMI = Use.getParent(); if (UseMI->isIdentityCopy()) continue; - unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (tii_->isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) && - SrcReg == DstReg && SrcSubIdx == DstSubIdx) - // Ignore identity copies. - continue; SlotIndex Idx = li_->getInstructionIndex(UseMI); // FIXME: Should this be Idx != UseIdx? SlotIndex() will return something // that compares higher than any other interval. @@ -1708,10 +1661,7 @@ SimpleRegisterCoalescing::lastRegisterUse(SlotIndex Start, return NULL; // Ignore identity copies. 
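The deleted lines below are the old isMoveInstr-based identity test; after this patch only MI->isIdentityCopy() remains. The underlying predicate is simple enough to state standalone (illustrative struct, not MachineInstr):

struct CopyInstr { unsigned dstReg, srcReg, dstSub, srcSub; };

// An identity copy moves a register (and sub-index) onto itself; it carries
// no data, so the coalescer skips it rather than joining intervals.
bool isIdentityCopy(const CopyInstr &C) {
  return C.dstReg == C.srcReg && C.dstSub == C.srcSub;
}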
- unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (!MI->isIdentityCopy() && - !(tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) && - SrcReg == DstReg && SrcSubIdx == DstSubIdx)) + if (!MI->isIdentityCopy()) for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) { MachineOperand &Use = MI->getOperand(i); if (Use.isReg() && Use.isUse() && Use.getReg() && @@ -1747,7 +1697,6 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { << "********** Function: " << ((Value*)mf_->getFunction())->getName() << '\n'); - allocatableRegs_ = tri_->getAllocatableSet(fn); for (TargetRegisterInfo::regclass_iterator I = tri_->regclass_begin(), E = tri_->regclass_end(); I != E; ++I) allocatableRCRegs_.insert(std::make_pair(*I, @@ -1775,30 +1724,35 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { for (MachineBasicBlock::iterator mii = mbb->begin(), mie = mbb->end(); mii != mie; ) { MachineInstr *MI = mii; - unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; if (JoinedCopies.count(MI)) { // Delete all coalesced copies. bool DoDelete = true; - if (!tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { - assert(MI->isCopyLike() && "Unrecognized copy instruction"); - SrcReg = MI->getOperand(MI->isSubregToReg() ? 2 : 1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) - // Do not delete extract_subreg, insert_subreg of physical - // registers unless the definition is dead. e.g. - // %DO<def> = INSERT_SUBREG %D0<undef>, %S0<kill>, 1 - // or else the scavenger may complain. LowerSubregs will - // delete them later. - DoDelete = false; - } + assert(MI->isCopyLike() && "Unrecognized copy instruction"); + unsigned SrcReg = MI->getOperand(MI->isSubregToReg() ? 2 : 1).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + MI->getNumOperands() > 2) + // Do not delete extract_subreg, insert_subreg of physical + // registers unless the definition is dead. e.g. + // %DO<def> = INSERT_SUBREG %D0<undef>, %S0<kill>, 1 + // or else the scavenger may complain. LowerSubregs will + // delete them later. + DoDelete = false; + if (MI->allDefsAreDead()) { LiveInterval &li = li_->getInterval(SrcReg); if (!ShortenDeadCopySrcLiveRange(li, MI)) ShortenDeadCopyLiveRange(li, MI); DoDelete = true; } - if (!DoDelete) + if (!DoDelete) { + // We need the instruction to adjust liveness, so make it a KILL. 
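The hunk that follows demotes a copy that must stay (for liveness) into a KILL: for SUBREG_TO_REG it first drops the sub-register index and the immediate so only the destination and source registers remain, then swaps the opcode. The shape of that rewrite, modeled standalone (hypothetical Instr type, not MachineInstr):

#include <string>
#include <vector>

struct Operand { std::string reg; };
struct Instr { std::string opcode; std::vector<Operand> ops; };

// Models the KILL conversion: strip SUBREG_TO_REG down to "dst, src" by
// removing operand 3 (subreg index) then operand 1 (immediate), and retag.
void demoteToKill(Instr &MI, bool isSubregToReg) {
  if (isSubregToReg && MI.ops.size() >= 4) {
    MI.ops.erase(MI.ops.begin() + 3);
    MI.ops.erase(MI.ops.begin() + 1);
  }
  MI.opcode = "KILL";
}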
+ if (MI->isSubregToReg()) { + MI->RemoveOperand(3); + MI->RemoveOperand(1); + } + MI->setDesc(tii_->get(TargetOpcode::KILL)); mii = llvm::next(mii); - else { + } else { li_->RemoveMachineInstrFromMaps(MI); mii = mbbi->erase(mii); ++numPeep; @@ -1840,9 +1794,8 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { } // If the move will be an identity move delete it - bool isMove= tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx); - if (MI->isIdentityCopy() || - (isMove && SrcReg == DstReg && SrcSubIdx == DstSubIdx)) { + if (MI->isIdentityCopy()) { + unsigned SrcReg = MI->getOperand(1).getReg(); if (li_->hasInterval(SrcReg)) { LiveInterval &RegInt = li_->getInterval(SrcReg); // If def of this move instruction is dead, remove its live range diff --git a/lib/CodeGen/SimpleRegisterCoalescing.h b/lib/CodeGen/SimpleRegisterCoalescing.h index e154da60affa2..855bdb98b36c6 100644 --- a/lib/CodeGen/SimpleRegisterCoalescing.h +++ b/lib/CodeGen/SimpleRegisterCoalescing.h @@ -47,7 +47,6 @@ namespace llvm { const MachineLoopInfo* loopInfo; AliasAnalysis *AA; - BitVector allocatableRegs_; DenseMap<const TargetRegisterClass*, BitVector> allocatableRCRegs_; /// JoinedCopies - Keep track of copies eliminated due to coalescing. @@ -64,7 +63,7 @@ namespace llvm { public: static char ID; // Pass identifcation, replacement for typeid - SimpleRegisterCoalescing() : MachineFunctionPass(&ID) {} + SimpleRegisterCoalescing() : MachineFunctionPass(ID) {} struct InstrSlots { enum { diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index e90869d600dd6..b637980f885c3 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -58,7 +58,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit SjLjEHPass(const TargetLowering *tli = NULL) - : FunctionPass(&ID), TLI(tli) { } + : FunctionPass(ID), TLI(tli) { } bool doInitialization(Module &M); bool runOnFunction(Function &F); diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp index 7a227cf02d57d..1bc148f160bc8 100644 --- a/lib/CodeGen/SlotIndexes.cpp +++ b/lib/CodeGen/SlotIndexes.cpp @@ -40,7 +40,8 @@ namespace { } char SlotIndexes::ID = 0; -static RegisterPass<SlotIndexes> X("slotindexes", "Slot index numbering"); +INITIALIZE_PASS(SlotIndexes, "slotindexes", + "Slot index numbering", false, false); IndexListEntry* IndexListEntry::getEmptyKeyEntry() { return &*IndexListEntryEmptyKey; diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp index 56bcb2824ae8a..59d5ab33c994f 100644 --- a/lib/CodeGen/Spiller.cpp +++ b/lib/CodeGen/Spiller.cpp @@ -15,6 +15,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetInstrInfo.h" @@ -49,29 +50,31 @@ namespace { /// Utility class for spillers. class SpillerBase : public Spiller { protected: + MachineFunctionPass *pass; MachineFunction *mf; + VirtRegMap *vrm; LiveIntervals *lis; MachineFrameInfo *mfi; MachineRegisterInfo *mri; const TargetInstrInfo *tii; const TargetRegisterInfo *tri; - VirtRegMap *vrm; /// Construct a spiller base. 
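The constructor change below switches the spillers from taking each analysis by hand to taking the owning pass and querying it, so call sites stay stable as spillers grow new requirements. A standalone model of that dependency-pull style (toy Pass type, not MachineFunctionPass):

#include <cassert>

struct LiveIntervals { /* analysis results */ };

struct Pass {
  LiveIntervals *lis;                      // set up by the pass manager
  LiveIntervals &getLiveIntervals() { assert(lis); return *lis; }
};

// Mirrors the new SpillerBase: keep the pass, and fetch analyses from it
// in the constructor instead of threading them through every caller.
struct SpillerBase {
  Pass *pass;
  LiveIntervals *lis;
  explicit SpillerBase(Pass &p) : pass(&p), lis(&p.getLiveIntervals()) {}
};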
- SpillerBase(MachineFunction *mf, LiveIntervals *lis, VirtRegMap *vrm) - : mf(mf), lis(lis), vrm(vrm) + SpillerBase(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm) + : pass(&pass), mf(&mf), vrm(&vrm) { - mfi = mf->getFrameInfo(); - mri = &mf->getRegInfo(); - tii = mf->getTarget().getInstrInfo(); - tri = mf->getTarget().getRegisterInfo(); + lis = &pass.getAnalysis<LiveIntervals>(); + mfi = mf.getFrameInfo(); + mri = &mf.getRegInfo(); + tii = mf.getTarget().getInstrInfo(); + tri = mf.getTarget().getRegisterInfo(); } /// Add spill ranges for every use/def of the live interval, inserting loads /// immediately before each use, and stores after each def. No folding or /// remat is attempted. void trivialSpillEverywhere(LiveInterval *li, - std::vector<LiveInterval*> &newIntervals) { + SmallVectorImpl<LiveInterval*> &newIntervals) { DEBUG(dbgs() << "Spilling everywhere " << *li << "\n"); assert(li->weight != HUGE_VALF && @@ -173,13 +176,13 @@ namespace { class TrivialSpiller : public SpillerBase { public: - TrivialSpiller(MachineFunction *mf, LiveIntervals *lis, VirtRegMap *vrm) - : SpillerBase(mf, lis, vrm) {} + TrivialSpiller(MachineFunctionPass &pass, MachineFunction &mf, + VirtRegMap &vrm) + : SpillerBase(pass, mf, vrm) {} void spill(LiveInterval *li, - std::vector<LiveInterval*> &newIntervals, - SmallVectorImpl<LiveInterval*> &, - SlotIndex*) { + SmallVectorImpl<LiveInterval*> &newIntervals, + SmallVectorImpl<LiveInterval*> &) { // Ignore spillIs - we don't use it. trivialSpillEverywhere(li, newIntervals); } @@ -193,18 +196,19 @@ namespace { class StandardSpiller : public Spiller { protected: LiveIntervals *lis; - const MachineLoopInfo *loopInfo; + MachineLoopInfo *loopInfo; VirtRegMap *vrm; public: - StandardSpiller(LiveIntervals *lis, const MachineLoopInfo *loopInfo, - VirtRegMap *vrm) - : lis(lis), loopInfo(loopInfo), vrm(vrm) {} + StandardSpiller(MachineFunctionPass &pass, MachineFunction &mf, + VirtRegMap &vrm) + : lis(&pass.getAnalysis<LiveIntervals>()), + loopInfo(pass.getAnalysisIfAvailable<MachineLoopInfo>()), + vrm(&vrm) {} /// Falls back on LiveIntervals::addIntervalsForSpills. void spill(LiveInterval *li, - std::vector<LiveInterval*> &newIntervals, - SmallVectorImpl<LiveInterval*> &spillIs, - SlotIndex*) { + SmallVectorImpl<LiveInterval*> &newIntervals, + SmallVectorImpl<LiveInterval*> &spillIs) { std::vector<LiveInterval*> added = lis->addIntervalsForSpills(*li, spillIs, loopInfo, *vrm); newIntervals.insert(newIntervals.end(), added.begin(), added.end()); @@ -221,23 +225,21 @@ namespace { /// then the spiller falls back on the standard spilling mechanism. 
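For contrast with the splitting path, the trivialSpillEverywhere strategy shown earlier is the bluntest policy: every use reloads from the stack slot just before the instruction and every def stores back just after, with no folding or rematerialization. A standalone sketch of that placement rule (toy types, illustrative only):

#include <cstdio>
#include <vector>

struct Access { unsigned index; bool isDef; };

// Emit one reload per use and one store per def against a single slot.
void spillEverywhere(const std::vector<Access> &accesses, int slot) {
  for (size_t i = 0; i != accesses.size(); ++i) {
    const Access &A = accesses[i];
    if (A.isDef)
      std::printf("%u: store to slot %d after def\n", A.index, slot);
    else
      std::printf("%u: reload from slot %d before use\n", A.index, slot);
  }
}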
class SplittingSpiller : public StandardSpiller { public: - SplittingSpiller(MachineFunction *mf, LiveIntervals *lis, - const MachineLoopInfo *loopInfo, VirtRegMap *vrm) - : StandardSpiller(lis, loopInfo, vrm) { - - mri = &mf->getRegInfo(); - tii = mf->getTarget().getInstrInfo(); - tri = mf->getTarget().getRegisterInfo(); + SplittingSpiller(MachineFunctionPass &pass, MachineFunction &mf, + VirtRegMap &vrm) + : StandardSpiller(pass, mf, vrm) { + mri = &mf.getRegInfo(); + tii = mf.getTarget().getInstrInfo(); + tri = mf.getTarget().getRegisterInfo(); } void spill(LiveInterval *li, - std::vector<LiveInterval*> &newIntervals, - SmallVectorImpl<LiveInterval*> &spillIs, - SlotIndex *earliestStart) { + SmallVectorImpl<LiveInterval*> &newIntervals, + SmallVectorImpl<LiveInterval*> &spillIs) { if (worthTryingToSplit(li)) - tryVNISplit(li, earliestStart); + tryVNISplit(li); else - StandardSpiller::spill(li, newIntervals, spillIs, earliestStart); + StandardSpiller::spill(li, newIntervals, spillIs); } private: @@ -252,8 +254,7 @@ private: } /// Try to break a LiveInterval into its component values. - std::vector<LiveInterval*> tryVNISplit(LiveInterval *li, - SlotIndex *earliestStart) { + std::vector<LiveInterval*> tryVNISplit(LiveInterval *li) { DEBUG(dbgs() << "Trying VNI split of %reg" << *li << "\n"); @@ -277,10 +278,6 @@ private: DEBUG(dbgs() << *splitInterval << "\n"); added.push_back(splitInterval); alreadySplit.insert(splitInterval); - if (earliestStart != 0) { - if (splitInterval->beginIndex() < *earliestStart) - *earliestStart = splitInterval->beginIndex(); - } } else { DEBUG(dbgs() << "0\n"); } @@ -293,10 +290,6 @@ private: if (!li->empty()) { added.push_back(li); alreadySplit.insert(li); - if (earliestStart != 0) { - if (li->beginIndex() < *earliestStart) - *earliestStart = li->beginIndex(); - } } return added; @@ -506,20 +499,19 @@ private: namespace llvm { -Spiller *createInlineSpiller(MachineFunction*, - LiveIntervals*, - const MachineLoopInfo*, - VirtRegMap*); +Spiller *createInlineSpiller(MachineFunctionPass &pass, + MachineFunction &mf, + VirtRegMap &vrm); } -llvm::Spiller* llvm::createSpiller(MachineFunction *mf, LiveIntervals *lis, - const MachineLoopInfo *loopInfo, - VirtRegMap *vrm) { +llvm::Spiller* llvm::createSpiller(MachineFunctionPass &pass, + MachineFunction &mf, + VirtRegMap &vrm) { switch (spillerOpt) { default: assert(0 && "unknown spiller"); - case trivial: return new TrivialSpiller(mf, lis, vrm); - case standard: return new StandardSpiller(lis, loopInfo, vrm); - case splitting: return new SplittingSpiller(mf, lis, loopInfo, vrm); - case inline_: return createInlineSpiller(mf, lis, loopInfo, vrm); + case trivial: return new TrivialSpiller(pass, mf, vrm); + case standard: return new StandardSpiller(pass, mf, vrm); + case splitting: return new SplittingSpiller(pass, mf, vrm); + case inline_: return createInlineSpiller(pass, mf, vrm); } } diff --git a/lib/CodeGen/Spiller.h b/lib/CodeGen/Spiller.h index 450447b3933a8..59bc0ec6ae70f 100644 --- a/lib/CodeGen/Spiller.h +++ b/lib/CodeGen/Spiller.h @@ -11,19 +11,14 @@ #define LLVM_CODEGEN_SPILLER_H #include "llvm/ADT/SmallVector.h" -#include <vector> namespace llvm { class LiveInterval; - class LiveIntervals; - class LiveStacks; class MachineFunction; - class MachineInstr; - class MachineLoopInfo; + class MachineFunctionPass; class SlotIndex; class VirtRegMap; - class VNInfo; /// Spiller interface. 
/// @@ -40,18 +35,16 @@ namespace llvm { /// @param spillIs A list of intervals that are about to be spilled, /// and so cannot be used for remat etc. /// @param newIntervals The newly created intervals will be appended here. - /// @param earliestIndex The earliest point for splitting. (OK, it's another - /// pointer to the allocator guts). virtual void spill(LiveInterval *li, - std::vector<LiveInterval*> &newIntervals, - SmallVectorImpl<LiveInterval*> &spillIs, - SlotIndex *earliestIndex = 0) = 0; + SmallVectorImpl<LiveInterval*> &newIntervals, + SmallVectorImpl<LiveInterval*> &spillIs) = 0; }; /// Create and return a spiller object, as specified on the command line. - Spiller* createSpiller(MachineFunction *mf, LiveIntervals *li, - const MachineLoopInfo *loopInfo, VirtRegMap *vrm); + Spiller* createSpiller(MachineFunctionPass &pass, + MachineFunction &mf, + VirtRegMap &vrm); } #endif diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp new file mode 100644 index 0000000000000..29474f0d55121 --- /dev/null +++ b/lib/CodeGen/SplitKit.cpp @@ -0,0 +1,1097 @@ +//===---------- SplitKit.cpp - Toolkit for splitting live ranges ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the SplitAnalysis class as well as mutator functions for +// live range splitting. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "splitter" +#include "SplitKit.h" +#include "VirtRegMap.h" +#include "llvm/CodeGen/CalcSpillWeights.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +static cl::opt<bool> +AllowSplit("spiller-splits-edges", + cl::desc("Allow critical edge splitting during spilling")); + +//===----------------------------------------------------------------------===// +// Split Analysis +//===----------------------------------------------------------------------===// + +SplitAnalysis::SplitAnalysis(const MachineFunction &mf, + const LiveIntervals &lis, + const MachineLoopInfo &mli) + : mf_(mf), + lis_(lis), + loops_(mli), + tii_(*mf.getTarget().getInstrInfo()), + curli_(0) {} + +void SplitAnalysis::clear() { + usingInstrs_.clear(); + usingBlocks_.clear(); + usingLoops_.clear(); + curli_ = 0; +} + +bool SplitAnalysis::canAnalyzeBranch(const MachineBasicBlock *MBB) { + MachineBasicBlock *T, *F; + SmallVector<MachineOperand, 4> Cond; + return !tii_.AnalyzeBranch(const_cast<MachineBasicBlock&>(*MBB), T, F, Cond); +} + +/// analyzeUses - Count instructions, basic blocks, and loops using curli. 
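analyzeUses below counts at three granularities with two short-circuits: an instruction already seen is skipped, and a block only bumps its loop the first time the block is seen. That counting discipline, standalone (string keys instead of MBB/loop pointers; not the real API):

#include <map>
#include <set>
#include <string>

std::set<const void*> usingInstrs;              // models usingInstrs_
std::map<std::string, unsigned> usingBlocks;    // block -> use count
std::map<std::string, unsigned> usingLoops;     // loop  -> block count

void countUse(const void *instr, const std::string &block,
              const std::string &loop /* empty if none */) {
  if (!usingInstrs.insert(instr).second)
    return;                 // instruction already counted
  if (usingBlocks[block]++)
    return;                 // block already credited to its loop
  if (!loop.empty())
    usingLoops[loop]++;
}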
+void SplitAnalysis::analyzeUses() { + const MachineRegisterInfo &MRI = mf_.getRegInfo(); + for (MachineRegisterInfo::reg_iterator I = MRI.reg_begin(curli_->reg); + MachineInstr *MI = I.skipInstruction();) { + if (MI->isDebugValue() || !usingInstrs_.insert(MI)) + continue; + MachineBasicBlock *MBB = MI->getParent(); + if (usingBlocks_[MBB]++) + continue; + if (MachineLoop *Loop = loops_.getLoopFor(MBB)) + usingLoops_[Loop]++; + } + DEBUG(dbgs() << " counted " + << usingInstrs_.size() << " instrs, " + << usingBlocks_.size() << " blocks, " + << usingLoops_.size() << " loops.\n"); +} + +/// removeUse - Update statistics by noting that MI no longer uses curli. +void SplitAnalysis::removeUse(const MachineInstr *MI) { + if (!usingInstrs_.erase(MI)) + return; + + // Decrement MBB count. + const MachineBasicBlock *MBB = MI->getParent(); + BlockCountMap::iterator bi = usingBlocks_.find(MBB); + assert(bi != usingBlocks_.end() && "MBB missing"); + assert(bi->second && "0 count in map"); + if (--bi->second) + return; + // No more uses in MBB. + usingBlocks_.erase(bi); + + // Decrement loop count. + MachineLoop *Loop = loops_.getLoopFor(MBB); + if (!Loop) + return; + LoopCountMap::iterator li = usingLoops_.find(Loop); + assert(li != usingLoops_.end() && "Loop missing"); + assert(li->second && "0 count in map"); + if (--li->second) + return; + // No more blocks in Loop. + usingLoops_.erase(li); +} + +// Get three sets of basic blocks surrounding a loop: Blocks inside the loop, +// predecessor blocks, and exit blocks. +void SplitAnalysis::getLoopBlocks(const MachineLoop *Loop, LoopBlocks &Blocks) { + Blocks.clear(); + + // Blocks in the loop. + Blocks.Loop.insert(Loop->block_begin(), Loop->block_end()); + + // Predecessor blocks. + const MachineBasicBlock *Header = Loop->getHeader(); + for (MachineBasicBlock::const_pred_iterator I = Header->pred_begin(), + E = Header->pred_end(); I != E; ++I) + if (!Blocks.Loop.count(*I)) + Blocks.Preds.insert(*I); + + // Exit blocks. + for (MachineLoop::block_iterator I = Loop->block_begin(), + E = Loop->block_end(); I != E; ++I) { + const MachineBasicBlock *MBB = *I; + for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(), + SE = MBB->succ_end(); SI != SE; ++SI) + if (!Blocks.Loop.count(*SI)) + Blocks.Exits.insert(*SI); + } +} + +/// analyzeLoopPeripheralUse - Return an enum describing how curli_ is used in +/// and around the Loop. +SplitAnalysis::LoopPeripheralUse SplitAnalysis:: +analyzeLoopPeripheralUse(const SplitAnalysis::LoopBlocks &Blocks) { + LoopPeripheralUse use = ContainedInLoop; + for (BlockCountMap::iterator I = usingBlocks_.begin(), E = usingBlocks_.end(); + I != E; ++I) { + const MachineBasicBlock *MBB = I->first; + // Is this a peripheral block? + if (use < MultiPeripheral && + (Blocks.Preds.count(MBB) || Blocks.Exits.count(MBB))) { + if (I->second > 1) use = MultiPeripheral; + else use = SinglePeripheral; + continue; + } + // Is it a loop block? + if (Blocks.Loop.count(MBB)) + continue; + // It must be an unrelated block. + return OutsideLoop; + } + return use; +} + +/// getCriticalExits - It may be necessary to partially break critical edges +/// leaving the loop if an exit block has phi uses of curli. Collect the exit +/// blocks that need special treatment into CriticalExits. 
+void SplitAnalysis::getCriticalExits(const SplitAnalysis::LoopBlocks &Blocks, + BlockPtrSet &CriticalExits) { + CriticalExits.clear(); + + // A critical exit block contains a phi def of curli, and has a predecessor + // that is not in the loop nor a loop predecessor. + // For such an exit block, the edges carrying the new variable must be moved + // to a new pre-exit block. + for (BlockPtrSet::iterator I = Blocks.Exits.begin(), E = Blocks.Exits.end(); + I != E; ++I) { + const MachineBasicBlock *Succ = *I; + SlotIndex SuccIdx = lis_.getMBBStartIdx(Succ); + VNInfo *SuccVNI = curli_->getVNInfoAt(SuccIdx); + // This exit may not have curli live in at all. No need to split. + if (!SuccVNI) + continue; + // If this is not a PHI def, it is either using a value from before the + // loop, or a value defined inside the loop. Both are safe. + if (!SuccVNI->isPHIDef() || SuccVNI->def.getBaseIndex() != SuccIdx) + continue; + // This exit block does have a PHI. Does it also have a predecessor that is + // not a loop block or loop predecessor? + for (MachineBasicBlock::const_pred_iterator PI = Succ->pred_begin(), + PE = Succ->pred_end(); PI != PE; ++PI) { + const MachineBasicBlock *Pred = *PI; + if (Blocks.Loop.count(Pred) || Blocks.Preds.count(Pred)) + continue; + // This is a critical exit block, and we need to split the exit edge. + CriticalExits.insert(Succ); + break; + } + } +} + +/// canSplitCriticalExits - Return true if it is possible to insert new exit +/// blocks before the blocks in CriticalExits. +bool +SplitAnalysis::canSplitCriticalExits(const SplitAnalysis::LoopBlocks &Blocks, + BlockPtrSet &CriticalExits) { + // If we don't allow critical edge splitting, require no critical exits. + if (!AllowSplit) + return CriticalExits.empty(); + + for (BlockPtrSet::iterator I = CriticalExits.begin(), E = CriticalExits.end(); + I != E; ++I) { + const MachineBasicBlock *Succ = *I; + // We want to insert a new pre-exit MBB before Succ, and change all the + // in-loop blocks to branch to the pre-exit instead of Succ. + // Check that all the in-loop predecessors can be changed. + for (MachineBasicBlock::const_pred_iterator PI = Succ->pred_begin(), + PE = Succ->pred_end(); PI != PE; ++PI) { + const MachineBasicBlock *Pred = *PI; + // The external predecessors won't be altered. + if (!Blocks.Loop.count(Pred) && !Blocks.Preds.count(Pred)) + continue; + if (!canAnalyzeBranch(Pred)) + return false; + } + + // If Succ's layout predecessor falls through, that too must be analyzable. + // We need to insert the pre-exit block in the gap. + MachineFunction::const_iterator MFI = Succ; + if (MFI == mf_.begin()) + continue; + if (!canAnalyzeBranch(--MFI)) + return false; + } + // No problems found. + return true; +} + +void SplitAnalysis::analyze(const LiveInterval *li) { + clear(); + curli_ = li; + analyzeUses(); +} + +const MachineLoop *SplitAnalysis::getBestSplitLoop() { + assert(curli_ && "Call analyze() before getBestSplitLoop"); + if (usingLoops_.empty()) + return 0; + + LoopPtrSet Loops, SecondLoops; + LoopBlocks Blocks; + BlockPtrSet CriticalExits; + + // Find first-class and second class candidate loops. + // We prefer to split around loops where curli is used outside the periphery. + for (LoopCountMap::const_iterator I = usingLoops_.begin(), + E = usingLoops_.end(); I != E; ++I) { + const MachineLoop *Loop = I->first; + getLoopBlocks(Loop, Blocks); + + // FIXME: We need an SSA updater to properly handle multiple exit blocks. 
+ if (Blocks.Exits.size() > 1) { + DEBUG(dbgs() << " multiple exits from " << *Loop); + continue; + } + + LoopPtrSet *LPS = 0; + switch(analyzeLoopPeripheralUse(Blocks)) { + case OutsideLoop: + LPS = &Loops; + break; + case MultiPeripheral: + LPS = &SecondLoops; + break; + case ContainedInLoop: + DEBUG(dbgs() << " contained in " << *Loop); + continue; + case SinglePeripheral: + DEBUG(dbgs() << " single peripheral use in " << *Loop); + continue; + } + // Will it be possible to split around this loop? + getCriticalExits(Blocks, CriticalExits); + DEBUG(dbgs() << " " << CriticalExits.size() << " critical exits from " + << *Loop); + if (!canSplitCriticalExits(Blocks, CriticalExits)) + continue; + // This is a possible split. + assert(LPS); + LPS->insert(Loop); + } + + DEBUG(dbgs() << " getBestSplitLoop found " << Loops.size() << " + " + << SecondLoops.size() << " candidate loops.\n"); + + // If there are no first class loops available, look at second class loops. + if (Loops.empty()) + Loops = SecondLoops; + + if (Loops.empty()) + return 0; + + // Pick the earliest loop. + // FIXME: Are there other heuristics to consider? + const MachineLoop *Best = 0; + SlotIndex BestIdx; + for (LoopPtrSet::const_iterator I = Loops.begin(), E = Loops.end(); I != E; + ++I) { + SlotIndex Idx = lis_.getMBBStartIdx((*I)->getHeader()); + if (!Best || Idx < BestIdx) + Best = *I, BestIdx = Idx; + } + DEBUG(dbgs() << " getBestSplitLoop found " << *Best); + return Best; +} + +/// getMultiUseBlocks - if curli has more than one use in a basic block, it +/// may be an advantage to split curli for the duration of the block. +bool SplitAnalysis::getMultiUseBlocks(BlockPtrSet &Blocks) { + // If curli is local to one block, there is no point to splitting it. + if (usingBlocks_.size() <= 1) + return false; + // Add blocks with multiple uses. + for (BlockCountMap::iterator I = usingBlocks_.begin(), E = usingBlocks_.end(); + I != E; ++I) + switch (I->second) { + case 0: + case 1: + continue; + case 2: { + // It doesn't pay to split a 2-instr block if it redefines curli. + VNInfo *VN1 = curli_->getVNInfoAt(lis_.getMBBStartIdx(I->first)); + VNInfo *VN2 = + curli_->getVNInfoAt(lis_.getMBBEndIdx(I->first).getPrevIndex()); + // live-in and live-out with a different value. + if (VN1 && VN2 && VN1 != VN2) + continue; + } // Fall through. + default: + Blocks.insert(I->first); + } + return !Blocks.empty(); +} + +//===----------------------------------------------------------------------===// +// LiveIntervalMap +//===----------------------------------------------------------------------===// + +// defValue - Introduce a li_ def for ParentVNI that could be later than +// ParentVNI->def. +VNInfo *LiveIntervalMap::defValue(const VNInfo *ParentVNI, SlotIndex Idx) { + assert(ParentVNI && "Mapping NULL value"); + assert(Idx.isValid() && "Invalid SlotIndex"); + assert(parentli_.getVNInfoAt(Idx) == ParentVNI && "Bad ParentVNI"); + + // Is this a simple 1-1 mapping? Not likely. + if (Idx == ParentVNI->def) + return mapValue(ParentVNI, Idx); + + // This is a complex def. Mark with a NULL in valueMap. 
+ VNInfo *OldVNI = + valueMap_.insert( + ValueMap::value_type(ParentVNI, static_cast<VNInfo *>(0))).first->second; + // The static_cast<VNInfo *> is only needed to work around a bug in an + // old version of the C++0x standard which the following compilers + // implemented and have yet to fix: + // + // Microsoft Visual Studio 2010 Version 10.0.30319.1 RTMRel + // Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 16.00.30319.01 + // + // If/When we move to C++0x, this can be replaced by nullptr. + (void)OldVNI; + assert(OldVNI == 0 && "Simple/Complex values mixed"); + + // Should we insert a minimal snippet of VNI LiveRange, or can we count on + // callers to do that? We need it for lookups of complex values. + VNInfo *VNI = li_.getNextValue(Idx, 0, true, lis_.getVNInfoAllocator()); + return VNI; +} + +// mapValue - Find the mapped value for ParentVNI at Idx. +// Potentially create phi-def values. +VNInfo *LiveIntervalMap::mapValue(const VNInfo *ParentVNI, SlotIndex Idx) { + assert(ParentVNI && "Mapping NULL value"); + assert(Idx.isValid() && "Invalid SlotIndex"); + assert(parentli_.getVNInfoAt(Idx) == ParentVNI && "Bad ParentVNI"); + + // Use insert for lookup, so we can add missing values with a second lookup. + std::pair<ValueMap::iterator,bool> InsP = + valueMap_.insert(ValueMap::value_type(ParentVNI, static_cast<VNInfo *>(0))); + // The static_cast<VNInfo *> is only needed to work around a bug in an + // old version of the C++0x standard which the following compilers + // implemented and have yet to fix: + // + // Microsoft Visual Studio 2010 Version 10.0.30319.1 RTMRel + // Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 16.00.30319.01 + // + // If/When we move to C++0x, this can be replaced by nullptr. + + // This was an unknown value. Create a simple mapping. + if (InsP.second) + return InsP.first->second = li_.createValueCopy(ParentVNI, + lis_.getVNInfoAllocator()); + // This was a simple mapped value. + if (InsP.first->second) + return InsP.first->second; + + // This is a complex mapped value. There may be multiple defs, and we may need + // to create phi-defs. + MachineBasicBlock *IdxMBB = lis_.getMBBFromIndex(Idx); + assert(IdxMBB && "No MBB at Idx"); + + // Is there a def in the same MBB we can extend? + if (VNInfo *VNI = extendTo(IdxMBB, Idx)) + return VNI; + + // Now for the fun part. We know that ParentVNI potentially has multiple defs, + // and we may need to create even more phi-defs to preserve VNInfo SSA form. + // Perform a depth-first search for predecessor blocks where we know the + // dominating VNInfo. Insert phi-def VNInfos along the path back to IdxMBB. + + // Track MBBs where we have created or learned the dominating value. + // This may change during the DFS as we create new phi-defs. + typedef DenseMap<MachineBasicBlock*, VNInfo*> MBBValueMap; + MBBValueMap DomValue; + + for (idf_iterator<MachineBasicBlock*> + IDFI = idf_begin(IdxMBB), + IDFE = idf_end(IdxMBB); IDFI != IDFE;) { + MachineBasicBlock *MBB = *IDFI; + SlotIndex End = lis_.getMBBEndIdx(MBB); + + // We are operating on the restricted CFG where ParentVNI is live. + if (parentli_.getVNInfoAt(End.getPrevSlot()) != ParentVNI) { + IDFI.skipChildren(); + continue; + } + + // Do we have a dominating value in this block? + VNInfo *VNI = extendTo(MBB, End); + if (!VNI) { + ++IDFI; + continue; + } + + // Yes, VNI dominates MBB. Track the path back to IdxMBB, creating phi-defs + // as needed along the way. 
+    for (unsigned PI = IDFI.getPathLength()-1; PI != 0; --PI) {
+      // Start from MBB's immediate successor. End at IdxMBB.
+      MachineBasicBlock *Succ = IDFI.getPath(PI-1);
+      std::pair<MBBValueMap::iterator, bool> InsP =
+        DomValue.insert(MBBValueMap::value_type(Succ, VNI));
+
+      // This is the first time we backtrack to Succ.
+      if (InsP.second)
+        continue;
+
+      // We reached Succ again with the same VNI. Nothing is going to change.
+      VNInfo *OVNI = InsP.first->second;
+      if (OVNI == VNI)
+        break;
+
+      // Succ already has a phi-def. No need to continue.
+      SlotIndex Start = lis_.getMBBStartIdx(Succ);
+      if (OVNI->def == Start)
+        break;
+
+      // We have a collision between the old and new VNI at Succ. That means
+      // neither dominates and we need a new phi-def.
+      VNI = li_.getNextValue(Start, 0, true, lis_.getVNInfoAllocator());
+      VNI->setIsPHIDef(true);
+      InsP.first->second = VNI;
+
+      // Replace OVNI with VNI in the remaining path.
+      for (; PI > 1; --PI) {
+        MBBValueMap::iterator I = DomValue.find(IDFI.getPath(PI-2));
+        if (I == DomValue.end() || I->second != OVNI)
+          break;
+        I->second = VNI;
+      }
+    }
+
+    // No need to search the children, we found a dominating value.
+    IDFI.skipChildren();
+  }
+
+  // The search should at least find a dominating value for IdxMBB.
+  assert(!DomValue.empty() && "Couldn't find a reaching definition");
+
+  // Since we went through the trouble of a full DFS visiting all reaching defs,
+  // the values in DomValue are now accurate. No more phi-defs are needed for
+  // these blocks, so we can color the live ranges.
+  // This makes the next mapValue call much faster.
+  VNInfo *IdxVNI = 0;
+  for (MBBValueMap::iterator I = DomValue.begin(), E = DomValue.end(); I != E;
+       ++I) {
+    MachineBasicBlock *MBB = I->first;
+    VNInfo *VNI = I->second;
+    SlotIndex Start = lis_.getMBBStartIdx(MBB);
+    if (MBB == IdxMBB) {
+      // Don't add full liveness to IdxMBB, stop at Idx.
+      if (Start != Idx)
+        li_.addRange(LiveRange(Start, Idx, VNI));
+      // The caller had better add some liveness to IdxVNI, or it leaks.
+      IdxVNI = VNI;
+    } else
+      li_.addRange(LiveRange(Start, lis_.getMBBEndIdx(MBB), VNI));
+  }
+
+  assert(IdxVNI && "Didn't find value for Idx");
+  return IdxVNI;
+}
+
+// extendTo - Find the last li_ value defined in MBB at or before Idx. The
+// parentli_ is assumed to be live at Idx. Extend the live range to Idx.
+// Return the found VNInfo, or NULL.
+VNInfo *LiveIntervalMap::extendTo(MachineBasicBlock *MBB, SlotIndex Idx) {
+  LiveInterval::iterator I = std::upper_bound(li_.begin(), li_.end(), Idx);
+  if (I == li_.begin())
+    return 0;
+  --I;
+  if (I->start < lis_.getMBBStartIdx(MBB))
+    return 0;
+  if (I->end < Idx)
+    I->end = Idx;
+  return I->valno;
+}
+
+// addSimpleRange - Add a simple range from parentli_ to li_.
+// ParentVNI must be live in the [Start;End) interval.
+void LiveIntervalMap::addSimpleRange(SlotIndex Start, SlotIndex End,
+                                     const VNInfo *ParentVNI) {
+  VNInfo *VNI = mapValue(ParentVNI, Start);
+  // A simple mapping is easy.
+  if (VNI->def == ParentVNI->def) {
+    li_.addRange(LiveRange(Start, End, VNI));
+    return;
+  }
+
+  // ParentVNI is a complex value. We must map per MBB.
+  MachineFunction::iterator MBB = lis_.getMBBFromIndex(Start);
+  MachineFunction::iterator MBBE = lis_.getMBBFromIndex(End);
+
+  if (MBB == MBBE) {
+    li_.addRange(LiveRange(Start, End, VNI));
+    return;
+  }
+
+  // First block.
+  li_.addRange(LiveRange(Start, lis_.getMBBEndIdx(MBB), VNI));
+
+  // Run sequence of full blocks.
+ for (++MBB; MBB != MBBE; ++MBB) { + Start = lis_.getMBBStartIdx(MBB); + li_.addRange(LiveRange(Start, lis_.getMBBEndIdx(MBB), + mapValue(ParentVNI, Start))); + } + + // Final block. + Start = lis_.getMBBStartIdx(MBB); + if (Start != End) + li_.addRange(LiveRange(Start, End, mapValue(ParentVNI, Start))); +} + +/// addRange - Add live ranges to li_ where [Start;End) intersects parentli_. +/// All needed values whose def is not inside [Start;End) must be defined +/// beforehand so mapValue will work. +void LiveIntervalMap::addRange(SlotIndex Start, SlotIndex End) { + LiveInterval::const_iterator B = parentli_.begin(), E = parentli_.end(); + LiveInterval::const_iterator I = std::lower_bound(B, E, Start); + + // Check if --I begins before Start and overlaps. + if (I != B) { + --I; + if (I->end > Start) + addSimpleRange(Start, std::min(End, I->end), I->valno); + ++I; + } + + // The remaining ranges begin after Start. + for (;I != E && I->start < End; ++I) + addSimpleRange(I->start, std::min(End, I->end), I->valno); +} + +//===----------------------------------------------------------------------===// +// Split Editor +//===----------------------------------------------------------------------===// + +/// Create a new SplitEditor for editing the LiveInterval analyzed by SA. +SplitEditor::SplitEditor(SplitAnalysis &sa, LiveIntervals &lis, VirtRegMap &vrm, + SmallVectorImpl<LiveInterval*> &intervals) + : sa_(sa), lis_(lis), vrm_(vrm), + mri_(vrm.getMachineFunction().getRegInfo()), + tii_(*vrm.getMachineFunction().getTarget().getInstrInfo()), + curli_(sa_.getCurLI()), + dupli_(0), openli_(0), + intervals_(intervals), + firstInterval(intervals_.size()) +{ + assert(curli_ && "SplitEditor created from empty SplitAnalysis"); + + // Make sure curli_ is assigned a stack slot, so all our intervals get the + // same slot as curli_. + if (vrm_.getStackSlot(curli_->reg) == VirtRegMap::NO_STACK_SLOT) + vrm_.assignVirt2StackSlot(curli_->reg); + +} + +LiveInterval *SplitEditor::createInterval() { + unsigned curli = sa_.getCurLI()->reg; + unsigned Reg = mri_.createVirtualRegister(mri_.getRegClass(curli)); + LiveInterval &Intv = lis_.getOrCreateInterval(Reg); + vrm_.grow(); + vrm_.assignVirt2StackSlot(Reg, vrm_.getStackSlot(curli)); + return &Intv; +} + +LiveInterval *SplitEditor::getDupLI() { + if (!dupli_) { + // Create an interval for dupli that is a copy of curli. + dupli_ = createInterval(); + dupli_->Copy(*curli_, &mri_, lis_.getVNInfoAllocator()); + } + return dupli_; +} + +VNInfo *SplitEditor::mapValue(const VNInfo *curliVNI) { + VNInfo *&VNI = valueMap_[curliVNI]; + if (!VNI) + VNI = openli_->createValueCopy(curliVNI, lis_.getVNInfoAllocator()); + return VNI; +} + +/// Insert a COPY instruction curli -> li. Allocate a new value from li +/// defined by the COPY. Note that rewrite() will deal with the curli +/// register, so this function can be used to copy from any interval - openli, +/// curli, or dupli. +VNInfo *SplitEditor::insertCopy(LiveInterval &LI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + MachineInstr *MI = BuildMI(MBB, I, DebugLoc(), tii_.get(TargetOpcode::COPY), + LI.reg).addReg(curli_->reg); + SlotIndex DefIdx = lis_.InsertMachineInstrInMaps(MI).getDefIndex(); + return LI.getNextValue(DefIdx, MI, true, lis_.getVNInfoAllocator()); +} + +/// Create a new virtual register and live interval. 
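The functions that follow form a small protocol: openIntv() starts a fresh interval, enterIntv*() copies the current value in, useIntv() attributes a range of instructions to the open interval, and leaveIntv*() copies back out. A toy state machine makes the intended call order explicit (illustrative, not the SplitEditor API):

#include <cassert>
#include <cstdio>

struct SplitEditorModel {
  bool open;
  SplitEditorModel() : open(false) {}
  void openIntv()         { assert(!open); open = true; }
  void enterBefore(int i) { assert(open); std::printf("copy-in before %d\n", i); }
  void use(int b, int e)  { assert(open); std::printf("use [%d;%d)\n", b, e); }
  void leaveAfter(int i)  { assert(open); std::printf("copy-out after %d\n", i); }
};

int main() {
  SplitEditorModel SE;    // split one value around instructions 10..20
  SE.openIntv();
  SE.enterBefore(10);
  SE.use(10, 20);
  SE.leaveAfter(20);
  return 0;
}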
+void SplitEditor::openIntv() { + assert(!openli_ && "Previous LI not closed before openIntv"); + openli_ = createInterval(); + intervals_.push_back(openli_); + liveThrough_ = false; +} + +/// enterIntvBefore - Enter openli before the instruction at Idx. If curli is +/// not live before Idx, a COPY is not inserted. +void SplitEditor::enterIntvBefore(SlotIndex Idx) { + assert(openli_ && "openIntv not called before enterIntvBefore"); + + // Copy from curli_ if it is live. + if (VNInfo *CurVNI = curli_->getVNInfoAt(Idx.getUseIndex())) { + MachineInstr *MI = lis_.getInstructionFromIndex(Idx); + assert(MI && "enterIntvBefore called with invalid index"); + VNInfo *VNI = insertCopy(*openli_, *MI->getParent(), MI); + openli_->addRange(LiveRange(VNI->def, Idx.getDefIndex(), VNI)); + + // Make sure CurVNI is properly mapped. + VNInfo *&mapVNI = valueMap_[CurVNI]; + // We dont have SSA update yet, so only one entry per value is allowed. + assert(!mapVNI && "enterIntvBefore called more than once for the same value"); + mapVNI = VNI; + } + DEBUG(dbgs() << " enterIntvBefore " << Idx << ": " << *openli_ << '\n'); +} + +/// enterIntvAtEnd - Enter openli at the end of MBB. +/// PhiMBB is a successor inside openli where a PHI value is created. +/// Currently, all entries must share the same PhiMBB. +void SplitEditor::enterIntvAtEnd(MachineBasicBlock &A, MachineBasicBlock &B) { + assert(openli_ && "openIntv not called before enterIntvAtEnd"); + + SlotIndex EndA = lis_.getMBBEndIdx(&A); + VNInfo *CurVNIA = curli_->getVNInfoAt(EndA.getPrevIndex()); + if (!CurVNIA) { + DEBUG(dbgs() << " enterIntvAtEnd, curli not live out of BB#" + << A.getNumber() << ".\n"); + return; + } + + // Add a phi kill value and live range out of A. + VNInfo *VNIA = insertCopy(*openli_, A, A.getFirstTerminator()); + openli_->addRange(LiveRange(VNIA->def, EndA, VNIA)); + + // FIXME: If this is the only entry edge, we don't need the extra PHI value. + // FIXME: If there are multiple entry blocks (so not a loop), we need proper + // SSA update. + + // Now look at the start of B. + SlotIndex StartB = lis_.getMBBStartIdx(&B); + SlotIndex EndB = lis_.getMBBEndIdx(&B); + const LiveRange *CurB = curli_->getLiveRangeContaining(StartB); + if (!CurB) { + DEBUG(dbgs() << " enterIntvAtEnd: curli not live in to BB#" + << B.getNumber() << ".\n"); + return; + } + + VNInfo *VNIB = openli_->getVNInfoAt(StartB); + if (!VNIB) { + // Create a phi value. + VNIB = openli_->getNextValue(SlotIndex(StartB, true), 0, false, + lis_.getVNInfoAllocator()); + VNIB->setIsPHIDef(true); + VNInfo *&mapVNI = valueMap_[CurB->valno]; + if (mapVNI) { + // Multiple copies - must create PHI value. + abort(); + } else { + // This is the first copy of dupLR. Mark the mapping. + mapVNI = VNIB; + } + + } + + DEBUG(dbgs() << " enterIntvAtEnd: " << *openli_ << '\n'); +} + +/// useIntv - indicate that all instructions in MBB should use openli. +void SplitEditor::useIntv(const MachineBasicBlock &MBB) { + useIntv(lis_.getMBBStartIdx(&MBB), lis_.getMBBEndIdx(&MBB)); +} + +void SplitEditor::useIntv(SlotIndex Start, SlotIndex End) { + assert(openli_ && "openIntv not called before useIntv"); + + // Map the curli values from the interval into openli_ + LiveInterval::const_iterator B = curli_->begin(), E = curli_->end(); + LiveInterval::const_iterator I = std::lower_bound(B, E, Start); + + if (I != B) { + --I; + // I begins before Start, but overlaps. 
+ if (I->end > Start)
+ openli_->addRange(LiveRange(Start, std::min(End, I->end),
+ mapValue(I->valno)));
+ ++I;
+ }
+
+ // The remaining ranges begin after Start.
+ for (;I != E && I->start < End; ++I)
+ openli_->addRange(LiveRange(I->start, std::min(End, I->end),
+ mapValue(I->valno)));
+ DEBUG(dbgs() << " use [" << Start << ';' << End << "): " << *openli_
+ << '\n');
+}
+
+/// leaveIntvAfter - Leave openli after the instruction at Idx.
+void SplitEditor::leaveIntvAfter(SlotIndex Idx) {
+ assert(openli_ && "openIntv not called before leaveIntvAfter");
+
+ const LiveRange *CurLR = curli_->getLiveRangeContaining(Idx.getDefIndex());
+ if (!CurLR || CurLR->end <= Idx.getBoundaryIndex()) {
+ DEBUG(dbgs() << " leaveIntvAfter " << Idx << ": not live\n");
+ return;
+ }
+
+ // Was this value of curli live through openli?
+ if (!openli_->liveAt(CurLR->valno->def)) {
+ DEBUG(dbgs() << " leaveIntvAfter " << Idx << ": using external value\n");
+ liveThrough_ = true;
+ return;
+ }
+
+ // We are going to insert a back copy, so we must have a dupli_.
+ LiveRange *DupLR = getDupLI()->getLiveRangeContaining(Idx.getDefIndex());
+ assert(DupLR && "dupli not live into block, but curli is?");
+
+ // Insert the COPY instruction.
+ MachineBasicBlock::iterator I = lis_.getInstructionFromIndex(Idx);
+ MachineInstr *MI = BuildMI(*I->getParent(), llvm::next(I), I->getDebugLoc(),
+ tii_.get(TargetOpcode::COPY), dupli_->reg)
+ .addReg(openli_->reg);
+ SlotIndex CopyIdx = lis_.InsertMachineInstrInMaps(MI).getDefIndex();
+ openli_->addRange(LiveRange(Idx.getDefIndex(), CopyIdx,
+ mapValue(CurLR->valno)));
+ DupLR->valno->def = CopyIdx;
+ DEBUG(dbgs() << " leaveIntvAfter " << Idx << ": " << *openli_ << '\n');
+}
+
+/// leaveIntvAtTop - Leave the interval at the top of MBB.
+/// Currently, only one value can leave the interval.
+void SplitEditor::leaveIntvAtTop(MachineBasicBlock &MBB) {
+ assert(openli_ && "openIntv not called before leaveIntvAtTop");
+
+ SlotIndex Start = lis_.getMBBStartIdx(&MBB);
+ const LiveRange *CurLR = curli_->getLiveRangeContaining(Start);
+
+ // Is curli even live-in to MBB?
+ if (!CurLR) {
+ DEBUG(dbgs() << " leaveIntvAtTop at " << Start << ": not live\n");
+ return;
+ }
+
+ // Is curli defined by PHI at the beginning of MBB?
+ bool isPHIDef = CurLR->valno->isPHIDef() &&
+ CurLR->valno->def.getBaseIndex() == Start;
+
+ // If MBB is using a value of curli that was defined outside the openli range,
+ // we don't want to copy it back here.
+ if (!isPHIDef && !openli_->liveAt(CurLR->valno->def)) {
+ DEBUG(dbgs() << " leaveIntvAtTop at " << Start
+ << ": using external value\n");
+ liveThrough_ = true;
+ return;
+ }
+
+ // We are going to insert a back copy, so we must have a dupli_.
+ LiveRange *DupLR = getDupLI()->getLiveRangeContaining(Start);
+ assert(DupLR && "dupli not live into block, but curli is?");
+
+ // Insert the COPY instruction.
+ MachineInstr *MI = BuildMI(MBB, MBB.begin(), DebugLoc(),
+ tii_.get(TargetOpcode::COPY), dupli_->reg)
+ .addReg(openli_->reg);
+ SlotIndex Idx = lis_.InsertMachineInstrInMaps(MI).getDefIndex();
+
+ // Adjust dupli and openli values.
+ if (isPHIDef) {
+ // dupli was already a PHI on entry to MBB. Simply insert an openli PHI,
+ // and shift the dupli def down to the COPY.
+ VNInfo *VNI = openli_->getNextValue(SlotIndex(Start, true), 0, false, + lis_.getVNInfoAllocator()); + VNI->setIsPHIDef(true); + openli_->addRange(LiveRange(VNI->def, Idx, VNI)); + + dupli_->removeRange(Start, Idx); + DupLR->valno->def = Idx; + DupLR->valno->setIsPHIDef(false); + } else { + // The dupli value was defined somewhere inside the openli range. + DEBUG(dbgs() << " leaveIntvAtTop source value defined at " + << DupLR->valno->def << "\n"); + // FIXME: We may not need a PHI here if all predecessors have the same + // value. + VNInfo *VNI = openli_->getNextValue(SlotIndex(Start, true), 0, false, + lis_.getVNInfoAllocator()); + VNI->setIsPHIDef(true); + openli_->addRange(LiveRange(VNI->def, Idx, VNI)); + + // FIXME: What if DupLR->valno is used by multiple exits? SSA Update. + + // closeIntv is going to remove the superfluous live ranges. + DupLR->valno->def = Idx; + DupLR->valno->setIsPHIDef(false); + } + + DEBUG(dbgs() << " leaveIntvAtTop at " << Idx << ": " << *openli_ << '\n'); +} + +/// closeIntv - Indicate that we are done editing the currently open +/// LiveInterval, and ranges can be trimmed. +void SplitEditor::closeIntv() { + assert(openli_ && "openIntv not called before closeIntv"); + + DEBUG(dbgs() << " closeIntv cleaning up\n"); + DEBUG(dbgs() << " open " << *openli_ << '\n'); + + if (liveThrough_) { + DEBUG(dbgs() << " value live through region, leaving dupli as is.\n"); + } else { + // live out with copies inserted, or killed by region. Either way we need to + // remove the overlapping region from dupli. + getDupLI(); + for (LiveInterval::iterator I = openli_->begin(), E = openli_->end(); + I != E; ++I) { + dupli_->removeRange(I->start, I->end); + } + // FIXME: A block branching to the entry block may also branch elsewhere + // curli is live. We need both openli and curli to be live in that case. + DEBUG(dbgs() << " dup2 " << *dupli_ << '\n'); + } + openli_ = 0; + valueMap_.clear(); +} + +/// rewrite - after all the new live ranges have been created, rewrite +/// instructions using curli to use the new intervals. +void SplitEditor::rewrite() { + assert(!openli_ && "Previous LI not closed before rewrite"); + const LiveInterval *curli = sa_.getCurLI(); + for (MachineRegisterInfo::reg_iterator RI = mri_.reg_begin(curli->reg), + RE = mri_.reg_end(); RI != RE;) { + MachineOperand &MO = RI.getOperand(); + MachineInstr *MI = MO.getParent(); + ++RI; + if (MI->isDebugValue()) { + DEBUG(dbgs() << "Zapping " << *MI); + // FIXME: We can do much better with debug values. + MO.setReg(0); + continue; + } + SlotIndex Idx = lis_.getInstructionIndex(MI); + Idx = MO.isUse() ? Idx.getUseIndex() : Idx.getDefIndex(); + LiveInterval *LI = dupli_; + for (unsigned i = firstInterval, e = intervals_.size(); i != e; ++i) { + LiveInterval *testli = intervals_[i]; + if (testli->liveAt(Idx)) { + LI = testli; + break; + } + } + if (LI) { + MO.setReg(LI->reg); + sa_.removeUse(MI); + DEBUG(dbgs() << " rewrite " << Idx << '\t' << *MI); + } + } + + // dupli_ goes in last, after rewriting. + if (dupli_) { + if (dupli_->empty()) { + DEBUG(dbgs() << " dupli became empty?\n"); + lis_.removeInterval(dupli_->reg); + dupli_ = 0; + } else { + dupli_->RenumberValues(lis_); + intervals_.push_back(dupli_); + } + } + + // Calculate spill weight and allocation hints for new intervals. 
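closeIntv, earlier in this hunk, repeatedly calls removeRange to carve openli's segments out of dupli. One such subtraction, reduced to plain half-open ranges (a sketch under assumed simplified types, not LiveInterval::removeRange itself):

#include <utility>
#include <vector>

using Range = std::pair<unsigned, unsigned>; // half-open [start;end)

// Delete [S;E) from a sorted range list, keeping any left-over pieces.
void removeRange(std::vector<Range> &Ranges, unsigned S, unsigned E) {
  std::vector<Range> Out;
  Out.reserve(Ranges.size() + 1);
  for (const Range &R : Ranges) {
    if (R.second <= S || R.first >= E) {
      Out.push_back(R);                  // No overlap: keep unchanged.
      continue;
    }
    if (R.first < S)
      Out.push_back(Range(R.first, S));  // Keep the piece left of S.
    if (R.second > E)
      Out.push_back(Range(E, R.second)); // Keep the piece right of E.
  }
  Ranges.swap(Out);
}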
+ VirtRegAuxInfo vrai(vrm_.getMachineFunction(), lis_, sa_.loops_); + for (unsigned i = firstInterval, e = intervals_.size(); i != e; ++i) { + LiveInterval &li = *intervals_[i]; + vrai.CalculateRegClass(li.reg); + vrai.CalculateWeightAndHint(li); + DEBUG(dbgs() << " new interval " << mri_.getRegClass(li.reg)->getName() + << ":" << li << '\n'); + } +} + + +//===----------------------------------------------------------------------===// +// Loop Splitting +//===----------------------------------------------------------------------===// + +bool SplitEditor::splitAroundLoop(const MachineLoop *Loop) { + SplitAnalysis::LoopBlocks Blocks; + sa_.getLoopBlocks(Loop, Blocks); + + // Break critical edges as needed. + SplitAnalysis::BlockPtrSet CriticalExits; + sa_.getCriticalExits(Blocks, CriticalExits); + assert(CriticalExits.empty() && "Cannot break critical exits yet"); + + // Create new live interval for the loop. + openIntv(); + + // Insert copies in the predecessors. + for (SplitAnalysis::BlockPtrSet::iterator I = Blocks.Preds.begin(), + E = Blocks.Preds.end(); I != E; ++I) { + MachineBasicBlock &MBB = const_cast<MachineBasicBlock&>(**I); + enterIntvAtEnd(MBB, *Loop->getHeader()); + } + + // Switch all loop blocks. + for (SplitAnalysis::BlockPtrSet::iterator I = Blocks.Loop.begin(), + E = Blocks.Loop.end(); I != E; ++I) + useIntv(**I); + + // Insert back copies in the exit blocks. + for (SplitAnalysis::BlockPtrSet::iterator I = Blocks.Exits.begin(), + E = Blocks.Exits.end(); I != E; ++I) { + MachineBasicBlock &MBB = const_cast<MachineBasicBlock&>(**I); + leaveIntvAtTop(MBB); + } + + // Done. + closeIntv(); + rewrite(); + return dupli_; +} + + +//===----------------------------------------------------------------------===// +// Single Block Splitting +//===----------------------------------------------------------------------===// + +/// splitSingleBlocks - Split curli into a separate live interval inside each +/// basic block in Blocks. Return true if curli has been completely replaced, +/// false if curli is still intact, and needs to be spilled or split further. +bool SplitEditor::splitSingleBlocks(const SplitAnalysis::BlockPtrSet &Blocks) { + DEBUG(dbgs() << " splitSingleBlocks for " << Blocks.size() << " blocks.\n"); + // Determine the first and last instruction using curli in each block. + typedef std::pair<SlotIndex,SlotIndex> IndexPair; + typedef DenseMap<const MachineBasicBlock*,IndexPair> IndexPairMap; + IndexPairMap MBBRange; + for (SplitAnalysis::InstrPtrSet::const_iterator I = sa_.usingInstrs_.begin(), + E = sa_.usingInstrs_.end(); I != E; ++I) { + const MachineBasicBlock *MBB = (*I)->getParent(); + if (!Blocks.count(MBB)) + continue; + SlotIndex Idx = lis_.getInstructionIndex(*I); + DEBUG(dbgs() << " BB#" << MBB->getNumber() << '\t' << Idx << '\t' << **I); + IndexPair &IP = MBBRange[MBB]; + if (!IP.first.isValid() || Idx < IP.first) + IP.first = Idx; + if (!IP.second.isValid() || Idx > IP.second) + IP.second = Idx; + } + + // Create a new interval for each block. 
+ for (SplitAnalysis::BlockPtrSet::const_iterator I = Blocks.begin(),
+ E = Blocks.end(); I != E; ++I) {
+ IndexPair &IP = MBBRange[*I];
+ DEBUG(dbgs() << " splitting for BB#" << (*I)->getNumber() << ": ["
+ << IP.first << ';' << IP.second << ")\n");
+ assert(IP.first.isValid() && IP.second.isValid());
+
+ openIntv();
+ enterIntvBefore(IP.first);
+ useIntv(IP.first.getBaseIndex(), IP.second.getBoundaryIndex());
+ leaveIntvAfter(IP.second);
+ closeIntv();
+ }
+ rewrite();
+ return dupli_;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Sub Block Splitting
+//===----------------------------------------------------------------------===//
+
+/// getBlockForInsideSplit - If curli is contained inside a single basic block,
+/// and it would pay to subdivide the interval inside that block, return it.
+/// Otherwise return NULL. The returned block can be passed to
+/// SplitEditor::splitInsideBlock.
+const MachineBasicBlock *SplitAnalysis::getBlockForInsideSplit() {
+ // The interval must be exclusive to one block.
+ if (usingBlocks_.size() != 1)
+ return 0;
+ // Don't do this for less than 4 instructions. We want to be sure that
+ // splitting actually reduces the instruction count per interval.
+ if (usingInstrs_.size() < 4)
+ return 0;
+ return usingBlocks_.begin()->first;
+}
+
+/// splitInsideBlock - Split curli into multiple intervals inside MBB. Return
+/// true if curli has been completely replaced, false if curli is still
+/// intact, and needs to be spilled or split further.
+bool SplitEditor::splitInsideBlock(const MachineBasicBlock *MBB) {
+ SmallVector<SlotIndex, 32> Uses;
+ Uses.reserve(sa_.usingInstrs_.size());
+ for (SplitAnalysis::InstrPtrSet::const_iterator I = sa_.usingInstrs_.begin(),
+ E = sa_.usingInstrs_.end(); I != E; ++I)
+ if ((*I)->getParent() == MBB)
+ Uses.push_back(lis_.getInstructionIndex(*I));
+ DEBUG(dbgs() << " splitInsideBlock BB#" << MBB->getNumber() << " for "
+ << Uses.size() << " instructions.\n");
+ assert(Uses.size() >= 3 && "Need at least 3 instructions");
+ array_pod_sort(Uses.begin(), Uses.end());
+
+ // Simple algorithm: Find the largest gap between uses as determined by slot
+ // indices. Create new intervals for instructions before the gap and after the
+ // gap.
+ unsigned bestPos = 0;
+ int bestGap = 0;
+ DEBUG(dbgs() << " dist (" << Uses[0]);
+ for (unsigned i = 1, e = Uses.size(); i != e; ++i) {
+ int g = Uses[i-1].distance(Uses[i]);
+ DEBUG(dbgs() << ") -" << g << "- (" << Uses[i]);
+ if (g > bestGap)
+ bestPos = i, bestGap = g;
+ }
+ DEBUG(dbgs() << "), best: -" << bestGap << "-\n");
+
+ // bestPos points to the first use after the best gap.
+ assert(bestPos > 0 && "Invalid gap");
+
+ // FIXME: Don't create intervals for low densities.
+
+ // First interval before the gap. Don't create single-instr intervals.
+ if (bestPos > 1) {
+ openIntv();
+ enterIntvBefore(Uses.front());
+ useIntv(Uses.front().getBaseIndex(), Uses[bestPos-1].getBoundaryIndex());
+ leaveIntvAfter(Uses[bestPos-1]);
+ closeIntv();
+ }
+
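The gap search above is a plain linear scan over the sorted use slots; distilled to integers (a sketch):

#include <cstddef>
#include <vector>

// Return the index of the first use after the widest gap, or 0 if the
// positions never increase.
std::size_t largestGapPos(const std::vector<int> &Uses) {
  std::size_t BestPos = 0;
  int BestGap = 0;
  for (std::size_t i = 1; i < Uses.size(); ++i) {
    int Gap = Uses[i] - Uses[i - 1];
    if (Gap > BestGap) {
      BestPos = i;   // First use after the widest gap seen so far.
      BestGap = Gap;
    }
  }
  return BestPos;
}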
+ // Second interval after the gap.
+ if (bestPos < Uses.size()-1) {
+ openIntv();
+ enterIntvBefore(Uses[bestPos]);
+ useIntv(Uses[bestPos].getBaseIndex(), Uses.back().getBoundaryIndex());
+ leaveIntvAfter(Uses.back());
+ closeIntv();
+ }
+
+ rewrite();
+ return dupli_;
+}
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
new file mode 100644
index 0000000000000..ddef7461dc3d6
--- /dev/null
+++ b/lib/CodeGen/SplitKit.h
@@ -0,0 +1,321 @@
+//===----------- SplitKit.h - Toolkit for splitting live ranges -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SplitAnalysis class as well as mutator functions for
+// live range splitting.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+
+namespace llvm {
+
+class LiveInterval;
+class LiveIntervals;
+class MachineInstr;
+class MachineLoop;
+class MachineLoopInfo;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+class VirtRegMap;
+class VNInfo;
+
+/// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
+/// opportunities.
+class SplitAnalysis {
+public:
+ const MachineFunction &mf_;
+ const LiveIntervals &lis_;
+ const MachineLoopInfo &loops_;
+ const TargetInstrInfo &tii_;
+
+ // Instructions using the current register.
+ typedef SmallPtrSet<const MachineInstr*, 16> InstrPtrSet;
+ InstrPtrSet usingInstrs_;
+
+ // The number of instructions using curli in each basic block.
+ typedef DenseMap<const MachineBasicBlock*, unsigned> BlockCountMap;
+ BlockCountMap usingBlocks_;
+
+ // The number of basic blocks using curli in each loop.
+ typedef DenseMap<const MachineLoop*, unsigned> LoopCountMap;
+ LoopCountMap usingLoops_;
+
+private:
+ // Current live interval.
+ const LiveInterval *curli_;
+
+ // Summarize statistics by counting instructions using curli_.
+ void analyzeUses();
+
+ /// canAnalyzeBranch - Return true if MBB ends in a branch that can be
+ /// analyzed.
+ bool canAnalyzeBranch(const MachineBasicBlock *MBB);
+
+public:
+ SplitAnalysis(const MachineFunction &mf, const LiveIntervals &lis,
+ const MachineLoopInfo &mli);
+
+ /// analyze - set curli to the specified interval, and analyze how it may be
+ /// split.
+ void analyze(const LiveInterval *li);
+
+ /// removeUse - Update statistics by noting that mi no longer uses curli.
+ void removeUse(const MachineInstr *mi);
+
+ const LiveInterval *getCurLI() { return curli_; }
+
+ /// clear - clear all data structures so SplitAnalysis is ready to analyze a
+ /// new interval.
+ void clear();
+
+ typedef SmallPtrSet<const MachineBasicBlock*, 16> BlockPtrSet;
+ typedef SmallPtrSet<const MachineLoop*, 16> LoopPtrSet;
+
+ // Sets of basic blocks surrounding a machine loop.
+ struct LoopBlocks {
+ BlockPtrSet Loop; // Blocks in the loop.
+ BlockPtrSet Preds; // Loop predecessor blocks.
+ BlockPtrSet Exits; // Loop exit blocks.
+
+ void clear() {
+ Loop.clear();
+ Preds.clear();
+ Exits.clear();
+ }
+ };
+
+ // Calculate the block sets surrounding the loop.
+ void getLoopBlocks(const MachineLoop *Loop, LoopBlocks &Blocks);
+
+ /// LoopPeripheralUse - how is a variable used in and around a loop?
+ /// Peripheral blocks are the loop predecessors and exit blocks.
+ enum LoopPeripheralUse {
+ ContainedInLoop, // All uses are inside the loop.
+ SinglePeripheral, // At most one instruction per peripheral block.
+ MultiPeripheral, // Multiple instructions in some peripheral blocks.
+ OutsideLoop // Uses outside loop periphery.
+ };
+
+ /// analyzeLoopPeripheralUse - Return an enum describing how curli_ is used in
+ /// and around the Loop.
+ LoopPeripheralUse analyzeLoopPeripheralUse(const LoopBlocks&);
+
+ /// getCriticalExits - It may be necessary to partially break critical edges
+ /// leaving the loop if an exit block has phi uses of curli. Collect the exit
+ /// blocks that need special treatment into CriticalExits.
+ void getCriticalExits(const LoopBlocks &Blocks, BlockPtrSet &CriticalExits);
+
+ /// canSplitCriticalExits - Return true if it is possible to insert new exit
+ /// blocks before the blocks in CriticalExits.
+ bool canSplitCriticalExits(const LoopBlocks &Blocks,
+ BlockPtrSet &CriticalExits);
+
+ /// getBestSplitLoop - Return the loop where curli may best be split to a
+ /// separate register, or NULL.
+ const MachineLoop *getBestSplitLoop();
+
+ /// getMultiUseBlocks - Add basic blocks to Blocks that may benefit from
+ /// having curli split to a new live interval. Return true if Blocks can be
+ /// passed to SplitEditor::splitSingleBlocks.
+ bool getMultiUseBlocks(BlockPtrSet &Blocks);
+
+ /// getBlockForInsideSplit - If curli is contained inside a single basic block,
+ /// and it would pay to subdivide the interval inside that block, return it.
+ /// Otherwise return NULL. The returned block can be passed to
+ /// SplitEditor::splitInsideBlock.
+ const MachineBasicBlock *getBlockForInsideSplit();
+};
+
+
+/// LiveIntervalMap - Map values from a large LiveInterval into a small
+/// interval that is a subset. Insert phi-def values as needed. This class is
+/// used by SplitEditor to create new smaller LiveIntervals.
+///
+/// parentli_ is the larger interval, li_ is the subset interval. Every value
+/// in li_ corresponds to exactly one value in parentli_, and the live range
+/// of the value is contained within the live range of the parentli_ value.
+/// Values in parentli_ may map to any number of openli_ values, including 0.
+class LiveIntervalMap {
+ LiveIntervals &lis_;
+
+ // The parent interval is never changed.
+ const LiveInterval &parentli_;
+
+ // The child interval's values are fully contained inside parentli_ values.
+ LiveInterval &li_;
+
+ typedef DenseMap<const VNInfo*, VNInfo*> ValueMap;
+
+ // Map parentli_ values to simple values in li_ that are defined at the same
+ // SlotIndex, or NULL for parentli_ values that have complex li_ defs.
+ // Note there is a difference between values mapping to NULL (complex), and
+ // values not present (unknown/unmapped).
+ ValueMap valueMap_;
+
+ // extendTo - Find the last li_ value defined in MBB at or before Idx. The
+ // parentli_ is assumed to be live at Idx. Extend the live range to Idx.
+ // Return the found VNInfo, or NULL.
+ VNInfo *extendTo(MachineBasicBlock *MBB, SlotIndex Idx);
+
+ // addSimpleRange - Add a simple range from parentli_ to li_.
+ // ParentVNI must be live in the [Start;End) interval.
+ void addSimpleRange(SlotIndex Start, SlotIndex End, const VNInfo *ParentVNI);
+
+public:
+ LiveIntervalMap(LiveIntervals &lis,
+ const LiveInterval &parentli,
+ LiveInterval &li)
+ : lis_(lis), parentli_(parentli), li_(li) {}
+
+ /// defValue - define a value in li_ from the parentli_ value VNI and Idx.
+ /// Idx does not have to be ParentVNI->def, but it must be contained within
+ /// ParentVNI's live range in parentli_.
+ /// Return the new li_ value.
+ VNInfo *defValue(const VNInfo *ParentVNI, SlotIndex Idx);
+
+ /// mapValue - map ParentVNI to the corresponding li_ value at Idx. It is
+ /// assumed that ParentVNI is live at Idx.
+ /// If ParentVNI has not been defined by defValue, it is assumed that
+ /// ParentVNI->def dominates Idx.
+ /// If ParentVNI has been defined by defValue one or more times, a value that
+ /// dominates Idx will be returned. This may require creating extra phi-def
+ /// values and adding live ranges to li_.
+ VNInfo *mapValue(const VNInfo *ParentVNI, SlotIndex Idx);
+
+ /// addRange - Add live ranges to li_ where [Start;End) intersects parentli_.
+ /// All needed values whose def is not inside [Start;End) must be defined
+ /// beforehand so mapValue will work.
+ void addRange(SlotIndex Start, SlotIndex End);
+};
+
+
+/// SplitEditor - Edit machine code and LiveIntervals for live range
+/// splitting.
+///
+/// - Create a SplitEditor from a SplitAnalysis.
+/// - Start a new live interval with openIntv.
+/// - Mark the places where the new interval is entered using enterIntv*
+/// - Mark the ranges where the new interval is used with useIntv*
+/// - Mark the places where the interval is exited with exitIntv*.
+/// - Finish the current interval with closeIntv and repeat from 2.
+/// - Rewrite instructions with rewrite().
+///
+class SplitEditor {
+ SplitAnalysis &sa_;
+ LiveIntervals &lis_;
+ VirtRegMap &vrm_;
+ MachineRegisterInfo &mri_;
+ const TargetInstrInfo &tii_;
+
+ /// curli_ - The immutable interval we are currently splitting.
+ const LiveInterval *const curli_;
+
+ /// dupli_ - Created as a copy of curli_, ranges are carved out as new
+ /// intervals get added through openIntv / closeIntv. This is used to avoid
+ /// editing curli_.
+ LiveInterval *dupli_;
+
+ /// Currently open LiveInterval.
+ LiveInterval *openli_;
+
+ /// createInterval - Create a new virtual register and LiveInterval with same
+ /// register class and spill slot as curli.
+ LiveInterval *createInterval();
+
+ /// getDupLI - Ensure dupli is created and return it.
+ LiveInterval *getDupLI();
+
+ /// valueMap_ - Map values in curli to values in openli. These are direct 1-1
+ /// mappings, and do not include values created by inserted copies.
+ DenseMap<const VNInfo*, VNInfo*> valueMap_;
+
+ /// mapValue - Return the openIntv value that corresponds to the given curli
+ /// value.
+ VNInfo *mapValue(const VNInfo *curliVNI);
+
+ /// A dupli value is live through openIntv.
+ bool liveThrough_;
+
+ /// All the new intervals created for this split are added to intervals_.
+ SmallVectorImpl<LiveInterval*> &intervals_;
+
+ /// The index into intervals_ of the first interval we added. There may be
+ /// others from before we got it.
+ unsigned firstInterval;
+
+ /// Insert a COPY instruction curli -> li. Allocate a new value from li
+ /// defined by the COPY.
+ VNInfo *insertCopy(LiveInterval &LI,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I);
+
+public:
+ /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
+ /// Newly created intervals will be appended to newIntervals.
+ SplitEditor(SplitAnalysis &SA, LiveIntervals&, VirtRegMap&,
+ SmallVectorImpl<LiveInterval*> &newIntervals);
+
+ /// getAnalysis - Get the corresponding analysis.
+ SplitAnalysis &getAnalysis() { return sa_; }
+
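The enter/use/leave calls below are driven in the sequence the class comment above lists. A minimal sketch of one single-block split, mirroring what splitSingleBlocks does per block (FirstUse and LastUse are assumed to be the block-local use bounds, computed by the caller):

#include "SplitKit.h"
using namespace llvm;

void splitOneBlock(SplitEditor &SE, SlotIndex FirstUse, SlotIndex LastUse) {
  SE.openIntv();                        // Start a new live interval.
  SE.enterIntvBefore(FirstUse);         // Copy in before the first use.
  SE.useIntv(FirstUse.getBaseIndex(),   // Claim the block-local range.
             LastUse.getBoundaryIndex());
  SE.leaveIntvAfter(LastUse);           // Copy back out after the last use.
  SE.closeIntv();                       // Trim and finish this interval.
}

+ /// Create a new virtual register and live interval.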
+ void openIntv(); + + /// enterIntvBefore - Enter openli before the instruction at Idx. If curli is + /// not live before Idx, a COPY is not inserted. + void enterIntvBefore(SlotIndex Idx); + + /// enterIntvAtEnd - Enter openli at the end of MBB. + /// PhiMBB is a successor inside openli where a PHI value is created. + /// Currently, all entries must share the same PhiMBB. + void enterIntvAtEnd(MachineBasicBlock &MBB, MachineBasicBlock &PhiMBB); + + /// useIntv - indicate that all instructions in MBB should use openli. + void useIntv(const MachineBasicBlock &MBB); + + /// useIntv - indicate that all instructions in range should use openli. + void useIntv(SlotIndex Start, SlotIndex End); + + /// leaveIntvAfter - Leave openli after the instruction at Idx. + void leaveIntvAfter(SlotIndex Idx); + + /// leaveIntvAtTop - Leave the interval at the top of MBB. + /// Currently, only one value can leave the interval. + void leaveIntvAtTop(MachineBasicBlock &MBB); + + /// closeIntv - Indicate that we are done editing the currently open + /// LiveInterval, and ranges can be trimmed. + void closeIntv(); + + /// rewrite - after all the new live ranges have been created, rewrite + /// instructions using curli to use the new intervals. + void rewrite(); + + // ===--- High level methods ---=== + + /// splitAroundLoop - Split curli into a separate live interval inside + /// the loop. Return true if curli has been completely replaced, false if + /// curli is still intact, and needs to be spilled or split further. + bool splitAroundLoop(const MachineLoop*); + + /// splitSingleBlocks - Split curli into a separate live interval inside each + /// basic block in Blocks. Return true if curli has been completely replaced, + /// false if curli is still intact, and needs to be spilled or split further. + bool splitSingleBlocks(const SplitAnalysis::BlockPtrSet &Blocks); + + /// splitInsideBlock - Split curli into multiple intervals inside MBB. Return + /// true if curli has been completely replaced, false if curli is still + /// intact, and needs to be spilled or split further. + bool splitInsideBlock(const MachineBasicBlock *); +}; + +} diff --git a/lib/CodeGen/Splitter.cpp b/lib/CodeGen/Splitter.cpp new file mode 100644 index 0000000000000..38f3b1f4d35ea --- /dev/null +++ b/lib/CodeGen/Splitter.cpp @@ -0,0 +1,817 @@ +//===-- llvm/CodeGen/Splitter.cpp - Splitter -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loopsplitter" + +#include "Splitter.h" + +#include "SimpleRegisterCoalescing.h" +#include "llvm/Module.h" +#include "llvm/CodeGen/CalcSpillWeights.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +char LoopSplitter::ID = 0; +INITIALIZE_PASS(LoopSplitter, "loop-splitting", + "Split virtual regists across loop boundaries.", false, false); + +namespace llvm { + + class StartSlotComparator { + public: + StartSlotComparator(LiveIntervals &lis) : lis(lis) {} + bool operator()(const MachineBasicBlock *mbb1, + const MachineBasicBlock *mbb2) const { + return lis.getMBBStartIdx(mbb1) < lis.getMBBStartIdx(mbb2); + } + private: + LiveIntervals &lis; + }; + + class LoopSplit { + public: + LoopSplit(LoopSplitter &ls, LiveInterval &li, MachineLoop &loop) + : ls(ls), li(li), loop(loop), valid(true), inSplit(false), newLI(0) { + assert(TargetRegisterInfo::isVirtualRegister(li.reg) && + "Cannot split physical registers."); + } + + LiveInterval& getLI() const { return li; } + + MachineLoop& getLoop() const { return loop; } + + bool isValid() const { return valid; } + + bool isWorthwhile() const { return valid && (inSplit || !outSplits.empty()); } + + void invalidate() { valid = false; } + + void splitIncoming() { inSplit = true; } + + void splitOutgoing(MachineLoop::Edge &edge) { outSplits.insert(edge); } + + void addLoopInstr(MachineInstr *i) { loopInstrs.push_back(i); } + + void apply() { + assert(valid && "Attempt to apply invalid split."); + applyIncoming(); + applyOutgoing(); + copyRanges(); + renameInside(); + } + + private: + LoopSplitter &ls; + LiveInterval &li; + MachineLoop &loop; + bool valid, inSplit; + std::set<MachineLoop::Edge> outSplits; + std::vector<MachineInstr*> loopInstrs; + + LiveInterval *newLI; + std::map<VNInfo*, VNInfo*> vniMap; + + LiveInterval* getNewLI() { + if (newLI == 0) { + const TargetRegisterClass *trc = ls.mri->getRegClass(li.reg); + unsigned vreg = ls.mri->createVirtualRegister(trc); + newLI = &ls.lis->getOrCreateInterval(vreg); + } + return newLI; + } + + VNInfo* getNewVNI(VNInfo *oldVNI) { + VNInfo *newVNI = vniMap[oldVNI]; + + if (newVNI == 0) { + newVNI = getNewLI()->createValueCopy(oldVNI, + ls.lis->getVNInfoAllocator()); + vniMap[oldVNI] = newVNI; + } + + return newVNI; + } + + void applyIncoming() { + if (!inSplit) { + return; + } + + MachineBasicBlock *preHeader = loop.getLoopPreheader(); + if (preHeader == 0) { + assert(ls.canInsertPreHeader(loop) && + "Can't insert required preheader."); + preHeader = &ls.insertPreHeader(loop); + } + + LiveRange *preHeaderRange = + ls.lis->findExitingRange(li, preHeader); + assert(preHeaderRange != 0 && "Range not live into preheader."); + + // Insert the new copy. 
+ MachineInstr *copy = BuildMI(*preHeader, + preHeader->getFirstTerminator(), + DebugLoc(), + ls.tii->get(TargetOpcode::COPY)) + .addReg(getNewLI()->reg, RegState::Define) + .addReg(li.reg, RegState::Kill); + + ls.lis->InsertMachineInstrInMaps(copy); + + SlotIndex copyDefIdx = ls.lis->getInstructionIndex(copy).getDefIndex(); + + VNInfo *newVal = getNewVNI(preHeaderRange->valno); + newVal->def = copyDefIdx; + newVal->setCopy(copy); + newVal->setIsDefAccurate(true); + li.removeRange(copyDefIdx, ls.lis->getMBBEndIdx(preHeader), true); + + getNewLI()->addRange(LiveRange(copyDefIdx, + ls.lis->getMBBEndIdx(preHeader), + newVal)); + } + + void applyOutgoing() { + + for (std::set<MachineLoop::Edge>::iterator osItr = outSplits.begin(), + osEnd = outSplits.end(); + osItr != osEnd; ++osItr) { + MachineLoop::Edge edge = *osItr; + MachineBasicBlock *outBlock = edge.second; + if (ls.isCriticalEdge(edge)) { + assert(ls.canSplitEdge(edge) && "Unsplitable critical edge."); + outBlock = &ls.splitEdge(edge, loop); + } + LiveRange *outRange = ls.lis->findEnteringRange(li, outBlock); + assert(outRange != 0 && "No exiting range?"); + + MachineInstr *copy = BuildMI(*outBlock, outBlock->begin(), + DebugLoc(), + ls.tii->get(TargetOpcode::COPY)) + .addReg(li.reg, RegState::Define) + .addReg(getNewLI()->reg, RegState::Kill); + + ls.lis->InsertMachineInstrInMaps(copy); + + SlotIndex copyDefIdx = ls.lis->getInstructionIndex(copy).getDefIndex(); + + // Blow away output range definition. + outRange->valno->def = ls.lis->getInvalidIndex(); + outRange->valno->setIsDefAccurate(false); + li.removeRange(ls.lis->getMBBStartIdx(outBlock), copyDefIdx); + + VNInfo *newVal = + getNewLI()->getNextValue(SlotIndex(ls.lis->getMBBStartIdx(outBlock), + true), + 0, false, ls.lis->getVNInfoAllocator()); + + getNewLI()->addRange(LiveRange(ls.lis->getMBBStartIdx(outBlock), + copyDefIdx, newVal)); + + } + } + + void copyRange(LiveRange &lr) { + std::pair<bool, LoopSplitter::SlotPair> lsr = + ls.getLoopSubRange(lr, loop); + + if (!lsr.first) + return; + + LiveRange loopRange(lsr.second.first, lsr.second.second, + getNewVNI(lr.valno)); + + li.removeRange(loopRange.start, loopRange.end, true); + + getNewLI()->addRange(loopRange); + } + + void copyRanges() { + for (std::vector<MachineInstr*>::iterator iItr = loopInstrs.begin(), + iEnd = loopInstrs.end(); + iItr != iEnd; ++iItr) { + MachineInstr &instr = **iItr; + SlotIndex instrIdx = ls.lis->getInstructionIndex(&instr); + if (instr.modifiesRegister(li.reg, 0)) { + LiveRange *defRange = + li.getLiveRangeContaining(instrIdx.getDefIndex()); + if (defRange != 0) // May have caught this already. + copyRange(*defRange); + } + if (instr.readsRegister(li.reg, 0)) { + LiveRange *useRange = + li.getLiveRangeContaining(instrIdx.getUseIndex()); + if (useRange != 0) { // May have caught this already. 
+ copyRange(*useRange); + } + } + } + + for (MachineLoop::block_iterator bbItr = loop.block_begin(), + bbEnd = loop.block_end(); + bbItr != bbEnd; ++bbItr) { + MachineBasicBlock &loopBlock = **bbItr; + LiveRange *enteringRange = + ls.lis->findEnteringRange(li, &loopBlock); + if (enteringRange != 0) { + copyRange(*enteringRange); + } + } + } + + void renameInside() { + for (std::vector<MachineInstr*>::iterator iItr = loopInstrs.begin(), + iEnd = loopInstrs.end(); + iItr != iEnd; ++iItr) { + MachineInstr &instr = **iItr; + for (unsigned i = 0; i < instr.getNumOperands(); ++i) { + MachineOperand &mop = instr.getOperand(i); + if (mop.isReg() && mop.getReg() == li.reg) { + mop.setReg(getNewLI()->reg); + } + } + } + } + + }; + + void LoopSplitter::getAnalysisUsage(AnalysisUsage &au) const { + au.addRequired<MachineDominatorTree>(); + au.addPreserved<MachineDominatorTree>(); + au.addRequired<MachineLoopInfo>(); + au.addPreserved<MachineLoopInfo>(); + au.addPreserved<RegisterCoalescer>(); + au.addPreserved<CalculateSpillWeights>(); + au.addPreserved<LiveStacks>(); + au.addRequired<SlotIndexes>(); + au.addPreserved<SlotIndexes>(); + au.addRequired<LiveIntervals>(); + au.addPreserved<LiveIntervals>(); + MachineFunctionPass::getAnalysisUsage(au); + } + + bool LoopSplitter::runOnMachineFunction(MachineFunction &fn) { + + mf = &fn; + mri = &mf->getRegInfo(); + tii = mf->getTarget().getInstrInfo(); + tri = mf->getTarget().getRegisterInfo(); + sis = &getAnalysis<SlotIndexes>(); + lis = &getAnalysis<LiveIntervals>(); + mli = &getAnalysis<MachineLoopInfo>(); + mdt = &getAnalysis<MachineDominatorTree>(); + + fqn = mf->getFunction()->getParent()->getModuleIdentifier() + "." + + mf->getFunction()->getName().str(); + + dbgs() << "Splitting " << mf->getFunction()->getName() << "."; + + dumpOddTerminators(); + +// dbgs() << "----------------------------------------\n"; +// lis->dump(); +// dbgs() << "----------------------------------------\n"; + +// std::deque<MachineLoop*> loops; +// std::copy(mli->begin(), mli->end(), std::back_inserter(loops)); +// dbgs() << "Loops:\n"; +// while (!loops.empty()) { +// MachineLoop &loop = *loops.front(); +// loops.pop_front(); +// std::copy(loop.begin(), loop.end(), std::back_inserter(loops)); + +// dumpLoopInfo(loop); +// } + + //lis->dump(); + //exit(0); + + // Setup initial intervals. 
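renameInside, earlier in this hunk, is a straightforward operand walk. On assumed stand-in types it amounts to the following (a sketch, not the pass's own code):

#include <vector>

struct Operand { bool IsReg; unsigned Reg; }; // stand-in for MachineOperand
struct Instr { std::vector<Operand> Ops; };   // stand-in for MachineInstr

// Point every operand that referenced OldReg at NewReg; this is all that
// remains once the loop-local register and its live ranges exist.
void renameUses(std::vector<Instr*> &LoopInstrs, unsigned OldReg,
                unsigned NewReg) {
  for (Instr *I : LoopInstrs)
    for (Operand &Op : I->Ops)
      if (Op.IsReg && Op.Reg == OldReg)
        Op.Reg = NewReg;
}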
+ for (LiveIntervals::iterator liItr = lis->begin(), liEnd = lis->end(); + liItr != liEnd; ++liItr) { + LiveInterval *li = liItr->second; + + if (TargetRegisterInfo::isVirtualRegister(li->reg) && + !lis->intervalIsInOneMBB(*li)) { + intervals.push_back(li); + } + } + + processIntervals(); + + intervals.clear(); + +// dbgs() << "----------------------------------------\n"; +// lis->dump(); +// dbgs() << "----------------------------------------\n"; + + dumpOddTerminators(); + + //exit(1); + + return false; + } + + void LoopSplitter::releaseMemory() { + fqn.clear(); + intervals.clear(); + loopRangeMap.clear(); + } + + void LoopSplitter::dumpOddTerminators() { + for (MachineFunction::iterator bbItr = mf->begin(), bbEnd = mf->end(); + bbItr != bbEnd; ++bbItr) { + MachineBasicBlock *mbb = &*bbItr; + MachineBasicBlock *a = 0, *b = 0; + SmallVector<MachineOperand, 4> c; + if (tii->AnalyzeBranch(*mbb, a, b, c)) { + dbgs() << "MBB#" << mbb->getNumber() << " has multiway terminator.\n"; + dbgs() << " Terminators:\n"; + for (MachineBasicBlock::iterator iItr = mbb->begin(), iEnd = mbb->end(); + iItr != iEnd; ++iItr) { + MachineInstr *instr= &*iItr; + dbgs() << " " << *instr << ""; + } + dbgs() << "\n Listed successors: [ "; + for (MachineBasicBlock::succ_iterator sItr = mbb->succ_begin(), sEnd = mbb->succ_end(); + sItr != sEnd; ++sItr) { + MachineBasicBlock *succMBB = *sItr; + dbgs() << succMBB->getNumber() << " "; + } + dbgs() << "]\n\n"; + } + } + } + + void LoopSplitter::dumpLoopInfo(MachineLoop &loop) { + MachineBasicBlock &headerBlock = *loop.getHeader(); + typedef SmallVector<MachineLoop::Edge, 8> ExitEdgesList; + ExitEdgesList exitEdges; + loop.getExitEdges(exitEdges); + + dbgs() << " Header: BB#" << headerBlock.getNumber() << ", Contains: [ "; + for (std::vector<MachineBasicBlock*>::const_iterator + subBlockItr = loop.getBlocks().begin(), + subBlockEnd = loop.getBlocks().end(); + subBlockItr != subBlockEnd; ++subBlockItr) { + MachineBasicBlock &subBlock = **subBlockItr; + dbgs() << "BB#" << subBlock.getNumber() << " "; + } + dbgs() << "], Exit edges: [ "; + for (ExitEdgesList::iterator exitEdgeItr = exitEdges.begin(), + exitEdgeEnd = exitEdges.end(); + exitEdgeItr != exitEdgeEnd; ++exitEdgeItr) { + MachineLoop::Edge &exitEdge = *exitEdgeItr; + dbgs() << "(MBB#" << exitEdge.first->getNumber() + << ", MBB#" << exitEdge.second->getNumber() << ") "; + } + dbgs() << "], Sub-Loop Headers: [ "; + for (MachineLoop::iterator subLoopItr = loop.begin(), + subLoopEnd = loop.end(); + subLoopItr != subLoopEnd; ++subLoopItr) { + MachineLoop &subLoop = **subLoopItr; + MachineBasicBlock &subLoopBlock = *subLoop.getHeader(); + dbgs() << "BB#" << subLoopBlock.getNumber() << " "; + } + dbgs() << "]\n"; + } + + void LoopSplitter::updateTerminators(MachineBasicBlock &mbb) { + mbb.updateTerminator(); + + for (MachineBasicBlock::iterator miItr = mbb.begin(), miEnd = mbb.end(); + miItr != miEnd; ++miItr) { + if (lis->isNotInMIMap(miItr)) { + lis->InsertMachineInstrInMaps(miItr); + } + } + } + + bool LoopSplitter::canInsertPreHeader(MachineLoop &loop) { + MachineBasicBlock *header = loop.getHeader(); + MachineBasicBlock *a = 0, *b = 0; + SmallVector<MachineOperand, 4> c; + + for (MachineBasicBlock::pred_iterator pbItr = header->pred_begin(), + pbEnd = header->pred_end(); + pbItr != pbEnd; ++pbItr) { + MachineBasicBlock *predBlock = *pbItr; + if (!!tii->AnalyzeBranch(*predBlock, a, b, c)) { + return false; + } + } + + MachineFunction::iterator headerItr(header); + if (headerItr == mf->begin()) + return true; + 
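canInsertPreHeader above only vets the branches; insertPreHeader, which follows, does the rewiring: every edge into the header that does not come from inside the loop is redirected to a fresh block that falls through to the header. The rewiring step on a toy CFG (assumed simplified types, not the MachineFunction API):

#include <cstddef>
#include <set>
#include <vector>

struct Block { std::vector<int> succs; }; // block i's successor list

int insertPreheader(std::vector<Block> &CFG, int Header,
                    const std::set<int> &LoopBlocks) {
  int Pre = static_cast<int>(CFG.size());
  CFG.push_back(Block());                // New block falling through...
  CFG.back().succs.push_back(Header);    // ...to the loop header.
  for (int B = 0; B != Pre; ++B) {
    if (LoopBlocks.count(B))
      continue;                          // Keep back edges intact.
    for (std::size_t i = 0; i != CFG[B].succs.size(); ++i)
      if (CFG[B].succs[i] == Header)
        CFG[B].succs[i] = Pre;           // Edge now enters the preheader.
  }
  return Pre;
}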
MachineBasicBlock *headerLayoutPred = llvm::prior(headerItr); + assert(headerLayoutPred != 0 && "Header should have layout pred."); + + return (!tii->AnalyzeBranch(*headerLayoutPred, a, b, c)); + } + + MachineBasicBlock& LoopSplitter::insertPreHeader(MachineLoop &loop) { + assert(loop.getLoopPreheader() == 0 && "Loop already has preheader."); + + MachineBasicBlock &header = *loop.getHeader(); + + // Save the preds - we'll need to update them once we insert the preheader. + typedef std::set<MachineBasicBlock*> HeaderPreds; + HeaderPreds headerPreds; + + for (MachineBasicBlock::pred_iterator predItr = header.pred_begin(), + predEnd = header.pred_end(); + predItr != predEnd; ++predItr) { + if (!loop.contains(*predItr)) + headerPreds.insert(*predItr); + } + + assert(!headerPreds.empty() && "No predecessors for header?"); + + //dbgs() << fqn << " MBB#" << header.getNumber() << " inserting preheader..."; + + MachineBasicBlock *preHeader = + mf->CreateMachineBasicBlock(header.getBasicBlock()); + + assert(preHeader != 0 && "Failed to create pre-header."); + + mf->insert(header, preHeader); + + for (HeaderPreds::iterator hpItr = headerPreds.begin(), + hpEnd = headerPreds.end(); + hpItr != hpEnd; ++hpItr) { + assert(*hpItr != 0 && "How'd a null predecessor get into this set?"); + MachineBasicBlock &hp = **hpItr; + hp.ReplaceUsesOfBlockWith(&header, preHeader); + } + preHeader->addSuccessor(&header); + + MachineBasicBlock *oldLayoutPred = + llvm::prior(MachineFunction::iterator(preHeader)); + if (oldLayoutPred != 0) { + updateTerminators(*oldLayoutPred); + } + + lis->InsertMBBInMaps(preHeader); + + if (MachineLoop *parentLoop = loop.getParentLoop()) { + assert(parentLoop->getHeader() != loop.getHeader() && + "Parent loop has same header?"); + parentLoop->addBasicBlockToLoop(preHeader, mli->getBase()); + + // Invalidate all parent loop ranges. + while (parentLoop != 0) { + loopRangeMap.erase(parentLoop); + parentLoop = parentLoop->getParentLoop(); + } + } + + for (LiveIntervals::iterator liItr = lis->begin(), + liEnd = lis->end(); + liItr != liEnd; ++liItr) { + LiveInterval &li = *liItr->second; + + // Is this safe for physregs? + // TargetRegisterInfo::isPhysicalRegister(li.reg) || + if (!lis->isLiveInToMBB(li, &header)) + continue; + + if (lis->isLiveInToMBB(li, preHeader)) { + assert(lis->isLiveOutOfMBB(li, preHeader) && + "Range terminates in newly added preheader?"); + continue; + } + + bool insertRange = false; + + for (MachineBasicBlock::pred_iterator predItr = preHeader->pred_begin(), + predEnd = preHeader->pred_end(); + predItr != predEnd; ++predItr) { + MachineBasicBlock *predMBB = *predItr; + if (lis->isLiveOutOfMBB(li, predMBB)) { + insertRange = true; + break; + } + } + + if (!insertRange) + continue; + + VNInfo *newVal = li.getNextValue(lis->getMBBStartIdx(preHeader), + 0, false, lis->getVNInfoAllocator()); + li.addRange(LiveRange(lis->getMBBStartIdx(preHeader), + lis->getMBBEndIdx(preHeader), + newVal)); + } + + + //dbgs() << "Dumping SlotIndexes:\n"; + //sis->dump(); + + //dbgs() << "done. 
(Added MBB#" << preHeader->getNumber() << ")\n"; + + return *preHeader; + } + + bool LoopSplitter::isCriticalEdge(MachineLoop::Edge &edge) { + assert(edge.first->succ_size() > 1 && "Non-sensical edge."); + if (edge.second->pred_size() > 1) + return true; + return false; + } + + bool LoopSplitter::canSplitEdge(MachineLoop::Edge &edge) { + MachineFunction::iterator outBlockItr(edge.second); + if (outBlockItr == mf->begin()) + return true; + MachineBasicBlock *outBlockLayoutPred = llvm::prior(outBlockItr); + assert(outBlockLayoutPred != 0 && "Should have a layout pred if out!=begin."); + MachineBasicBlock *a = 0, *b = 0; + SmallVector<MachineOperand, 4> c; + return (!tii->AnalyzeBranch(*outBlockLayoutPred, a, b, c) && + !tii->AnalyzeBranch(*edge.first, a, b, c)); + } + + MachineBasicBlock& LoopSplitter::splitEdge(MachineLoop::Edge &edge, + MachineLoop &loop) { + + MachineBasicBlock &inBlock = *edge.first; + MachineBasicBlock &outBlock = *edge.second; + + assert((inBlock.succ_size() > 1) && (outBlock.pred_size() > 1) && + "Splitting non-critical edge?"); + + //dbgs() << fqn << " Splitting edge (MBB#" << inBlock.getNumber() + // << " -> MBB#" << outBlock.getNumber() << ")..."; + + MachineBasicBlock *splitBlock = + mf->CreateMachineBasicBlock(); + + assert(splitBlock != 0 && "Failed to create split block."); + + mf->insert(&outBlock, splitBlock); + + inBlock.ReplaceUsesOfBlockWith(&outBlock, splitBlock); + splitBlock->addSuccessor(&outBlock); + + MachineBasicBlock *oldLayoutPred = + llvm::prior(MachineFunction::iterator(splitBlock)); + if (oldLayoutPred != 0) { + updateTerminators(*oldLayoutPred); + } + + lis->InsertMBBInMaps(splitBlock); + + loopRangeMap.erase(&loop); + + MachineLoop *splitParentLoop = loop.getParentLoop(); + while (splitParentLoop != 0 && + !splitParentLoop->contains(&outBlock)) { + splitParentLoop = splitParentLoop->getParentLoop(); + } + + if (splitParentLoop != 0) { + assert(splitParentLoop->contains(&loop) && + "Split-block parent doesn't contain original loop?"); + splitParentLoop->addBasicBlockToLoop(splitBlock, mli->getBase()); + + // Invalidate all parent loop ranges. + while (splitParentLoop != 0) { + loopRangeMap.erase(splitParentLoop); + splitParentLoop = splitParentLoop->getParentLoop(); + } + } + + + for (LiveIntervals::iterator liItr = lis->begin(), + liEnd = lis->end(); + liItr != liEnd; ++liItr) { + LiveInterval &li = *liItr->second; + bool intersects = lis->isLiveOutOfMBB(li, &inBlock) && + lis->isLiveInToMBB(li, &outBlock); + if (lis->isLiveInToMBB(li, splitBlock)) { + if (!intersects) { + li.removeRange(lis->getMBBStartIdx(splitBlock), + lis->getMBBEndIdx(splitBlock), true); + } + } else if (intersects) { + VNInfo *newVal = li.getNextValue(lis->getMBBStartIdx(splitBlock), + 0, false, lis->getVNInfoAllocator()); + li.addRange(LiveRange(lis->getMBBStartIdx(splitBlock), + lis->getMBBEndIdx(splitBlock), + newVal)); + } + } + + //dbgs() << "done. 
(Added MBB#" << splitBlock->getNumber() << ")\n"; + + return *splitBlock; + } + + LoopSplitter::LoopRanges& LoopSplitter::getLoopRanges(MachineLoop &loop) { + typedef std::set<MachineBasicBlock*, StartSlotComparator> LoopMBBSet; + LoopRangeMap::iterator lrItr = loopRangeMap.find(&loop); + if (lrItr == loopRangeMap.end()) { + LoopMBBSet loopMBBs((StartSlotComparator(*lis))); + std::copy(loop.block_begin(), loop.block_end(), + std::inserter(loopMBBs, loopMBBs.begin())); + + assert(!loopMBBs.empty() && "No blocks in loop?"); + + LoopRanges &loopRanges = loopRangeMap[&loop]; + assert(loopRanges.empty() && "Loop encountered but not processed?"); + SlotIndex oldEnd = lis->getMBBEndIdx(*loopMBBs.begin()); + loopRanges.push_back( + std::make_pair(lis->getMBBStartIdx(*loopMBBs.begin()), + lis->getInvalidIndex())); + for (LoopMBBSet::iterator curBlockItr = llvm::next(loopMBBs.begin()), + curBlockEnd = loopMBBs.end(); + curBlockItr != curBlockEnd; ++curBlockItr) { + SlotIndex newStart = lis->getMBBStartIdx(*curBlockItr); + if (newStart != oldEnd) { + loopRanges.back().second = oldEnd; + loopRanges.push_back(std::make_pair(newStart, + lis->getInvalidIndex())); + } + oldEnd = lis->getMBBEndIdx(*curBlockItr); + } + + loopRanges.back().second = + lis->getMBBEndIdx(*llvm::prior(loopMBBs.end())); + + return loopRanges; + } + return lrItr->second; + } + + std::pair<bool, LoopSplitter::SlotPair> LoopSplitter::getLoopSubRange( + const LiveRange &lr, + MachineLoop &loop) { + LoopRanges &loopRanges = getLoopRanges(loop); + LoopRanges::iterator lrItr = loopRanges.begin(), + lrEnd = loopRanges.end(); + while (lrItr != lrEnd && lr.start >= lrItr->second) { + ++lrItr; + } + + if (lrItr == lrEnd) { + SlotIndex invalid = lis->getInvalidIndex(); + return std::make_pair(false, SlotPair(invalid, invalid)); + } + + SlotIndex srStart(lr.start < lrItr->first ? lrItr->first : lr.start); + SlotIndex srEnd(lr.end > lrItr->second ? lrItr->second : lr.end); + + return std::make_pair(true, SlotPair(srStart, srEnd)); + } + + void LoopSplitter::dumpLoopRanges(MachineLoop &loop) { + LoopRanges &loopRanges = getLoopRanges(loop); + dbgs() << "For loop MBB#" << loop.getHeader()->getNumber() << ", subranges are: [ "; + for (LoopRanges::iterator lrItr = loopRanges.begin(), lrEnd = loopRanges.end(); + lrItr != lrEnd; ++lrItr) { + dbgs() << "[" << lrItr->first << ", " << lrItr->second << ") "; + } + dbgs() << "]\n"; + } + + void LoopSplitter::processHeader(LoopSplit &split) { + MachineBasicBlock &header = *split.getLoop().getHeader(); + //dbgs() << " Processing loop header BB#" << header.getNumber() << "\n"; + + if (!lis->isLiveInToMBB(split.getLI(), &header)) + return; // Not live in, but nothing wrong so far. + + MachineBasicBlock *preHeader = split.getLoop().getLoopPreheader(); + if (!preHeader) { + + if (!canInsertPreHeader(split.getLoop())) { + split.invalidate(); + return; // Couldn't insert a pre-header. Bail on this interval. 
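getLoopRanges, earlier in this hunk, sorts the loop blocks by layout order and then merges blocks that abut in the slot index space into maximal ranges. The merge step on integer pairs (a sketch; the input is assumed sorted by start, as the StartSlotComparator guarantees):

#include <utility>
#include <vector>

using SlotPair = std::pair<int, int>; // [start;end)

std::vector<SlotPair> coalesce(const std::vector<SlotPair> &Blocks) {
  std::vector<SlotPair> Out;
  for (const SlotPair &B : Blocks) {
    if (!Out.empty() && Out.back().second == B.first)
      Out.back().second = B.second; // Adjacent in layout: extend the range.
    else
      Out.push_back(B);             // Gap in the layout: start a new range.
  }
  return Out;
}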
+ } + + for (MachineBasicBlock::pred_iterator predItr = header.pred_begin(), + predEnd = header.pred_end(); + predItr != predEnd; ++predItr) { + if (lis->isLiveOutOfMBB(split.getLI(), *predItr)) { + split.splitIncoming(); + break; + } + } + } else if (lis->isLiveOutOfMBB(split.getLI(), preHeader)) { + split.splitIncoming(); + } + } + + void LoopSplitter::processLoopExits(LoopSplit &split) { + typedef SmallVector<MachineLoop::Edge, 8> ExitEdgesList; + ExitEdgesList exitEdges; + split.getLoop().getExitEdges(exitEdges); + + //dbgs() << " Processing loop exits:\n"; + + for (ExitEdgesList::iterator exitEdgeItr = exitEdges.begin(), + exitEdgeEnd = exitEdges.end(); + exitEdgeItr != exitEdgeEnd; ++exitEdgeItr) { + MachineLoop::Edge exitEdge = *exitEdgeItr; + + LiveRange *outRange = + split.getLI().getLiveRangeContaining(lis->getMBBStartIdx(exitEdge.second)); + + if (outRange != 0) { + if (isCriticalEdge(exitEdge) && !canSplitEdge(exitEdge)) { + split.invalidate(); + return; + } + + split.splitOutgoing(exitEdge); + } + } + } + + void LoopSplitter::processLoopUses(LoopSplit &split) { + std::set<MachineInstr*> processed; + + for (MachineRegisterInfo::reg_iterator + rItr = mri->reg_begin(split.getLI().reg), + rEnd = mri->reg_end(); + rItr != rEnd; ++rItr) { + MachineInstr &instr = *rItr; + if (split.getLoop().contains(&instr) && processed.count(&instr) == 0) { + split.addLoopInstr(&instr); + processed.insert(&instr); + } + } + + //dbgs() << " Rewriting reg" << li.reg << " to reg" << newLI->reg + // << " in blocks [ "; + //dbgs() << "]\n"; + } + + bool LoopSplitter::splitOverLoop(LiveInterval &li, MachineLoop &loop) { + assert(TargetRegisterInfo::isVirtualRegister(li.reg) && + "Attempt to split physical register."); + + LoopSplit split(*this, li, loop); + processHeader(split); + if (split.isValid()) + processLoopExits(split); + if (split.isValid()) + processLoopUses(split); + if (split.isValid() /* && split.isWorthwhile() */) { + split.apply(); + DEBUG(dbgs() << "Success.\n"); + return true; + } + DEBUG(dbgs() << "Failed.\n"); + return false; + } + + void LoopSplitter::processInterval(LiveInterval &li) { + std::deque<MachineLoop*> loops; + std::copy(mli->begin(), mli->end(), std::back_inserter(loops)); + + while (!loops.empty()) { + MachineLoop &loop = *loops.front(); + loops.pop_front(); + DEBUG( + dbgs() << fqn << " reg" << li.reg << " " << li.weight << " BB#" + << loop.getHeader()->getNumber() << " "; + ); + if (!splitOverLoop(li, loop)) { + // Couldn't split over outer loop, schedule sub-loops to be checked. + std::copy(loop.begin(), loop.end(), std::back_inserter(loops)); + } + } + } + + void LoopSplitter::processIntervals() { + while (!intervals.empty()) { + LiveInterval &li = *intervals.front(); + intervals.pop_front(); + + assert(!lis->intervalIsInOneMBB(li) && + "Single interval in process worklist."); + + processInterval(li); + } + } + +} diff --git a/lib/CodeGen/Splitter.h b/lib/CodeGen/Splitter.h new file mode 100644 index 0000000000000..a726a7b834fbe --- /dev/null +++ b/lib/CodeGen/Splitter.h @@ -0,0 +1,99 @@ +//===-- llvm/CodeGen/Splitter.h - Splitter -*- C++ -*----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_SPLITTER_H +#define LLVM_CODEGEN_SPLITTER_H + +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SlotIndexes.h" + +#include <deque> +#include <map> +#include <string> +#include <vector> + +namespace llvm { + + class LiveInterval; + class LiveIntervals; + struct LiveRange; + class LoopSplit; + class MachineDominatorTree; + class MachineRegisterInfo; + class SlotIndexes; + class TargetInstrInfo; + class VNInfo; + + class LoopSplitter : public MachineFunctionPass { + friend class LoopSplit; + public: + static char ID; + + LoopSplitter() : MachineFunctionPass(ID) {} + + virtual void getAnalysisUsage(AnalysisUsage &au) const; + + virtual bool runOnMachineFunction(MachineFunction &fn); + + virtual void releaseMemory(); + + + private: + + MachineFunction *mf; + LiveIntervals *lis; + MachineLoopInfo *mli; + MachineRegisterInfo *mri; + MachineDominatorTree *mdt; + SlotIndexes *sis; + const TargetInstrInfo *tii; + const TargetRegisterInfo *tri; + + std::string fqn; + std::deque<LiveInterval*> intervals; + + typedef std::pair<SlotIndex, SlotIndex> SlotPair; + typedef std::vector<SlotPair> LoopRanges; + typedef std::map<MachineLoop*, LoopRanges> LoopRangeMap; + LoopRangeMap loopRangeMap; + + void dumpLoopInfo(MachineLoop &loop); + + void dumpOddTerminators(); + + void updateTerminators(MachineBasicBlock &mbb); + + bool canInsertPreHeader(MachineLoop &loop); + MachineBasicBlock& insertPreHeader(MachineLoop &loop); + + bool isCriticalEdge(MachineLoop::Edge &edge); + bool canSplitEdge(MachineLoop::Edge &edge); + MachineBasicBlock& splitEdge(MachineLoop::Edge &edge, MachineLoop &loop); + + LoopRanges& getLoopRanges(MachineLoop &loop); + std::pair<bool, SlotPair> getLoopSubRange(const LiveRange &lr, + MachineLoop &loop); + + void dumpLoopRanges(MachineLoop &loop); + + void processHeader(LoopSplit &split); + void processLoopExits(LoopSplit &split); + void processLoopUses(LoopSplit &split); + + bool splitOverLoop(LiveInterval &li, MachineLoop &loop); + + void processInterval(LiveInterval &li); + + void processIntervals(); + }; + +} + +#endif diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index ca5c28ce010cb..9f51778da7562 100644 --- a/lib/CodeGen/StackProtector.cpp +++ b/lib/CodeGen/StackProtector.cpp @@ -62,17 +62,17 @@ namespace { bool RequiresStackProtector() const; public: static char ID; // Pass identification, replacement for typeid. 
- StackProtector() : FunctionPass(&ID), TLI(0) {} + StackProtector() : FunctionPass(ID), TLI(0) {} StackProtector(const TargetLowering *tli) - : FunctionPass(&ID), TLI(tli) {} + : FunctionPass(ID), TLI(tli) {} virtual bool runOnFunction(Function &Fn); }; } // end anonymous namespace char StackProtector::ID = 0; -static RegisterPass<StackProtector> -X("stack-protector", "Insert stack protectors"); +INITIALIZE_PASS(StackProtector, "stack-protector", + "Insert stack protectors", false, false); FunctionPass *llvm::createStackProtectorPass(const TargetLowering *tli) { return new StackProtector(tli); diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index eff3c33e3daa0..8d57ae95dde2a 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -95,9 +95,9 @@ namespace { public: static char ID; // Pass identification StackSlotColoring() : - MachineFunctionPass(&ID), ColorWithRegs(false), NextColor(-1) {} + MachineFunctionPass(ID), ColorWithRegs(false), NextColor(-1) {} StackSlotColoring(bool RegColor) : - MachineFunctionPass(&ID), ColorWithRegs(RegColor), NextColor(-1) {} + MachineFunctionPass(ID), ColorWithRegs(RegColor), NextColor(-1) {} virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -119,7 +119,6 @@ namespace { private: void InitializeSlots(); - bool CheckForSetJmpCall(const MachineFunction &MF) const; void ScanForSpillSlotRefs(MachineFunction &MF); bool OverlapWithAssignments(LiveInterval *li, int Color) const; int ColorSlot(LiveInterval *li); @@ -146,8 +145,8 @@ namespace { char StackSlotColoring::ID = 0; -static RegisterPass<StackSlotColoring> -X("stack-slot-coloring", "Stack Slot Coloring"); +INITIALIZE_PASS(StackSlotColoring, "stack-slot-coloring", + "Stack Slot Coloring", false, false); FunctionPass *llvm::createStackSlotColoringPass(bool RegColor) { return new StackSlotColoring(RegColor); diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp index 59315cf67282b..894dbfa28bac7 100644 --- a/lib/CodeGen/StrongPHIElimination.cpp +++ b/lib/CodeGen/StrongPHIElimination.cpp @@ -39,7 +39,7 @@ using namespace llvm; namespace { struct StrongPHIElimination : public MachineFunctionPass { static char ID; // Pass identification, replacement for typeid - StrongPHIElimination() : MachineFunctionPass(&ID) {} + StrongPHIElimination() : MachineFunctionPass(ID) {} // Waiting stores, for each MBB, the set of copies that need to // be inserted into that MBB @@ -150,11 +150,10 @@ namespace { } char StrongPHIElimination::ID = 0; -static RegisterPass<StrongPHIElimination> -X("strong-phi-node-elimination", - "Eliminate PHI nodes for register allocation, intelligently"); +INITIALIZE_PASS(StrongPHIElimination, "strong-phi-node-elimination", + "Eliminate PHI nodes for register allocation, intelligently", false, false); -const PassInfo *const llvm::StrongPHIEliminationID = &X; +char &llvm::StrongPHIEliminationID = StrongPHIElimination::ID; /// computeDFS - Computes the DFS-in and DFS-out numbers of the dominator tree /// of the given MachineFunction. 
These numbers are then used in other parts diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index 075db803bd231..a815b364d54e7 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -69,7 +69,7 @@ namespace { public: static char ID; explicit TailDuplicatePass(bool PreRA) : - MachineFunctionPass(&ID), PreRegAlloc(PreRA) {} + MachineFunctionPass(ID), PreRegAlloc(PreRA) {} virtual bool runOnMachineFunction(MachineFunction &MF); virtual const char *getPassName() const { return "Tail Duplication"; } @@ -254,14 +254,15 @@ bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) { // SSA form. for (unsigned i = 0, e = Copies.size(); i != e; ++i) { MachineInstr *Copy = Copies[i]; - unsigned Src, Dst, SrcSR, DstSR; - if (TII->isMoveInstr(*Copy, Src, Dst, SrcSR, DstSR)) { - MachineRegisterInfo::use_iterator UI = MRI->use_begin(Src); - if (++UI == MRI->use_end()) { - // Copy is the only use. Do trivial copy propagation here. - MRI->replaceRegWith(Dst, Src); - Copy->eraseFromParent(); - } + if (!Copy->isCopy()) + continue; + unsigned Dst = Copy->getOperand(0).getReg(); + unsigned Src = Copy->getOperand(1).getReg(); + MachineRegisterInfo::use_iterator UI = MRI->use_begin(Src); + if (++UI == MRI->use_end()) { + // Copy is the only use. Do trivial copy propagation here. + MRI->replaceRegWith(Dst, Src); + Copy->eraseFromParent(); } } diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp index cdacb98e0e883..6e4a0d837ecd3 100644 --- a/lib/CodeGen/TargetInstrInfoImpl.cpp +++ b/lib/CodeGen/TargetInstrInfoImpl.cpp @@ -178,19 +178,6 @@ MachineInstr *TargetInstrInfoImpl::duplicate(MachineInstr *Orig, return MF.CloneMachineInstr(Orig); } -unsigned -TargetInstrInfoImpl::GetFunctionSizeInBytes(const MachineFunction &MF) const { - unsigned FnSize = 0; - for (MachineFunction::const_iterator MBBI = MF.begin(), E = MF.end(); - MBBI != E; ++MBBI) { - const MachineBasicBlock &MBB = *MBBI; - for (MachineBasicBlock::const_iterator I = MBB.begin(),E = MBB.end(); - I != E; ++I) - FnSize += GetInstSizeInBytes(I); - } - return FnSize; -} - // If the COPY instruction in MI can be folded to a stack operation, return // the register class to use. 
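Stepping back to the TailDuplication hunk above: with isMoveInstr gone, a copy is recognized by MI->isCopy() and its two fixed operands, and the cleanup rule is unchanged. If the COPY is the only user of its source register, every use of the destination can be rewritten to the source and the COPY erased. A toy, self-contained illustration of that rule, with hypothetical structures standing in for MachineRegisterInfo:

#include <cstdio>
#include <vector>

// Toy IR: each instruction defines one vreg and reads some others.
struct Inst { int Def; std::vector<int> Uses; };

int main() {
  // %6 = COPY %5, and %7 reads %6. Nothing else reads %5.
  std::vector<Inst> Prog = { {5, {}}, {6, {5}}, {7, {6}} };
  const int CopyIdx = 1, Dst = 6, Src = 5;

  int SrcUses = 0; // count readers of Src; the COPY must be the only one
  for (const Inst &I : Prog)
    for (int U : I.Uses)
      if (U == Src) ++SrcUses;

  if (SrcUses == 1) {
    for (Inst &I : Prog)                // replaceRegWith(Dst, Src): all
      for (int &U : I.Uses)             // readers of %6 now read %5...
        if (U == Dst) U = Src;
    Prog.erase(Prog.begin() + CopyIdx); // ...and the COPY disappears.
  }

  for (const Inst &I : Prog) {
    std::printf("%%%d =", I.Def);
    for (int U : I.Uses) std::printf(" %%%d", U);
    std::printf("\n");
  }
  return 0;
}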
static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI, diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index a80cfc4b256fe..f1e10eec724c6 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -519,11 +519,7 @@ void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx, ConstTextCoalSection = getContext().getMachOSection("__TEXT", "__const_coal", MCSectionMachO::S_COALESCED, - SectionKind::getText()); - ConstDataCoalSection - = getContext().getMachOSection("__DATA","__const_coal", - MCSectionMachO::S_COALESCED, - SectionKind::getText()); + SectionKind::getReadOnly()); ConstDataSection // .const_data = getContext().getMachOSection("__DATA", "__const", 0, SectionKind::getReadOnlyWithRel()); diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 564914373bb56..78989c567e42f 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -138,7 +138,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - TwoAddressInstructionPass() : MachineFunctionPass(&ID) {} + TwoAddressInstructionPass() : MachineFunctionPass(ID) {} virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -159,10 +159,10 @@ namespace { } char TwoAddressInstructionPass::ID = 0; -static RegisterPass<TwoAddressInstructionPass> -X("twoaddressinstruction", "Two-Address instruction pass"); +INITIALIZE_PASS(TwoAddressInstructionPass, "twoaddressinstruction", + "Two-Address instruction pass", false, false); -const PassInfo *const llvm::TwoAddressInstructionPassID = &X; +char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID; /// Sink3AddrInstruction - A two-address instruction has been converted to a /// three-address instruction to avoid clobbering a register. Try to sink it @@ -380,26 +380,18 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, bool &IsSrcPhys, bool &IsDstPhys) { SrcReg = 0; DstReg = 0; - unsigned SrcSubIdx, DstSubIdx; - if (!TII->isMoveInstr(MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { - if (MI.isCopy()) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - } else if (MI.isInsertSubreg()) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(2).getReg(); - } else if (MI.isSubregToReg()) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(2).getReg(); - } - } + if (MI.isCopy()) { + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(1).getReg(); + } else if (MI.isInsertSubreg() || MI.isSubregToReg()) { + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(2).getReg(); + } else + return false; - if (DstReg) { - IsSrcPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg); - IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); - return true; - } - return false; + IsSrcPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg); + IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); + return true; } /// isKilled - Test if the given register value, which is used by the given @@ -1454,7 +1446,17 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { // // If the REG_SEQUENCE doesn't kill its source, keeping live variables // correctly up to date becomes very difficult. Insert a copy. - // + + // Defer any kill flag to the last operand using SrcReg. Otherwise, we + // might insert a COPY that uses SrcReg after it was killed.
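The added lines below implement the comment above. Before the COPY of SrcReg is inserted, a pending <kill> flag is handed to a later operand that still reads SrcReg, so the fresh COPY never reads a register that an earlier operand already claimed to kill. A reduced sketch with plain structs in place of MachineOperand:

#include <cstdio>
#include <vector>

struct Operand { int Reg; bool IsKill; };

int main() {
  const int SrcReg = 5;
  // Operands after the one being rewritten; %5 shows up once more.
  std::vector<Operand> Later = { {7, false}, {5, false} };

  bool isKill = true; // the operand being rewritten carried <kill> of %5
  if (isKill)
    for (unsigned j = 0; j < Later.size(); ++j)
      if (Later[j].Reg == SrcReg) {
        Later[j].IsKill = true; // setIsKill(): a later use now kills %5
        isKill = false;         // so the inserted COPY must not claim it
        break;
      }

  // If no later operand reads %5, isKill survives and the COPY keeps it.
  std::printf("COPY gets <kill>: %s\n", isKill ? "yes" : "no"); // "no"
  return 0;
}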
+ if (isKill) + for (unsigned j = i + 2; j < e; j += 2) + if (MI->getOperand(j).getReg() == SrcReg) { + MI->getOperand(j).setIsKill(); + isKill = false; + break; + } + MachineBasicBlock::iterator InsertLoc = MI; MachineInstr *CopyMI = BuildMI(*MI->getParent(), InsertLoc, MI->getDebugLoc(), TII->get(TargetOpcode::COPY)) diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp index 7b338126d475f..6dd333358bc44 100644 --- a/lib/CodeGen/UnreachableBlockElim.cpp +++ b/lib/CodeGen/UnreachableBlockElim.cpp @@ -43,7 +43,7 @@ namespace { virtual bool runOnFunction(Function &F); public: static char ID; // Pass identification, replacement for typeid - UnreachableBlockElim() : FunctionPass(&ID) {} + UnreachableBlockElim() : FunctionPass(ID) {} virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<ProfileInfo>(); @@ -51,8 +51,8 @@ namespace { }; } char UnreachableBlockElim::ID = 0; -static RegisterPass<UnreachableBlockElim> -X("unreachableblockelim", "Remove unreachable blocks from the CFG"); +INITIALIZE_PASS(UnreachableBlockElim, "unreachableblockelim", + "Remove unreachable blocks from the CFG", false, false); FunctionPass *llvm::createUnreachableBlockEliminationPass() { return new UnreachableBlockElim(); @@ -100,16 +100,15 @@ namespace { MachineModuleInfo *MMI; public: static char ID; // Pass identification, replacement for typeid - UnreachableMachineBlockElim() : MachineFunctionPass(&ID) {} + UnreachableMachineBlockElim() : MachineFunctionPass(ID) {} }; } char UnreachableMachineBlockElim::ID = 0; -static RegisterPass<UnreachableMachineBlockElim> -Y("unreachable-mbb-elimination", - "Remove unreachable machine basic blocks"); +INITIALIZE_PASS(UnreachableMachineBlockElim, "unreachable-mbb-elimination", + "Remove unreachable machine basic blocks", false, false); -const PassInfo *const llvm::UnreachableMachineBlockElimID = &Y; +char &llvm::UnreachableMachineBlockElimID = UnreachableMachineBlockElim::ID; void UnreachableMachineBlockElim::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<MachineLoopInfo>(); diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp index ed0269695dfe0..20ffcffa70d35 100644 --- a/lib/CodeGen/VirtRegMap.cpp +++ b/lib/CodeGen/VirtRegMap.cpp @@ -48,8 +48,7 @@ STATISTIC(NumSpills , "Number of register spills"); char VirtRegMap::ID = 0; -static RegisterPass<VirtRegMap> -X("virtregmap", "Virtual Register Map"); +INITIALIZE_PASS(VirtRegMap, "virtregmap", "Virtual Register Map", false, false); bool VirtRegMap::runOnMachineFunction(MachineFunction &mf) { MRI = &mf.getRegInfo(); diff --git a/lib/CodeGen/VirtRegMap.h b/lib/CodeGen/VirtRegMap.h index a5599f68b64e6..8b6082d181932 100644 --- a/lib/CodeGen/VirtRegMap.h +++ b/lib/CodeGen/VirtRegMap.h @@ -139,7 +139,7 @@ namespace llvm { public: static char ID; - VirtRegMap() : MachineFunctionPass(&ID), Virt2PhysMap(NO_PHYS_REG), + VirtRegMap() : MachineFunctionPass(ID), Virt2PhysMap(NO_PHYS_REG), Virt2StackSlotMap(NO_STACK_SLOT), Virt2ReMatIdMap(NO_STACK_SLOT), Virt2SplitMap(0), Virt2SplitKillMap(SlotIndex()), ReMatMap(NULL), @@ -152,6 +152,11 @@ namespace llvm { MachineFunctionPass::getAnalysisUsage(AU); } + MachineFunction &getMachineFunction() const { + assert(MF && "getMachineFunction called before runOnMachineFunction"); + return *MF; + } + void grow(); /// @brief returns true if the specified virtual register is diff --git a/lib/CodeGen/VirtRegRewriter.cpp b/lib/CodeGen/VirtRegRewriter.cpp index 57a1500e6e9da..240d28cf30114 100644 ---
a/lib/CodeGen/VirtRegRewriter.cpp +++ b/lib/CodeGen/VirtRegRewriter.cpp @@ -67,23 +67,16 @@ VirtRegRewriter::~VirtRegRewriter() {} /// Note that operands may be added, so the MO reference is no longer valid. static void substitutePhysReg(MachineOperand &MO, unsigned Reg, const TargetRegisterInfo &TRI) { - if (unsigned SubIdx = MO.getSubReg()) { - // Insert the physical subreg and reset the subreg field. - MO.setReg(TRI.getSubReg(Reg, SubIdx)); - MO.setSubReg(0); - - // Any def, dead, and kill flags apply to the full virtual register, so they - // also apply to the full physical register. Add imp-def/dead and imp-kill - // as needed. + if (MO.getSubReg()) { + MO.substPhysReg(Reg, TRI); + + // Any kill flags apply to the full virtual register, so they also apply to + // the full physical register. + // We assume that partial defs have already been decorated with a super-reg + // <imp-def> operand by LiveIntervals. MachineInstr &MI = *MO.getParent(); - if (MO.isDef()) - if (MO.isDead()) - MI.addRegisterDead(Reg, &TRI, /*AddIfNotFound=*/ true); - else - MI.addRegisterDefined(Reg, &TRI); - else if (!MO.isUndef() && - (MO.isKill() || - MI.isRegTiedToDefOperand(&MO-&MI.getOperand(0)))) + if (MO.isUse() && !MO.isUndef() && + (MO.isKill() || MI.isRegTiedToDefOperand(&MO-&MI.getOperand(0)))) MI.addRegisterKilled(Reg, &TRI, /*AddIfNotFound=*/ true); } else { MO.setReg(Reg); @@ -460,7 +453,7 @@ public: /// blocks each of which is a successor of the specified BB and has no other /// predecessor. static void findSinglePredSuccessor(MachineBasicBlock *MBB, - SmallVectorImpl<MachineBasicBlock *> &Succs) { + SmallVectorImpl<MachineBasicBlock *> &Succs){ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) { MachineBasicBlock *SuccMBB = *SI; @@ -852,8 +845,8 @@ unsigned ReuseInfo::GetRegForReload(const TargetRegisterClass *RC, // Yup, use the reload register that we didn't use before. unsigned NewReg = Op.AssignedPhysReg; Rejected.insert(PhysReg); - return GetRegForReload(RC, NewReg, MF, MI, Spills, MaybeDeadStores, Rejected, - RegKills, KillOps, VRM); + return GetRegForReload(RC, NewReg, MF, MI, Spills, MaybeDeadStores, + Rejected, RegKills, KillOps, VRM); } else { // Otherwise, we might also have a problem if a previously reused // value aliases the new register. If so, codegen the previous reload @@ -1864,7 +1857,7 @@ bool LocalRewriter::InsertSpills(MachineInstr *MI) { /// rewriteMBB - Keep track of which spills are available even after the -/// register allocator is done with them. If possible, avid reloading vregs. +/// register allocator is done with them. If possible, avoid reloading vregs. void LocalRewriter::RewriteMBB(LiveIntervals *LIs, AvailableSpills &Spills, BitVector &RegKills, @@ -1914,7 +1907,6 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, if (InsertSpills(MII)) NextMII = llvm::next(MII); - VirtRegMap::MI2VirtMapTy::const_iterator I, End; bool Erased = false; bool BackTracked = false; MachineInstr &MI = *MII; @@ -2028,14 +2020,16 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, CanReuse = !ReusedOperands.isClobbered(PhysReg) && Spills.canClobberPhysReg(PhysReg); } - // If this is an asm, and PhysReg is used elsewhere as an earlyclobber - // operand, we can't also use it as an input. (Outputs always come - // before inputs, so we can stop looking at i.) + // If this is an asm, and a PhysReg alias is used elsewhere as an + // earlyclobber operand, we can't also use it as an input. 
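The check that follows is broader than the one it replaces in two ways: it scans every operand rather than only those before index i, and it compares with TRI->regsOverlap instead of plain register equality, so aliasing sub- and super-registers are caught too. A standalone sketch of such an overlap test, with made-up register codes and alias sets:

#include <cstdio>
#include <set>

// Hypothetical register file: each register covers a set of hardware
// units, and two registers overlap iff those sets intersect.
static const std::set<int> &units(int Reg) {
  static const std::set<int> AL  = {0};       // low byte
  static const std::set<int> AH  = {1};       // high byte
  static const std::set<int> EAX = {0, 1, 2}; // super-register of AL and AH
  static const std::set<int> EBX = {3};
  switch (Reg) {
  case 1:  return AL;
  case 2:  return AH;
  case 3:  return EAX;
  default: return EBX;
  }
}

static bool regsOverlap(int A, int B) {
  for (int U : units(A))
    if (units(B).count(U))
      return true;
  return false;
}

int main() {
  std::printf("AL vs EAX: %d\n", regsOverlap(1, 3)); // 1: EAX aliases AL
  std::printf("AL vs AH:  %d\n", regsOverlap(1, 2)); // 0: disjoint halves
  std::printf("AL vs EBX: %d\n", regsOverlap(1, 4)); // 0: unrelated regs
  return 0;
}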
if (MI.isInlineAsm()) { - for (unsigned k=0; k<i; ++k) { + for (unsigned k = 0, e = MI.getNumOperands(); k != e; ++k) { MachineOperand &MOk = MI.getOperand(k); - if (MOk.isReg() && MOk.getReg()==PhysReg && MOk.isEarlyClobber()) { + if (MOk.isReg() && MOk.isEarlyClobber() && + TRI->regsOverlap(MOk.getReg(), PhysReg)) { CanReuse = false; + DEBUG(dbgs() << "Not reusing physreg " << TRI->getName(PhysReg) + << " for vreg" << VirtReg << ": " << MOk << '\n'); break; } } @@ -2248,15 +2242,22 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, // If we have folded references to memory operands, make sure we clear all // physical registers that may contain the value of the spilled virtual // register + + // Copy the folded virts to a small vector, we may change MI2VirtMap. + SmallVector<std::pair<unsigned, VirtRegMap::ModRef>, 4> FoldedVirts; + // C++0x FTW! + for (std::pair<VirtRegMap::MI2VirtMapTy::const_iterator, + VirtRegMap::MI2VirtMapTy::const_iterator> FVRange = + VRM->getFoldedVirts(&MI); + FVRange.first != FVRange.second; ++FVRange.first) + FoldedVirts.push_back(FVRange.first->second); + SmallSet<int, 2> FoldedSS; - for (tie(I, End) = VRM->getFoldedVirts(&MI); I != End; ) { - unsigned VirtReg = I->second.first; - VirtRegMap::ModRef MR = I->second.second; + for (unsigned FVI = 0, FVE = FoldedVirts.size(); FVI != FVE; ++FVI) { + unsigned VirtReg = FoldedVirts[FVI].first; + VirtRegMap::ModRef MR = FoldedVirts[FVI].second; DEBUG(dbgs() << "Folded vreg: " << VirtReg << " MR: " << MR); - // MI2VirtMap be can updated which invalidate the iterator. - // Increment the iterator first. - ++I; int SS = VRM->getStackSlot(VirtReg); if (SS == VirtRegMap::NO_STACK_SLOT) continue; @@ -2302,7 +2303,7 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, unsigned PhysReg = Spills.getSpillSlotOrReMatPhysReg(SS); SmallVector<MachineInstr*, 4> NewMIs; if (PhysReg && - TII->unfoldMemoryOperand(MF, &MI, PhysReg, false, false, NewMIs)) { + TII->unfoldMemoryOperand(MF, &MI, PhysReg, false, false, NewMIs)){ MBB->insert(MII, NewMIs[0]); InvalidateKills(MI, TRI, RegKills, KillOps); VRM->RemoveMachineInstrFromMaps(&MI); @@ -2442,28 +2443,6 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, Spills.disallowClobberPhysReg(VirtReg); goto ProcessNextInst; } - unsigned Src, Dst, SrcSR, DstSR; - if (TII->isMoveInstr(MI, Src, Dst, SrcSR, DstSR) && - Src == Dst && SrcSR == DstSR && - !MI.findRegisterUseOperand(Src)->isUndef()) { - ++NumDCE; - DEBUG(dbgs() << "Removing now-noop copy: " << MI); - SmallVector<unsigned, 2> KillRegs; - InvalidateKills(MI, TRI, RegKills, KillOps, &KillRegs); - if (MO.isDead() && !KillRegs.empty()) { - // Source register or an implicit super/sub-register use is killed. - assert(KillRegs[0] == Dst || - TRI->isSubRegister(KillRegs[0], Dst) || - TRI->isSuperRegister(KillRegs[0], Dst)); - // Last def is now dead. - TransferDeadness(Src, RegKills, KillOps); - } - VRM->RemoveMachineInstrFromMaps(&MI); - MBB->erase(&MI); - Erased = true; - Spills.disallowClobberPhysReg(VirtReg); - goto ProcessNextInst; - } // If it's not a no-op copy, it clobbers the value in the destreg. 
Spills.ClobberPhysReg(VirtReg); @@ -2541,20 +2520,6 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, UpdateKills(*LastStore, TRI, RegKills, KillOps); goto ProcessNextInst; } - { - unsigned Src, Dst, SrcSR, DstSR; - if (TII->isMoveInstr(MI, Src, Dst, SrcSR, DstSR) && - Src == Dst && SrcSR == DstSR) { - ++NumDCE; - DEBUG(dbgs() << "Removing now-noop copy: " << MI); - InvalidateKills(MI, TRI, RegKills, KillOps); - VRM->RemoveMachineInstrFromMaps(&MI); - MBB->erase(&MI); - Erased = true; - UpdateKills(*LastStore, TRI, RegKills, KillOps); - goto ProcessNextInst; - } - } } } ProcessNextInst: diff --git a/lib/CompilerDriver/Action.cpp b/lib/CompilerDriver/Action.cpp index 5f30dce5855fd..0be80496a3cb7 100644 --- a/lib/CompilerDriver/Action.cpp +++ b/lib/CompilerDriver/Action.cpp @@ -13,6 +13,7 @@ #include "llvm/CompilerDriver/Action.h" #include "llvm/CompilerDriver/BuiltinOptions.h" +#include "llvm/CompilerDriver/Error.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/SystemUtils.h" @@ -58,11 +59,15 @@ namespace { if (prog.isEmpty()) { prog = FindExecutable(name, ProgramName, (void *)(intptr_t)&Main); - if (prog.isEmpty()) - throw std::runtime_error("Can't find program '" + name + "'"); + if (prog.isEmpty()) { + PrintError("Can't find program '" + name + "'"); + return -1; + } + } + if (!prog.canExecute()) { + PrintError("Program '" + name + "' is not executable."); + return -1; } - if (!prog.canExecute()) - throw std::runtime_error("Program '" + name + "' is not executable."); // Build the command line vector and the redirects array. const sys::Path* redirects[3] = {0,0,0}; diff --git a/lib/CompilerDriver/BuiltinOptions.cpp b/lib/CompilerDriver/BuiltinOptions.cpp index d1ac8c98322c1..38442038d7385 100644 --- a/lib/CompilerDriver/BuiltinOptions.cpp +++ b/lib/CompilerDriver/BuiltinOptions.cpp @@ -19,7 +19,7 @@ namespace cl = llvm::cl; -// External linkage here is intentional. +namespace llvmc { cl::list<std::string> InputFilenames(cl::Positional, cl::desc("<input file>"), cl::ZeroOrMore); @@ -57,3 +57,5 @@ cl::opt<SaveTempsEnum::Values> SaveTemps clEnumValN(SaveTempsEnum::Obj, "", "Same as 'cwd'"), clEnumValEnd), cl::ValueOptional); + +} // End namespace llvmc. diff --git a/lib/CompilerDriver/CompilationGraph.cpp b/lib/CompilerDriver/CompilationGraph.cpp index 7d1c7fe4a62b9..d0c0e15bcdb7b 100644 --- a/lib/CompilerDriver/CompilationGraph.cpp +++ b/lib/CompilerDriver/CompilationGraph.cpp @@ -25,39 +25,46 @@ #include <iterator> #include <limits> #include <queue> -#include <stdexcept> using namespace llvm; using namespace llvmc; namespace llvmc { - const std::string& LanguageMap::GetLanguage(const sys::Path& File) const { + const std::string* LanguageMap::GetLanguage(const sys::Path& File) const { StringRef suf = File.getSuffix(); LanguageMap::const_iterator Lang = this->find(suf.empty() ? "*empty*" : suf); - if (Lang == this->end()) - throw std::runtime_error("File '" + File.str() + - "' has unknown suffix '" + suf.str() + '\''); - return Lang->second; + if (Lang == this->end()) { + PrintError("File '" + File.str() + "' has unknown suffix '" + + suf.str() + '\''); + return 0; + } + return &Lang->second; } } namespace { - /// ChooseEdge - Return the edge with the maximum weight. + /// ChooseEdge - Return the edge with the maximum weight. Returns 0 on error. 
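The definition that follows is a max-by-weight scan with two wrinkles. A negative weight is now an error sentinel produced by TableGen-generated hooks, which is why the weights changed from unsigned to int, and a SingleMax flag rejects ties, since two equally attractive toolchain edges point to a broken specification (the added TODO admits the flag's bookkeeping is still imperfect). A self-contained sketch of the selection over plain ints, with the tie flag computed the intended way:

#include <cstdio>
#include <vector>

// Index of the unique maximum weight; -1 on a tie or empty input,
// -2 if any weight is negative (the error sentinel).
static int chooseEdge(const std::vector<int> &W) {
  int MaxIdx = -1, MaxW = 0;
  bool SingleMax = true;
  for (unsigned i = 0; i < W.size(); ++i) {
    if (W[i] < 0)
      return -2;
    if (W[i] > MaxW) {
      MaxIdx = (int)i;
      MaxW = W[i];
      SingleMax = true;  // a strictly larger weight clears old ties
    } else if (W[i] == MaxW && MaxW > 0) {
      SingleMax = false; // another edge matches the current maximum
    }
  }
  return SingleMax ? MaxIdx : -1;
}

int main() {
  std::vector<int> A = {1, 4, 2}, B = {3, 3}, C = {1, -1};
  std::printf("%d %d %d\n", chooseEdge(A), chooseEdge(B), chooseEdge(C));
  // prints: 1 -1 -2
  return 0;
}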
template <class C> const Edge* ChooseEdge(const C& EdgesContainer, const InputLanguagesSet& InLangs, const std::string& NodeName = "root") { const Edge* MaxEdge = 0; - unsigned MaxWeight = 0; + int MaxWeight = 0; bool SingleMax = true; + // TODO: fix calculation of SingleMax. for (typename C::const_iterator B = EdgesContainer.begin(), E = EdgesContainer.end(); B != E; ++B) { const Edge* e = B->getPtr(); - unsigned EW = e->Weight(InLangs); + int EW = e->Weight(InLangs); + if (EW < 0) { + // (error) invocation in TableGen -> we don't need to print an error + // message. + return 0; + } if (EW > MaxWeight) { MaxEdge = e; MaxWeight = EW; @@ -67,14 +74,16 @@ namespace { } } - if (!SingleMax) - throw std::runtime_error("Node " + NodeName + - ": multiple maximal outward edges found!" - " Most probably a specification error."); - if (!MaxEdge) - throw std::runtime_error("Node " + NodeName + - ": no maximal outward edge found!" - " Most probably a specification error."); + if (!SingleMax) { + PrintError("Node " + NodeName + ": multiple maximal outward edges found!" + " Most probably a specification error."); + return 0; + } + if (!MaxEdge) { + PrintError("Node " + NodeName + ": no maximal outward edge found!" + " Most probably a specification error."); + return 0; + } return MaxEdge; } @@ -98,29 +107,34 @@ CompilationGraph::CompilationGraph() { NodesMap["root"] = Node(this); } -Node& CompilationGraph::getNode(const std::string& ToolName) { +Node* CompilationGraph::getNode(const std::string& ToolName) { nodes_map_type::iterator I = NodesMap.find(ToolName); - if (I == NodesMap.end()) - throw std::runtime_error("Node " + ToolName + " is not in the graph"); - return I->second; + if (I == NodesMap.end()) { + PrintError("Node " + ToolName + " is not in the graph"); + return 0; + } + return &I->second; } -const Node& CompilationGraph::getNode(const std::string& ToolName) const { +const Node* CompilationGraph::getNode(const std::string& ToolName) const { nodes_map_type::const_iterator I = NodesMap.find(ToolName); - if (I == NodesMap.end()) - throw std::runtime_error("Node " + ToolName + " is not in the graph!"); - return I->second; + if (I == NodesMap.end()) { + PrintError("Node " + ToolName + " is not in the graph!"); + return 0; + } + return &I->second; } // Find the tools list corresponding to the given language name. 
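getNode above, and getToolsVector just below, follow the commit-wide migration away from exceptions: a failed lookup now calls PrintError and returns a null pointer instead of throwing std::runtime_error, and callers check for null and pass a non-zero result up the stack. A minimal sketch of the pattern, with a stand-in PrintError and a plain std::map in place of the graph:

#include <cstdio>
#include <map>
#include <string>

static void PrintError(const std::string &Msg) {    // stand-in for the helper
  std::fprintf(stderr, "error: %s\n", Msg.c_str()); // in CompilerDriver/Error.h
}

// Old style: const Node& getNode(Name) throws when the name is missing.
// New style: const Node* getNode(Name) reports and returns 0 (null).
static const int *getNode(const std::map<std::string, int> &M,
                          const std::string &Name) {
  std::map<std::string, int>::const_iterator I = M.find(Name);
  if (I == M.end()) {
    PrintError("Node " + Name + " is not in the graph");
    return 0;
  }
  return &I->second;
}

int main() {
  std::map<std::string, int> Nodes;
  Nodes["root"] = 42;
  if (const int *N = getNode(Nodes, "root"))
    std::printf("found %d\n", *N);
  if (!getNode(Nodes, "llc"))
    std::printf("caller handles the null and returns 1\n");
  return 0;
}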
-const CompilationGraph::tools_vector_type& +const CompilationGraph::tools_vector_type* CompilationGraph::getToolsVector(const std::string& LangName) const { tools_map_type::const_iterator I = ToolsMap.find(LangName); - if (I == ToolsMap.end()) - throw std::runtime_error("No tool corresponding to the language " - + LangName + " found"); - return I->second; + if (I == ToolsMap.end()) { + PrintError("No tool corresponding to the language " + LangName + " found"); + return 0; + } + return &I->second; } void CompilationGraph::insertNode(Tool* V) { @@ -128,29 +142,37 @@ void CompilationGraph::insertNode(Tool* V) { NodesMap[V->Name()] = Node(this, V); } -void CompilationGraph::insertEdge(const std::string& A, Edge* Edg) { - Node& B = getNode(Edg->ToolName()); +int CompilationGraph::insertEdge(const std::string& A, Edge* Edg) { + Node* B = getNode(Edg->ToolName()); + if (B == 0) + return 1; + if (A == "root") { - const char** InLangs = B.ToolPtr->InputLanguages(); + const char** InLangs = B->ToolPtr->InputLanguages(); for (;*InLangs; ++InLangs) ToolsMap[*InLangs].push_back(IntrusiveRefCntPtr<Edge>(Edg)); NodesMap["root"].AddEdge(Edg); } else { - Node& N = getNode(A); - N.AddEdge(Edg); + Node* N = getNode(A); + if (N == 0) + return 1; + + N->AddEdge(Edg); } // Increase the inward edge counter. - B.IncrInEdges(); + B->IncrInEdges(); + + return 0; } // Pass input file through the chain until we bump into a Join node or // a node that says that it is the last. -void CompilationGraph::PassThroughGraph (const sys::Path& InFile, - const Node* StartNode, - const InputLanguagesSet& InLangs, - const sys::Path& TempDir, - const LanguageMap& LangMap) const { +int CompilationGraph::PassThroughGraph (const sys::Path& InFile, + const Node* StartNode, + const InputLanguagesSet& InLangs, + const sys::Path& TempDir, + const LanguageMap& LangMap) const { sys::Path In = InFile; const Node* CurNode = StartNode; @@ -158,25 +180,35 @@ void CompilationGraph::PassThroughGraph (const sys::Path& InFile, Tool* CurTool = CurNode->ToolPtr.getPtr(); if (CurTool->IsJoin()) { - JoinTool& JT = dynamic_cast<JoinTool&>(*CurTool); + JoinTool& JT = static_cast<JoinTool&>(*CurTool); JT.AddToJoinList(In); break; } - Action CurAction = CurTool->GenerateAction(In, CurNode->HasChildren(), - TempDir, InLangs, LangMap); + Action CurAction; + if (int ret = CurTool->GenerateAction(CurAction, In, CurNode->HasChildren(), + TempDir, InLangs, LangMap)) { + return ret; + } if (int ret = CurAction.Execute()) - throw error_code(ret); + return ret; if (CurAction.StopCompilation()) - return; + return 0; + + const Edge* Edg = ChooseEdge(CurNode->OutEdges, InLangs, CurNode->Name()); + if (Edg == 0) + return 1; + + CurNode = getNode(Edg->ToolName()); + if (CurNode == 0) + return 1; - CurNode = &getNode(ChooseEdge(CurNode->OutEdges, - InLangs, - CurNode->Name())->ToolName()); In = CurAction.OutFile(); } + + return 0; } // Find the head of the toolchain corresponding to the given file. @@ -186,26 +218,39 @@ FindToolChain(const sys::Path& In, const std::string* ForceLanguage, InputLanguagesSet& InLangs, const LanguageMap& LangMap) const { // Determine the input language. - const std::string& InLanguage = - ForceLanguage ? *ForceLanguage : LangMap.GetLanguage(In); + const std::string* InLang = LangMap.GetLanguage(In); + if (InLang == 0) + return 0; + const std::string& InLanguage = (ForceLanguage ? *ForceLanguage : *InLang); // Add the current input language to the input language set. InLangs.insert(InLanguage); // Find the toolchain for the input language. 
- const tools_vector_type& TV = getToolsVector(InLanguage); - if (TV.empty()) - throw std::runtime_error("No toolchain corresponding to language " - + InLanguage + " found"); - return &getNode(ChooseEdge(TV, InLangs)->ToolName()); + const tools_vector_type* pTV = getToolsVector(InLanguage); + if (pTV == 0) + return 0; + + const tools_vector_type& TV = *pTV; + if (TV.empty()) { + PrintError("No toolchain corresponding to language " + + InLanguage + " found"); + return 0; + } + + const Edge* Edg = ChooseEdge(TV, InLangs); + if (Edg == 0) + return 0; + + return getNode(Edg->ToolName()); } // Helper function used by Build(). // Traverses initial portions of the toolchains (up to the first Join node). // This function is also responsible for handling the -x option. -void CompilationGraph::BuildInitial (InputLanguagesSet& InLangs, - const sys::Path& TempDir, - const LanguageMap& LangMap) { +int CompilationGraph::BuildInitial (InputLanguagesSet& InLangs, + const sys::Path& TempDir, + const LanguageMap& LangMap) { // This is related to -x option handling. cl::list<std::string>::const_iterator xIter = Languages.begin(), xBegin = xIter, xEnd = Languages.end(); @@ -255,15 +300,25 @@ void CompilationGraph::BuildInitial (InputLanguagesSet& InLangs, // Find the toolchain corresponding to this file. const Node* N = FindToolChain(In, xLanguage, InLangs, LangMap); + if (N == 0) + return 1; // Pass file through the chain starting at head. - PassThroughGraph(In, N, InLangs, TempDir, LangMap); + if (int ret = PassThroughGraph(In, N, InLangs, TempDir, LangMap)) + return ret; } + + return 0; } // Sort the nodes in topological order. -void CompilationGraph::TopologicalSort(std::vector<const Node*>& Out) { +int CompilationGraph::TopologicalSort(std::vector<const Node*>& Out) { std::queue<const Node*> Q; - Q.push(&getNode("root")); + + Node* Root = getNode("root"); + if (Root == 0) + return 1; + + Q.push(Root); while (!Q.empty()) { const Node* A = Q.front(); @@ -271,12 +326,17 @@ void CompilationGraph::TopologicalSort(std::vector<const Node*>& Out) { Out.push_back(A); for (Node::const_iterator EB = A->EdgesBegin(), EE = A->EdgesEnd(); EB != EE; ++EB) { - Node* B = &getNode((*EB)->ToolName()); + Node* B = getNode((*EB)->ToolName()); + if (B == 0) + return 1; + B->DecrInEdges(); if (B->HasNoInEdges()) Q.push(B); } } + + return 0; } namespace { @@ -287,49 +347,71 @@ namespace { // Call TopologicalSort and filter the resulting list to include // only Join nodes. -void CompilationGraph:: +int CompilationGraph:: TopologicalSortFilterJoinNodes(std::vector<const Node*>& Out) { std::vector<const Node*> TopSorted; - TopologicalSort(TopSorted); + if (int ret = TopologicalSort(TopSorted)) + return ret; std::remove_copy_if(TopSorted.begin(), TopSorted.end(), std::back_inserter(Out), NotJoinNode); + + return 0; } int CompilationGraph::Build (const sys::Path& TempDir, const LanguageMap& LangMap) { - InputLanguagesSet InLangs; + bool WasSomeActionGenerated = !InputFilenames.empty(); // Traverse initial parts of the toolchains and fill in InLangs. 
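Before the Build() hunk resumes below, a note on TopologicalSort above: it is Kahn's algorithm, and CheckCycles later in the file reuses the same loop, repeatedly retiring zero-in-degree nodes and declaring a cycle if any node is left standing. A compact standalone version over an adjacency list:

#include <cstdio>
#include <queue>
#include <vector>

int main() {
  // 0 -> 1, 0 -> 2, 1 -> 2 (a DAG; adding 2 -> 0 would create a cycle).
  std::vector<std::vector<int> > Adj = { {1, 2}, {2}, {} };
  std::vector<int> InDeg(Adj.size(), 0);
  for (unsigned a = 0; a < Adj.size(); ++a)
    for (unsigned j = 0; j < Adj[a].size(); ++j)
      ++InDeg[Adj[a][j]];

  std::queue<int> Q;
  for (unsigned n = 0; n < Adj.size(); ++n)
    if (InDeg[n] == 0)
      Q.push((int)n); // the graph's "root" nodes

  unsigned Visited = 0;
  while (!Q.empty()) {
    int A = Q.front(); Q.pop();
    ++Visited;
    std::printf("%d ", A); // emitted in topological order
    for (unsigned j = 0; j < Adj[A].size(); ++j)
      if (--InDeg[Adj[A][j]] == 0) // DecrInEdges() + HasNoInEdges()
        Q.push(Adj[A][j]);
  }
  std::printf("\ncycle: %s\n", Visited == Adj.size() ? "no" : "yes");
  return 0;
}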
- BuildInitial(InLangs, TempDir, LangMap); + if (int ret = BuildInitial(InLangs, TempDir, LangMap)) + return ret; std::vector<const Node*> JTV; - TopologicalSortFilterJoinNodes(JTV); + if (int ret = TopologicalSortFilterJoinNodes(JTV)) + return ret; // For all join nodes in topological order: for (std::vector<const Node*>::iterator B = JTV.begin(), E = JTV.end(); B != E; ++B) { const Node* CurNode = *B; - JoinTool* JT = &dynamic_cast<JoinTool&>(*CurNode->ToolPtr.getPtr()); + JoinTool* JT = &static_cast<JoinTool&>(*CurNode->ToolPtr.getPtr()); // Are there any files in the join list? if (JT->JoinListEmpty() && !(JT->WorksOnEmpty() && InputFilenames.empty())) continue; - Action CurAction = JT->GenerateAction(CurNode->HasChildren(), - TempDir, InLangs, LangMap); + WasSomeActionGenerated = true; + Action CurAction; + if (int ret = JT->GenerateAction(CurAction, CurNode->HasChildren(), + TempDir, InLangs, LangMap)) { + return ret; + } if (int ret = CurAction.Execute()) - throw error_code(ret); + return ret; if (CurAction.StopCompilation()) return 0; - const Node* NextNode = &getNode(ChooseEdge(CurNode->OutEdges, InLangs, - CurNode->Name())->ToolName()); - PassThroughGraph(sys::Path(CurAction.OutFile()), NextNode, - InLangs, TempDir, LangMap); + const Edge* Edg = ChooseEdge(CurNode->OutEdges, InLangs, CurNode->Name()); + if (Edg == 0) + return 1; + + const Node* NextNode = getNode(Edg->ToolName()); + if (NextNode == 0) + return 1; + + if (int ret = PassThroughGraph(sys::Path(CurAction.OutFile()), NextNode, + InLangs, TempDir, LangMap)) { + return ret; + } + } + + if (!WasSomeActionGenerated) { + PrintError("no input files"); + return 1; } return 0; @@ -337,6 +419,7 @@ int CompilationGraph::Build (const sys::Path& TempDir, int CompilationGraph::CheckLanguageNames() const { int ret = 0; + // Check that names for output and input languages on all edges do match. for (const_nodes_iterator B = this->NodesMap.begin(), E = this->NodesMap.end(); B != E; ++B) { @@ -345,9 +428,11 @@ int CompilationGraph::CheckLanguageNames() const { if (N1.ToolPtr) { for (Node::const_iterator EB = N1.EdgesBegin(), EE = N1.EdgesEnd(); EB != EE; ++EB) { - const Node& N2 = this->getNode((*EB)->ToolName()); + const Node* N2 = this->getNode((*EB)->ToolName()); + if (N2 == 0) + return 1; - if (!N2.ToolPtr) { + if (!N2->ToolPtr) { ++ret; errs() << "Error: there is an edge from '" << N1.ToolPtr->Name() << "' back to the root!\n\n"; @@ -355,7 +440,7 @@ int CompilationGraph::CheckLanguageNames() const { } const char* OutLang = N1.ToolPtr->OutputLanguage(); - const char** InLangs = N2.ToolPtr->InputLanguages(); + const char** InLangs = N2->ToolPtr->InputLanguages(); bool eq = false; for (;*InLangs; ++InLangs) { if (std::strcmp(OutLang, *InLangs) == 0) { @@ -367,11 +452,11 @@ int CompilationGraph::CheckLanguageNames() const { if (!eq) { ++ret; errs() << "Error: Output->input language mismatch in the edge '" - << N1.ToolPtr->Name() << "' -> '" << N2.ToolPtr->Name() + << N1.ToolPtr->Name() << "' -> '" << N2->ToolPtr->Name() << "'!\n" << "Expected one of { "; - InLangs = N2.ToolPtr->InputLanguages(); + InLangs = N2->ToolPtr->InputLanguages(); for (;*InLangs; ++InLangs) { errs() << '\'' << *InLangs << (*(InLangs+1) ? "', " : "'"); } @@ -395,7 +480,7 @@ int CompilationGraph::CheckMultipleDefaultEdges() const { for (const_nodes_iterator B = this->NodesMap.begin(), E = this->NodesMap.end(); B != E; ++B) { const Node& N = B->second; - unsigned MaxWeight = 0; + int MaxWeight = 0; // Ignore the root node. 
if (!N.ToolPtr) @@ -403,7 +488,7 @@ int CompilationGraph::CheckMultipleDefaultEdges() const { for (Node::const_iterator EB = N.EdgesBegin(), EE = N.EdgesEnd(); EB != EE; ++EB) { - unsigned EdgeWeight = (*EB)->Weight(Dummy); + int EdgeWeight = (*EB)->Weight(Dummy); if (EdgeWeight > MaxWeight) { MaxWeight = EdgeWeight; } @@ -422,7 +507,12 @@ int CompilationGraph::CheckMultipleDefaultEdges() const { int CompilationGraph::CheckCycles() { unsigned deleted = 0; std::queue<Node*> Q; - Q.push(&getNode("root")); + + Node* Root = getNode("root"); + if (Root == 0) + return 1; + + Q.push(Root); // Try to delete all nodes that have no ingoing edges, starting from the // root. If there are any nodes left after this operation, then we have a @@ -434,7 +524,10 @@ int CompilationGraph::CheckCycles() { for (Node::iterator EB = A->EdgesBegin(), EE = A->EdgesEnd(); EB != EE; ++EB) { - Node* B = &getNode((*EB)->ToolName()); + Node* B = getNode((*EB)->ToolName()); + if (B == 0) + return 1; + B->DecrInEdges(); if (B->HasNoInEdges()) Q.push(B); @@ -453,18 +546,28 @@ int CompilationGraph::CheckCycles() { int CompilationGraph::Check () { // We try to catch as many errors as we can in one go. + int errs = 0; int ret = 0; // Check that output/input language names match. - ret += this->CheckLanguageNames(); + ret = this->CheckLanguageNames(); + if (ret < 0) + return 1; + errs += ret; // Check for multiple default edges. - ret += this->CheckMultipleDefaultEdges(); + ret = this->CheckMultipleDefaultEdges(); + if (ret < 0) + return 1; + errs += ret; // Check for cycles. - ret += this->CheckCycles(); + ret = this->CheckCycles(); + if (ret < 0) + return 1; + errs += ret; - return ret; + return errs; } // Code related to graph visualization. @@ -516,7 +619,7 @@ namespace llvm { } -void CompilationGraph::writeGraph(const std::string& OutputFilename) { +int CompilationGraph::writeGraph(const std::string& OutputFilename) { std::string ErrorInfo; raw_fd_ostream O(OutputFilename.c_str(), ErrorInfo); @@ -526,9 +629,11 @@ void CompilationGraph::writeGraph(const std::string& OutputFilename) { errs() << "done.\n"; } else { - throw std::runtime_error("Error opening file '" + OutputFilename - + "' for writing!"); + PrintError("Error opening file '" + OutputFilename + "' for writing!"); + return 1; } + + return 0; } void CompilationGraph::viewGraph() { diff --git a/lib/CompilerDriver/Main.cpp b/lib/CompilerDriver/Main.cpp index b5e507dfc3a37..0a6613aa77a3b 100644 --- a/lib/CompilerDriver/Main.cpp +++ b/lib/CompilerDriver/Main.cpp @@ -11,16 +11,15 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CompilerDriver/AutoGenerated.h" #include "llvm/CompilerDriver/BuiltinOptions.h" #include "llvm/CompilerDriver/CompilationGraph.h" #include "llvm/CompilerDriver/Error.h" -#include "llvm/CompilerDriver/Plugin.h" #include "llvm/Support/raw_ostream.h" #include "llvm/System/Path.h" #include <sstream> -#include <stdexcept> #include <string> namespace cl = llvm::cl; @@ -31,9 +30,9 @@ namespace { std::stringstream* GlobalTimeLog; - sys::Path getTempDir() { - sys::Path tempDir; - + /// GetTempDir - Get the temporary directory location. Returns non-zero value + /// on error. + int GetTempDir(sys::Path& tempDir) { // The --temp-dir option. if (!TempDirname.empty()) { tempDir = TempDirname; @@ -41,7 +40,7 @@ namespace { // GCC 4.5-style -save-temps handling. 
else if (SaveTemps == SaveTempsEnum::Unset) { tempDir = sys::Path::GetTemporaryDirectory(); - return tempDir; + return 0; } else if (SaveTemps == SaveTempsEnum::Obj && !OutputFilename.empty()) { tempDir = OutputFilename; @@ -49,35 +48,35 @@ namespace { } else { // SaveTemps == Cwd --> use current dir (leave tempDir empty). - return tempDir; + return 0; } if (!tempDir.exists()) { std::string ErrMsg; - if (tempDir.createDirectoryOnDisk(true, &ErrMsg)) - throw std::runtime_error(ErrMsg); + if (tempDir.createDirectoryOnDisk(true, &ErrMsg)) { + PrintError(ErrMsg); + return 1; + } } - return tempDir; + return 0; } - /// BuildTargets - A small wrapper for CompilationGraph::Build. + /// BuildTargets - A small wrapper for CompilationGraph::Build. Returns + /// non-zero value in case of error. int BuildTargets(CompilationGraph& graph, const LanguageMap& langMap) { int ret; - const sys::Path& tempDir = getTempDir(); + sys::Path tempDir; bool toDelete = (SaveTemps == SaveTempsEnum::Unset); - try { - ret = graph.Build(tempDir, langMap); - } - catch(...) { - if (toDelete) - tempDir.eraseFromDisk(true); - throw; - } + if (int ret = GetTempDir(tempDir)) + return ret; + + ret = graph.Build(tempDir, langMap); if (toDelete) tempDir.eraseFromDisk(true); + return ret; } } @@ -89,68 +88,58 @@ void AppendToGlobalTimeLog(const std::string& cmd, double time) { *GlobalTimeLog << "# " << cmd << ' ' << time << '\n'; } -// Sometimes plugins want to condition on the value in argv[0]. +// Sometimes user code wants to access the argv[0] value. const char* ProgramName; int Main(int argc, char** argv) { - try { - LanguageMap langMap; - CompilationGraph graph; - - ProgramName = argv[0]; + int ret = 0; + LanguageMap langMap; + CompilationGraph graph; - cl::ParseCommandLineOptions - (argc, argv, "LLVM Compiler Driver (Work In Progress)", - /* ReadResponseFiles = */ false); + ProgramName = argv[0]; - PluginLoader Plugins; - Plugins.RunInitialization(langMap, graph); + cl::ParseCommandLineOptions + (argc, argv, + /* Overview = */ "LLVM Compiler Driver (Work In Progress)", + /* ReadResponseFiles = */ false); - if (CheckGraph) { - int ret = graph.Check(); - if (!ret) - llvm::errs() << "check-graph: no errors found.\n"; + if (int ret = autogenerated::RunInitialization(langMap, graph)) + return ret; - return ret; - } + if (CheckGraph) { + ret = graph.Check(); + if (!ret) + llvm::errs() << "check-graph: no errors found.\n"; - if (ViewGraph) { - graph.viewGraph(); - if (!WriteGraph) - return 0; - } + return ret; + } - if (WriteGraph) { - graph.writeGraph(OutputFilename.empty() - ? std::string("compilation-graph.dot") - : OutputFilename); + if (ViewGraph) { + graph.viewGraph(); + if (!WriteGraph) return 0; - } + } - if (Time) { - GlobalTimeLog = new std::stringstream; - GlobalTimeLog->precision(2); - } + if (WriteGraph) { + const std::string& Out = (OutputFilename.empty() + ? std::string("compilation-graph.dot") + : OutputFilename); + return graph.writeGraph(Out); + } - int ret = BuildTargets(graph, langMap); + if (Time) { + GlobalTimeLog = new std::stringstream; + GlobalTimeLog->precision(2); + } - if (Time) { - llvm::errs() << GlobalTimeLog->str(); - delete GlobalTimeLog; - } + ret = BuildTargets(graph, langMap); - return ret; - } - catch(llvmc::error_code& ec) { - return ec.code(); + if (Time) { + llvm::errs() << GlobalTimeLog->str(); + delete GlobalTimeLog; } - catch(const std::exception& ex) { - llvm::errs() << argv[0] << ": " << ex.what() << '\n'; - } - catch(...) 
{ - llvm::errs() << argv[0] << ": unknown error!\n"; - } - return 1; + + return ret; } } // end namespace llvmc diff --git a/lib/CompilerDriver/Makefile b/lib/CompilerDriver/Makefile index 66c6d11552fc7..8e8b73ca8f83f 100644 --- a/lib/CompilerDriver/Makefile +++ b/lib/CompilerDriver/Makefile @@ -10,39 +10,11 @@ LEVEL = ../.. # We don't want this library to appear in `llvm-config --libs` output, so its -# name doesn't start with "LLVM". +# name doesn't start with "LLVM" and NO_LLVM_CONFIG is set. -ifeq ($(ENABLE_LLVMC_DYNAMIC),1) - LIBRARYNAME = libCompilerDriver - LLVMLIBS = LLVMSupport.a LLVMSystem.a - LOADABLE_MODULE := 1 -else - LIBRARYNAME = CompilerDriver - LINK_COMPONENTS = support system -endif +LIBRARYNAME = CompilerDriver +LINK_COMPONENTS = support system +NO_LLVM_CONFIG = 1 -REQUIRES_EH := 1 -REQUIRES_RTTI := 1 include $(LEVEL)/Makefile.common - -ifeq ($(ENABLE_LLVMC_DYNAMIC_PLUGINS), 1) - CPP.Flags += -DENABLE_LLVMC_DYNAMIC_PLUGINS -endif - -# Copy libCompilerDriver to the bin dir so that llvmc can find it. -ifeq ($(ENABLE_LLVMC_DYNAMIC),1) - -FullLibName = $(LIBRARYNAME)$(SHLIBEXT) - -all-local:: $(ToolDir)/$(FullLibName) - -$(ToolDir)/$(FullLibName): $(LibDir)/$(FullLibName) $(ToolDir)/.dir - $(Echo) Copying $(BuildMode) Shared Library $(FullLibName) to $@ - -$(Verb) $(CP) $< $@ - -clean-local:: - $(Echo) Removing $(BuildMode) Shared Library $(FullLibName) \ - from $(ToolDir) - -$(Verb) $(RM) -f $(ToolDir)/$(FullLibName) -endif diff --git a/lib/CompilerDriver/Plugin.cpp b/lib/CompilerDriver/Plugin.cpp deleted file mode 100644 index 0fdfef4c6a29b..0000000000000 --- a/lib/CompilerDriver/Plugin.cpp +++ /dev/null @@ -1,78 +0,0 @@ -//===--- Plugin.cpp - The LLVM Compiler Driver ------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open -// Source License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Plugin support. -// -//===----------------------------------------------------------------------===// - -#include "llvm/CompilerDriver/Plugin.h" -#include "llvm/Support/ManagedStatic.h" -#include "llvm/System/Mutex.h" -#include <algorithm> -#include <vector> - -namespace { - - // Registry::Add<> does not do lifetime management (probably issues - // with static constructor/destructor ordering), so we have to - // implement it here. - // - // All this static registration/life-before-main model seems - // unnecessary convoluted to me. 
- - static bool pluginListInitialized = false; - typedef std::vector<const llvmc::BasePlugin*> PluginList; - static PluginList Plugins; - static llvm::ManagedStatic<llvm::sys::SmartMutex<true> > PluginMutex; - - struct ByPriority { - bool operator()(const llvmc::BasePlugin* lhs, - const llvmc::BasePlugin* rhs) { - return lhs->Priority() < rhs->Priority(); - } - }; -} - -namespace llvmc { - - PluginLoader::PluginLoader() { - llvm::sys::SmartScopedLock<true> Lock(*PluginMutex); - if (!pluginListInitialized) { - for (PluginRegistry::iterator B = PluginRegistry::begin(), - E = PluginRegistry::end(); B != E; ++B) - Plugins.push_back(B->instantiate()); - std::sort(Plugins.begin(), Plugins.end(), ByPriority()); - } - pluginListInitialized = true; - } - - PluginLoader::~PluginLoader() { - llvm::sys::SmartScopedLock<true> Lock(*PluginMutex); - if (pluginListInitialized) { - for (PluginList::iterator B = Plugins.begin(), E = Plugins.end(); - B != E; ++B) - delete (*B); - } - pluginListInitialized = false; - } - - void PluginLoader::RunInitialization(LanguageMap& langMap, - CompilationGraph& graph) const - { - llvm::sys::SmartScopedLock<true> Lock(*PluginMutex); - for (PluginList::iterator B = Plugins.begin(), E = Plugins.end(); - B != E; ++B) { - const BasePlugin* BP = *B; - BP->PreprocessOptions(); - BP->PopulateLanguageMap(langMap); - BP->PopulateCompilationGraph(graph); - } - } - -} diff --git a/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/lib/ExecutionEngine/ExecutionEngineBindings.cpp index c7495d442d9cf..f8f1f4a78ee53 100644 --- a/lib/ExecutionEngine/ExecutionEngineBindings.cpp +++ b/lib/ExecutionEngine/ExecutionEngineBindings.cpp @@ -236,6 +236,10 @@ LLVMBool LLVMFindFunction(LLVMExecutionEngineRef EE, const char *Name, return 1; } +void *LLVMRecompileAndRelinkFunction(LLVMExecutionEngineRef EE, LLVMValueRef Fn) { + return unwrap(EE)->recompileAndRelinkFunction(unwrap<Function>(Fn)); +} + LLVMTargetDataRef LLVMGetExecutionEngineTargetData(LLVMExecutionEngineRef EE) { return wrap(unwrap(EE)->getTargetData()); } diff --git a/lib/ExecutionEngine/JIT/Intercept.cpp b/lib/ExecutionEngine/JIT/Intercept.cpp index b367033d32b56..274f816f39e11 100644 --- a/lib/ExecutionEngine/JIT/Intercept.cpp +++ b/lib/ExecutionEngine/JIT/Intercept.cpp @@ -89,6 +89,10 @@ static int jit_atexit(void (*Fn)()) { return 0; // Always successful } +static int jit_noop() { + return 0; +} + //===----------------------------------------------------------------------===// // /// getPointerToNamedFunction - This method returns the address of the specified @@ -104,6 +108,14 @@ void *JIT::getPointerToNamedFunction(const std::string &Name, if (Name == "exit") return (void*)(intptr_t)&jit_exit; if (Name == "atexit") return (void*)(intptr_t)&jit_atexit; + // We should not invoke parent's ctors/dtors from generated main()! + // On Mingw and Cygwin, the symbol __main is resolved to + // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors + // (and register wrong callee's dtors with atexit(3)). + // We expect ExecutionEngine::runStaticConstructorsDestructors() + // is called before ExecutionEngine::runFunctionAsMain() is called. + if (Name == "__main") return (void*)(intptr_t)&jit_noop; + const char *NameStr = Name.c_str(); // If this is an asm specifier, skip the sentinal. 
if (NameStr[0] == 1) ++NameStr; diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp index 67bd3ed10ad97..63125b79c8e26 100644 --- a/lib/ExecutionEngine/JIT/JIT.cpp +++ b/lib/ExecutionEngine/JIT/JIT.cpp @@ -67,7 +67,7 @@ extern "C" void LLVMLinkInJIT() { } -#if defined(__GNUC__) && !defined(__ARM__EABI__) +#if defined(__GNUC__) && !defined(__ARM_EABI__) && !defined(__USING_SJLJ_EXCEPTIONS__) // libgcc defines the __register_frame function to dynamically register new // dwarf frames for exception handling. This functionality is not portable @@ -219,10 +219,8 @@ ExecutionEngine *JIT::createJIT(Module *M, StringRef MArch, StringRef MCPU, const SmallVectorImpl<std::string>& MAttrs) { - // Make sure we can resolve symbols in the program as well. The zero arg - // to the function tells DynamicLibrary to load the program, not a library. - if (sys::DynamicLibrary::LoadLibraryPermanently(0, ErrorStr)) - return 0; + // Try to register the program as a source of symbols to resolve against. + sys::DynamicLibrary::LoadLibraryPermanently(0, NULL); // Pick a target either via -march or by guessing the native arch. TargetMachine *TM = JIT::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr); @@ -308,7 +306,7 @@ JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji, } // Register routine for informing unwinding runtime about new EH frames -#if defined(__GNUC__) && !defined(__ARM_EABI__) +#if defined(__GNUC__) && !defined(__ARM_EABI__) && !defined(__USING_SJLJ_EXCEPTIONS__) #if USE_KEYMGR struct LibgccObjectInfo* LOI = (struct LibgccObjectInfo*) _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST); diff --git a/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp b/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp index 749a57d92c948..6e11a3cd9368d 100644 --- a/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp +++ b/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp @@ -90,8 +90,8 @@ std::string JITDebugRegisterer::MakeELF(const Function *F, DebugInfo &I) { // section. This allows GDB to get a good stack trace, particularly on // linux x86_64. Mark this as a PROGBITS section that needs to be loaded // into memory at runtime. - ELFSection &EH = EW.getSection(".eh_frame", ELFSection::SHT_PROGBITS, - ELFSection::SHF_ALLOC); + ELFSection &EH = EW.getSection(".eh_frame", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC); // Pointers in the DWARF EH info are all relative to the EH frame start, // which is stored here. EH.Addr = (uint64_t)I.EhStart; @@ -102,9 +102,9 @@ std::string JITDebugRegisterer::MakeELF(const Function *F, DebugInfo &I) { // Add this single function to the symbol table, so the debugger prints the // name instead of '???'. We give the symbol default global visibility. ELFSym *FnSym = ELFSym::getGV(F, - ELFSym::STB_GLOBAL, - ELFSym::STT_FUNC, - ELFSym::STV_DEFAULT); + ELF::STB_GLOBAL, + ELF::STT_FUNC, + ELF::STV_DEFAULT); FnSym->SectionIdx = Text.SectionIdx; FnSym->Size = I.FnEnd - I.FnStart; FnSym->Value = 0; // Offset from start of section. 
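Returning briefly to the Intercept.cpp change further up: getPointerToNamedFunction is consulted before any real symbol lookup, and the commit adds "__main" to its short interception table, bound to a do-nothing function, so MinGW/Cygwin startup hooks are not run a second time from JITed code. A standalone sketch of that layer, with a toy resolver in place of the JIT:

#include <cstdint>
#include <cstdio>
#include <cstring>

static int jit_noop() { return 0; } // stands in for the swallowed __main

// Resolve a few special names ahead of the normal lookup path.
static void *getPointerToNamedFunction(const char *Name) {
  // Generated main() on MinGW/Cygwin calls __main to run global ctors;
  // the host already ran its own, so hand back a harmless no-op instead.
  if (!std::strcmp(Name, "__main"))
    return (void *)(intptr_t)&jit_noop;
  return 0; // fall through to the dlsym-style search (omitted here)
}

int main() {
  typedef int (*FnPtr)();
  if (void *P = getPointerToNamedFunction("__main")) {
    FnPtr F = (FnPtr)(intptr_t)P;
    std::printf("__main intercepted; returns %d\n", F()); // prints 0
  }
  return 0;
}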
diff --git a/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp b/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp index 4b3ca8759b8ac..1105bcc0437f2 100644 --- a/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp +++ b/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp @@ -595,443 +595,3 @@ JITDwarfEmitter::EmitEHFrame(const Function* Personality, return StartEHPtr; } - -unsigned JITDwarfEmitter::GetDwarfTableSizeInBytes(MachineFunction& F, - JITCodeEmitter& jce, - unsigned char* StartFunction, - unsigned char* EndFunction) { - const TargetMachine& TM = F.getTarget(); - TD = TM.getTargetData(); - stackGrowthDirection = TM.getFrameInfo()->getStackGrowthDirection(); - RI = TM.getRegisterInfo(); - JCE = &jce; - unsigned FinalSize = 0; - - FinalSize += GetExceptionTableSizeInBytes(&F); - - const std::vector<const Function *> Personalities = MMI->getPersonalities(); - FinalSize += - GetCommonEHFrameSizeInBytes(Personalities[MMI->getPersonalityIndex()]); - - FinalSize += GetEHFrameSizeInBytes(Personalities[MMI->getPersonalityIndex()], - StartFunction); - - return FinalSize; -} - -/// RoundUpToAlign - Add the specified alignment to FinalSize and returns -/// the new value. -static unsigned RoundUpToAlign(unsigned FinalSize, unsigned Alignment) { - if (Alignment == 0) Alignment = 1; - // Since we do not know where the buffer will be allocated, be pessimistic. - return FinalSize + Alignment; -} - -unsigned -JITDwarfEmitter::GetEHFrameSizeInBytes(const Function* Personality, - unsigned char* StartFunction) const { - unsigned PointerSize = TD->getPointerSize(); - unsigned FinalSize = 0; - // EH frame header. - FinalSize += PointerSize; - // FDE CIE Offset - FinalSize += 3 * PointerSize; - // If there is a personality and landing pads then point to the language - // specific data area in the exception table. - if (Personality) { - FinalSize += MCAsmInfo::getULEB128Size(4); - FinalSize += PointerSize; - } else { - FinalSize += MCAsmInfo::getULEB128Size(0); - } - - // Indicate locations of function specific callee saved registers in - // frame. - FinalSize += GetFrameMovesSizeInBytes((intptr_t)StartFunction, - MMI->getFrameMoves()); - - FinalSize = RoundUpToAlign(FinalSize, 4); - - // Double zeroes for the unwind runtime - FinalSize += 2 * PointerSize; - - return FinalSize; -} - -unsigned JITDwarfEmitter::GetCommonEHFrameSizeInBytes(const Function* Personality) - const { - - unsigned PointerSize = TD->getPointerSize(); - int stackGrowth = stackGrowthDirection == TargetFrameInfo::StackGrowsUp ? - PointerSize : -PointerSize; - unsigned FinalSize = 0; - // EH Common Frame header - FinalSize += PointerSize; - FinalSize += 4; - FinalSize += 1; - FinalSize += Personality ? 
5 : 3; // "zPLR" or "zR" - FinalSize += MCAsmInfo::getULEB128Size(1); - FinalSize += MCAsmInfo::getSLEB128Size(stackGrowth); - FinalSize += 1; - - if (Personality) { - FinalSize += MCAsmInfo::getULEB128Size(7); - - // Encoding - FinalSize+= 1; - //Personality - FinalSize += PointerSize; - - FinalSize += MCAsmInfo::getULEB128Size(dwarf::DW_EH_PE_pcrel); - FinalSize += MCAsmInfo::getULEB128Size(dwarf::DW_EH_PE_pcrel); - - } else { - FinalSize += MCAsmInfo::getULEB128Size(1); - FinalSize += MCAsmInfo::getULEB128Size(dwarf::DW_EH_PE_pcrel); - } - - std::vector<MachineMove> Moves; - RI->getInitialFrameState(Moves); - FinalSize += GetFrameMovesSizeInBytes(0, Moves); - FinalSize = RoundUpToAlign(FinalSize, 4); - return FinalSize; -} - -unsigned -JITDwarfEmitter::GetFrameMovesSizeInBytes(intptr_t BaseLabelPtr, - const std::vector<MachineMove> &Moves) const { - unsigned PointerSize = TD->getPointerSize(); - int stackGrowth = stackGrowthDirection == TargetFrameInfo::StackGrowsUp ? - PointerSize : -PointerSize; - bool IsLocal = BaseLabelPtr; - unsigned FinalSize = 0; - - for (unsigned i = 0, N = Moves.size(); i < N; ++i) { - const MachineMove &Move = Moves[i]; - MCSymbol *Label = Move.getLabel(); - - // Throw out move if the label is invalid. - if (Label && (*JCE->getLabelLocations())[Label] == 0) - continue; - - intptr_t LabelPtr = 0; - if (Label) LabelPtr = JCE->getLabelAddress(Label); - - const MachineLocation &Dst = Move.getDestination(); - const MachineLocation &Src = Move.getSource(); - - // Advance row if new location. - if (BaseLabelPtr && Label && (BaseLabelPtr != LabelPtr || !IsLocal)) { - FinalSize++; - FinalSize += PointerSize; - BaseLabelPtr = LabelPtr; - IsLocal = true; - } - - // If advancing cfa. - if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) { - if (!Src.isReg()) { - if (Src.getReg() == MachineLocation::VirtualFP) { - ++FinalSize; - } else { - ++FinalSize; - unsigned RegNum = RI->getDwarfRegNum(Src.getReg(), true); - FinalSize += MCAsmInfo::getULEB128Size(RegNum); - } - - int Offset = -Src.getOffset(); - - FinalSize += MCAsmInfo::getULEB128Size(Offset); - } else { - llvm_unreachable("Machine move no supported yet."); - } - } else if (Src.isReg() && - Src.getReg() == MachineLocation::VirtualFP) { - if (Dst.isReg()) { - ++FinalSize; - unsigned RegNum = RI->getDwarfRegNum(Dst.getReg(), true); - FinalSize += MCAsmInfo::getULEB128Size(RegNum); - } else { - llvm_unreachable("Machine move no supported yet."); - } - } else { - unsigned Reg = RI->getDwarfRegNum(Src.getReg(), true); - int Offset = Dst.getOffset() / stackGrowth; - - if (Offset < 0) { - ++FinalSize; - FinalSize += MCAsmInfo::getULEB128Size(Reg); - FinalSize += MCAsmInfo::getSLEB128Size(Offset); - } else if (Reg < 64) { - ++FinalSize; - FinalSize += MCAsmInfo::getULEB128Size(Offset); - } else { - ++FinalSize; - FinalSize += MCAsmInfo::getULEB128Size(Reg); - FinalSize += MCAsmInfo::getULEB128Size(Offset); - } - } - } - return FinalSize; -} - -unsigned -JITDwarfEmitter::GetExceptionTableSizeInBytes(MachineFunction* MF) const { - unsigned FinalSize = 0; - - // Map all labels and get rid of any dead landing pads. - MMI->TidyLandingPads(JCE->getLabelLocations()); - - const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos(); - const std::vector<unsigned> &FilterIds = MMI->getFilterIds(); - const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads(); - if (PadInfos.empty()) return 0; - - // Sort the landing pads in order of their type ids. This is used to fold - // duplicate actions. 
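A note on the arithmetic that dominates this removed estimator: it leans on MCAsmInfo::getULEB128Size and getSLEB128Size, which predict how many bytes a value occupies in LEB128 form at seven payload bits per byte. A small standalone version of both computations, assuming an arithmetic right shift for the signed case:

#include <cstdio>

// Bytes needed for an unsigned LEB128 value: 7 payload bits per byte.
static unsigned getULEB128Size(unsigned long long V) {
  unsigned Size = 0;
  do { V >>= 7; ++Size; } while (V);
  return Size;
}

// Signed LEB128 stops once the rest of the value is pure sign extension.
static unsigned getSLEB128Size(long long V) {
  unsigned Size = 0;
  bool More = true;
  while (More) {
    unsigned char Byte = V & 0x7f;
    V >>= 7; // assumes arithmetic shift for negative values
    More = !((V == 0 && !(Byte & 0x40)) || (V == -1 && (Byte & 0x40)));
    ++Size;
  }
  return Size;
}

int main() {
  std::printf("%u %u %u\n", getULEB128Size(0), getULEB128Size(127),
              getULEB128Size(128));                             // 1 1 2
  std::printf("%u %u\n", getSLEB128Size(-1), getSLEB128Size(64)); // 1 2
  return 0;
}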
- SmallVector<const LandingPadInfo *, 64> LandingPads; - LandingPads.reserve(PadInfos.size()); - for (unsigned i = 0, N = PadInfos.size(); i != N; ++i) - LandingPads.push_back(&PadInfos[i]); - std::sort(LandingPads.begin(), LandingPads.end(), PadLT); - - // Negative type ids index into FilterIds, positive type ids index into - // TypeInfos. The value written for a positive type id is just the type - // id itself. For a negative type id, however, the value written is the - // (negative) byte offset of the corresponding FilterIds entry. The byte - // offset is usually equal to the type id, because the FilterIds entries - // are written using a variable width encoding which outputs one byte per - // entry as long as the value written is not too large, but can differ. - // This kind of complication does not occur for positive type ids because - // type infos are output using a fixed width encoding. - // FilterOffsets[i] holds the byte offset corresponding to FilterIds[i]. - SmallVector<int, 16> FilterOffsets; - FilterOffsets.reserve(FilterIds.size()); - int Offset = -1; - for(std::vector<unsigned>::const_iterator I = FilterIds.begin(), - E = FilterIds.end(); I != E; ++I) { - FilterOffsets.push_back(Offset); - Offset -= MCAsmInfo::getULEB128Size(*I); - } - - // Compute the actions table and gather the first action index for each - // landing pad site. - SmallVector<ActionEntry, 32> Actions; - SmallVector<unsigned, 64> FirstActions; - FirstActions.reserve(LandingPads.size()); - - int FirstAction = 0; - unsigned SizeActions = 0; - for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) { - const LandingPadInfo *LP = LandingPads[i]; - const std::vector<int> &TypeIds = LP->TypeIds; - const unsigned NumShared = i ? SharedTypeIds(LP, LandingPads[i-1]) : 0; - unsigned SizeSiteActions = 0; - - if (NumShared < TypeIds.size()) { - unsigned SizeAction = 0; - ActionEntry *PrevAction = 0; - - if (NumShared) { - const unsigned SizePrevIds = LandingPads[i-1]->TypeIds.size(); - assert(Actions.size()); - PrevAction = &Actions.back(); - SizeAction = MCAsmInfo::getSLEB128Size(PrevAction->NextAction) + - MCAsmInfo::getSLEB128Size(PrevAction->ValueForTypeID); - for (unsigned j = NumShared; j != SizePrevIds; ++j) { - SizeAction -= MCAsmInfo::getSLEB128Size(PrevAction->ValueForTypeID); - SizeAction += -PrevAction->NextAction; - PrevAction = PrevAction->Previous; - } - } - - // Compute the actions. - for (unsigned I = NumShared, M = TypeIds.size(); I != M; ++I) { - int TypeID = TypeIds[I]; - assert(-1-TypeID < (int)FilterOffsets.size() && "Unknown filter id!"); - int ValueForTypeID = TypeID < 0 ? FilterOffsets[-1 - TypeID] : TypeID; - unsigned SizeTypeID = MCAsmInfo::getSLEB128Size(ValueForTypeID); - - int NextAction = SizeAction ? -(SizeAction + SizeTypeID) : 0; - SizeAction = SizeTypeID + MCAsmInfo::getSLEB128Size(NextAction); - SizeSiteActions += SizeAction; - - ActionEntry Action = {ValueForTypeID, NextAction, PrevAction}; - Actions.push_back(Action); - - PrevAction = &Actions.back(); - } - - // Record the first action of the landing pad site. - FirstAction = SizeActions + SizeSiteActions - SizeAction + 1; - } // else identical - re-use previous FirstAction - - FirstActions.push_back(FirstAction); - - // Compute this sites contribution to size. - SizeActions += SizeSiteActions; - } - - // Compute the call-site table. Entries must be ordered by address. 
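The removed loop below builds that call-site table and folds adjacent entries: when a new try-range carries the same landing pad and action as its predecessor, the predecessor's end label is simply extended. A toy sketch of the merge, with integers standing in for labels:

#include <cstdio>
#include <vector>

struct Site { int Begin, End, Pad, Action; };

int main() {
  // Try-ranges already in address order.
  std::vector<Site> In = { {0, 4, 100, 1}, {4, 9, 100, 1}, {9, 12, 200, 2} };
  std::vector<Site> Out;
  for (const Site &S : In) {
    if (!Out.empty() && Out.back().Pad == S.Pad &&
        Out.back().Action == S.Action)
      Out.back().End = S.End; // extend the previous entry's range
    else
      Out.push_back(S);       // otherwise start a new call-site entry
  }
  for (const Site &S : Out)
    std::printf("[%d,%d) pad=%d action=%d\n", S.Begin, S.End, S.Pad, S.Action);
  // prints: [0,9) pad=100 action=1 and [9,12) pad=200 action=2
  return 0;
}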
- SmallVector<CallSiteEntry, 64> CallSites; - - RangeMapType PadMap; - for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) { - const LandingPadInfo *LandingPad = LandingPads[i]; - for (unsigned j=0, E = LandingPad->BeginLabels.size(); j != E; ++j) { - MCSymbol *BeginLabel = LandingPad->BeginLabels[j]; - assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!"); - PadRange P = { i, j }; - PadMap[BeginLabel] = P; - } - } - - bool MayThrow = false; - MCSymbol *LastLabel = 0; - for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); - I != E; ++I) { - for (MachineBasicBlock::const_iterator MI = I->begin(), E = I->end(); - MI != E; ++MI) { - if (!MI->isLabel()) { - MayThrow |= MI->getDesc().isCall(); - continue; - } - - MCSymbol *BeginLabel = MI->getOperand(0).getMCSymbol(); - - if (BeginLabel == LastLabel) - MayThrow = false; - - RangeMapType::iterator L = PadMap.find(BeginLabel); - - if (L == PadMap.end()) - continue; - - PadRange P = L->second; - const LandingPadInfo *LandingPad = LandingPads[P.PadIndex]; - - assert(BeginLabel == LandingPad->BeginLabels[P.RangeIndex] && - "Inconsistent landing pad map!"); - - // If some instruction between the previous try-range and this one may - // throw, create a call-site entry with no landing pad for the region - // between the try-ranges. - if (MayThrow) { - CallSiteEntry Site = {LastLabel, BeginLabel, 0, 0}; - CallSites.push_back(Site); - } - - LastLabel = LandingPad->EndLabels[P.RangeIndex]; - CallSiteEntry Site = {BeginLabel, LastLabel, - LandingPad->LandingPadLabel, FirstActions[P.PadIndex]}; - - assert(Site.BeginLabel && Site.EndLabel && Site.PadLabel && - "Invalid landing pad!"); - - // Try to merge with the previous call-site. - if (CallSites.size()) { - CallSiteEntry &Prev = CallSites.back(); - if (Site.PadLabel == Prev.PadLabel && Site.Action == Prev.Action) { - // Extend the range of the previous entry. - Prev.EndLabel = Site.EndLabel; - continue; - } - } - - // Otherwise, create a new call-site. - CallSites.push_back(Site); - } - } - // If some instruction between the previous try-range and the end of the - // function may throw, create a call-site entry with no landing pad for the - // region following the try-range. - if (MayThrow) { - CallSiteEntry Site = {LastLabel, 0, 0, 0}; - CallSites.push_back(Site); - } - - // Final tallies. - unsigned SizeSites = CallSites.size() * (sizeof(int32_t) + // Site start. - sizeof(int32_t) + // Site length. - sizeof(int32_t)); // Landing pad. - for (unsigned i = 0, e = CallSites.size(); i < e; ++i) - SizeSites += MCAsmInfo::getULEB128Size(CallSites[i].Action); - - unsigned SizeTypes = TypeInfos.size() * TD->getPointerSize(); - - unsigned TypeOffset = sizeof(int8_t) + // Call site format - // Call-site table length - MCAsmInfo::getULEB128Size(SizeSites) + - SizeSites + SizeActions + SizeTypes; - - unsigned TotalSize = sizeof(int8_t) + // LPStart format - sizeof(int8_t) + // TType format - MCAsmInfo::getULEB128Size(TypeOffset) + // TType base offset - TypeOffset; - - unsigned SizeAlign = (4 - TotalSize) & 3; - - // Begin the exception table. - FinalSize = RoundUpToAlign(FinalSize, 4); - for (unsigned i = 0; i != SizeAlign; ++i) { - ++FinalSize; - } - - unsigned PointerSize = TD->getPointerSize(); - - // Emit the header. 
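The quantities just tallied (SizeSites, SizeActions, SizeTypes, TypeOffset) correspond one-for-one to the usual GCC-style LSDA shape. As a sketch of that layout, with the field encodings taken from the counting comments in the emission code below:

//   u8        LPStart encoding        (DW_EH_PE_omit here)
//   u8        TType encoding          (DW_EH_PE_absptr here)
//   uleb128   TType base offset       (= TypeOffset above)
//   u8        call-site encoding      (DW_EH_PE_udata4 here)
//   uleb128   call-site table length  (= SizeSites above)
//   call sites: { start, length, landing pad, uleb128 action } per entry
//   actions:    { sleb128 type index, sleb128 next action } per entry
//   type infos: one pointer each, emitted in reverse order
//   filter ids: one uleb128 each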
- ++FinalSize; - // Asm->EOL("LPStart format (DW_EH_PE_omit)"); - ++FinalSize; - // Asm->EOL("TType format (DW_EH_PE_absptr)"); - ++FinalSize; - // Asm->EOL("TType base offset"); - ++FinalSize; - // Asm->EOL("Call site format (DW_EH_PE_udata4)"); - ++FinalSize; - // Asm->EOL("Call-site table length"); - - // Emit the landing pad site information. - for (unsigned i = 0; i < CallSites.size(); ++i) { - CallSiteEntry &S = CallSites[i]; - - // Asm->EOL("Region start"); - FinalSize += PointerSize; - - //Asm->EOL("Region length"); - FinalSize += PointerSize; - - // Asm->EOL("Landing pad"); - FinalSize += PointerSize; - - FinalSize += MCAsmInfo::getULEB128Size(S.Action); - // Asm->EOL("Action"); - } - - // Emit the actions. - for (unsigned I = 0, N = Actions.size(); I != N; ++I) { - ActionEntry &Action = Actions[I]; - - //Asm->EOL("TypeInfo index"); - FinalSize += MCAsmInfo::getSLEB128Size(Action.ValueForTypeID); - //Asm->EOL("Next action"); - FinalSize += MCAsmInfo::getSLEB128Size(Action.NextAction); - } - - // Emit the type ids. - for (unsigned M = TypeInfos.size(); M; --M) { - // Asm->EOL("TypeInfo"); - FinalSize += PointerSize; - } - - // Emit the filter typeids. - for (unsigned j = 0, M = FilterIds.size(); j < M; ++j) { - unsigned TypeID = FilterIds[j]; - FinalSize += MCAsmInfo::getULEB128Size(TypeID); - //Asm->EOL("Filter TypeInfo index"); - } - - FinalSize = RoundUpToAlign(FinalSize, 4); - - return FinalSize; -} diff --git a/lib/ExecutionEngine/JIT/JITDwarfEmitter.h b/lib/ExecutionEngine/JIT/JITDwarfEmitter.h index e627550d6d0e6..30956820f357e 100644 --- a/lib/ExecutionEngine/JIT/JITDwarfEmitter.h +++ b/lib/ExecutionEngine/JIT/JITDwarfEmitter.h @@ -49,17 +49,6 @@ class JITDwarfEmitter { unsigned char* EndFunction, unsigned char* ExceptionTable) const; - unsigned GetExceptionTableSizeInBytes(MachineFunction* MF) const; - - unsigned - GetFrameMovesSizeInBytes(intptr_t BaseLabelPtr, - const std::vector<MachineMove> &Moves) const; - - unsigned GetCommonEHFrameSizeInBytes(const Function* Personality) const; - - unsigned GetEHFrameSizeInBytes(const Function* Personality, - unsigned char* StartFunction) const; - public: JITDwarfEmitter(JIT& jit); @@ -71,11 +60,6 @@ public: unsigned char* &EHFramePtr); - unsigned GetDwarfTableSizeInBytes(MachineFunction& F, - JITCodeEmitter& JCE, - unsigned char* StartFunction, - unsigned char* EndFunction); - void setModuleInfo(MachineModuleInfo* Info) { MMI = Info; } diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp index 28d79daed350b..4c0d0789cced4 100644 --- a/lib/ExecutionEngine/JIT/JITEmitter.cpp +++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp @@ -152,16 +152,6 @@ namespace { FunctionToCallSitesMap[F].insert(CallSite); } - // Returns the Function of the stub if a stub was erased, or NULL if there - // was no stub. This function uses the call-site->function map to find a - // relevant function, but asserts that only stubs and not other call sites - // will be passed in. - Function *EraseStub(const MutexGuard &locked, void *Stub); - - void EraseAllCallSitesFor(const MutexGuard &locked, Function *F) { - assert(locked.holds(TheJIT->lock)); - EraseAllCallSitesForPrelocked(F); - } void EraseAllCallSitesForPrelocked(Function *F); // Erases _all_ call sites regardless of their function. This is used to @@ -223,9 +213,6 @@ namespace { /// specified GV address. 
void *getGlobalValueIndirectSym(GlobalValue *V, void *GVAddress); - void getRelocatableGVs(SmallVectorImpl<GlobalValue*> &GVs, - SmallVectorImpl<void*> &Ptrs); - /// getGOTIndexForAddress - Return a new or existing index in the GOT for /// an address. This function only manages slots, it does not manage the /// contents of the slots or the memory associated with the GOT. @@ -398,7 +385,6 @@ namespace { /// classof - Methods for support type inquiry through isa, cast, and /// dyn_cast: /// - static inline bool classof(const JITEmitter*) { return true; } static inline bool classof(const MachineCodeEmitter*) { return true; } JITResolver &getJITResolver() { return Resolver; } @@ -480,26 +466,10 @@ namespace { if (DE.get()) DE->setModuleInfo(Info); } - void setMemoryExecutable() { - MemMgr->setMemoryExecutable(); - } - - JITMemoryManager *getMemMgr() const { return MemMgr; } - private: void *getPointerToGlobal(GlobalValue *GV, void *Reference, bool MayNeedFarStub); void *getPointerToGVIndirectSym(GlobalValue *V, void *Reference); - unsigned addSizeOfGlobal(const GlobalVariable *GV, unsigned Size); - unsigned addSizeOfGlobalsInConstantVal( - const Constant *C, unsigned Size, - SmallPtrSet<const GlobalVariable*, 8> &SeenGlobals, - SmallVectorImpl<const GlobalVariable*> &Worklist); - unsigned addSizeOfGlobalsInInitializer( - const Constant *Init, unsigned Size, - SmallPtrSet<const GlobalVariable*, 8> &SeenGlobals, - SmallVectorImpl<const GlobalVariable*> &Worklist); - unsigned GetSizeOfGlobalsInBytes(MachineFunction &MF); }; } @@ -507,39 +477,6 @@ void CallSiteValueMapConfig::onDelete(JITResolverState *JRS, Function *F) { JRS->EraseAllCallSitesForPrelocked(F); } -Function *JITResolverState::EraseStub(const MutexGuard &locked, void *Stub) { - CallSiteToFunctionMapTy::iterator C2F_I = - CallSiteToFunctionMap.find(Stub); - if (C2F_I == CallSiteToFunctionMap.end()) { - // Not a stub. - return NULL; - } - - StubToResolverMap->UnregisterStubResolver(Stub); - - Function *const F = C2F_I->second; -#ifndef NDEBUG - void *RealStub = FunctionToLazyStubMap.lookup(F); - assert(RealStub == Stub && - "Call-site that wasn't a stub passed in to EraseStub"); -#endif - FunctionToLazyStubMap.erase(F); - CallSiteToFunctionMap.erase(C2F_I); - - // Remove the stub from the function->call-sites map, and remove the whole - // entry from the map if that was the last call site. 
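The resolver keeps the call-site and function maps as mutual inverses, which is why erasing a stub (below) must update both sides and drop the per-function set once it empties. A reduced sketch of that pattern, with hypothetical container names (Function standing in for llvm::Function):

#include <map>
#include <set>

std::map<void*, Function*> SiteToFn;             // call site -> callee
std::map<Function*, std::set<void*> > FnToSites; // callee -> its call sites

void eraseSite(void *Site) {
  Function *F = SiteToFn[Site];
  SiteToFn.erase(Site);
  FnToSites[F].erase(Site);
  if (FnToSites[F].empty())  // drop the entry once its last site is gone
    FnToSites.erase(F);
}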
- FunctionToCallSitesMapTy::iterator F2C_I = FunctionToCallSitesMap.find(F); - assert(F2C_I != FunctionToCallSitesMap.end() && - "FunctionToCallSitesMap broken"); - bool Erased = F2C_I->second.erase(Stub); - (void)Erased; - assert(Erased && "FunctionToCallSitesMap broken"); - if (F2C_I->second.empty()) - FunctionToCallSitesMap.erase(F2C_I); - - return F; -} - void JITResolverState::EraseAllCallSitesForPrelocked(Function *F) { FunctionToCallSitesMapTy::iterator F2C = FunctionToCallSitesMap.find(F); if (F2C == FunctionToCallSitesMap.end()) @@ -690,28 +627,6 @@ unsigned JITResolver::getGOTIndexForAddr(void* addr) { return idx; } -void JITResolver::getRelocatableGVs(SmallVectorImpl<GlobalValue*> &GVs, - SmallVectorImpl<void*> &Ptrs) { - MutexGuard locked(TheJIT->lock); - - const FunctionToLazyStubMapTy &FM = state.getFunctionToLazyStubMap(locked); - GlobalToIndirectSymMapTy &GM = state.getGlobalToIndirectSymMap(locked); - - for (FunctionToLazyStubMapTy::const_iterator i = FM.begin(), e = FM.end(); - i != e; ++i){ - Function *F = i->first; - if (F->isDeclaration() && F->hasExternalLinkage()) { - GVs.push_back(i->first); - Ptrs.push_back(i->second); - } - } - for (GlobalToIndirectSymMapTy::iterator i = GM.begin(), e = GM.end(); - i != e; ++i) { - GVs.push_back(i->first); - Ptrs.push_back(i->second); - } -} - /// JITCompilerFn - This function is called when a lazy compilation stub has /// been entered. It looks up which function this stub corresponds to, compiles /// it if necessary, then returns the resultant function pointer. @@ -831,7 +746,7 @@ void JITEmitter::processDebugLoc(DebugLoc DL, bool BeforePrintingInsn) { if (DL.isUnknown()) return; if (!BeforePrintingInsn) return; - const LLVMContext& Context = EmissionDetails.MF->getFunction()->getContext(); + const LLVMContext &Context = EmissionDetails.MF->getFunction()->getContext(); if (DL.getScope(Context) != 0 && PrevDL != DL) { JITEvent_EmittedFunctionDetails::LineStart NextLine; @@ -859,184 +774,6 @@ static unsigned GetConstantPoolSizeInBytes(MachineConstantPool *MCP, return Size; } -static unsigned GetJumpTableSizeInBytes(MachineJumpTableInfo *MJTI, JIT *jit) { - const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); - if (JT.empty()) return 0; - - unsigned NumEntries = 0; - for (unsigned i = 0, e = JT.size(); i != e; ++i) - NumEntries += JT[i].MBBs.size(); - - return NumEntries * MJTI->getEntrySize(*jit->getTargetData()); -} - -static uintptr_t RoundUpToAlign(uintptr_t Size, unsigned Alignment) { - if (Alignment == 0) Alignment = 1; - // Since we do not know where the buffer will be allocated, be pessimistic. - return Size + Alignment; -} - -/// addSizeOfGlobal - add the size of the global (plus any alignment padding) -/// into the running total Size. - -unsigned JITEmitter::addSizeOfGlobal(const GlobalVariable *GV, unsigned Size) { - const Type *ElTy = GV->getType()->getElementType(); - size_t GVSize = (size_t)TheJIT->getTargetData()->getTypeAllocSize(ElTy); - size_t GVAlign = - (size_t)TheJIT->getTargetData()->getPreferredAlignment(GV); - DEBUG(dbgs() << "JIT: Adding in size " << GVSize << " alignment " << GVAlign); - DEBUG(GV->dump()); - // Assume code section ends with worst possible alignment, so first - // variable needs maximal padding. 
- if (Size==0)
- Size = 1;
- Size = ((Size+GVAlign-1)/GVAlign)*GVAlign;
- Size += GVSize;
- return Size;
-}
-
-/// addSizeOfGlobalsInConstantVal - find any globals that we haven't seen yet
-/// but are referenced from the constant; put them in SeenGlobals and the
-/// Worklist, and add their size into the running total Size.
-
-unsigned JITEmitter::addSizeOfGlobalsInConstantVal(
- const Constant *C,
- unsigned Size,
- SmallPtrSet<const GlobalVariable*, 8> &SeenGlobals,
- SmallVectorImpl<const GlobalVariable*> &Worklist) {
- // If it's undefined, return the garbage.
- if (isa<UndefValue>(C))
- return Size;
-
- // If the value is a ConstantExpr, recurse into its operands.
- if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
- Constant *Op0 = CE->getOperand(0);
- switch (CE->getOpcode()) {
- case Instruction::GetElementPtr:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::BitCast: {
- Size = addSizeOfGlobalsInConstantVal(Op0, Size, SeenGlobals, Worklist);
- break;
- }
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- Size = addSizeOfGlobalsInConstantVal(Op0, Size, SeenGlobals, Worklist);
- Size = addSizeOfGlobalsInConstantVal(CE->getOperand(1), Size,
- SeenGlobals, Worklist);
- break;
- }
- default: {
- std::string msg;
- raw_string_ostream Msg(msg);
- Msg << "ConstantExpr not handled: " << *CE;
- report_fatal_error(Msg.str());
- }
- }
- }
-
- if (C->getType()->getTypeID() == Type::PointerTyID)
- if (const GlobalVariable* GV = dyn_cast<GlobalVariable>(C))
- if (SeenGlobals.insert(GV)) {
- Worklist.push_back(GV);
- Size = addSizeOfGlobal(GV, Size);
- }
-
- return Size;
-}
-
-/// addSizeOfGlobalsInInitializer - handle any globals that we haven't seen yet
-/// but are referenced from the given initializer.
-
-unsigned JITEmitter::addSizeOfGlobalsInInitializer(
- const Constant *Init,
- unsigned Size,
- SmallPtrSet<const GlobalVariable*, 8> &SeenGlobals,
- SmallVectorImpl<const GlobalVariable*> &Worklist) {
- if (!isa<UndefValue>(Init) &&
- !isa<ConstantVector>(Init) &&
- !isa<ConstantAggregateZero>(Init) &&
- !isa<ConstantArray>(Init) &&
- !isa<ConstantStruct>(Init) &&
- Init->getType()->isFirstClassType())
- Size = addSizeOfGlobalsInConstantVal(Init, Size, SeenGlobals, Worklist);
- return Size;
-}
-
-/// GetSizeOfGlobalsInBytes - walk the code for the function, looking for
-/// globals; then walk the initializers of those globals looking for more.
-/// If their size has not been considered yet, add it into the running total
-/// Size.
-
-unsigned JITEmitter::GetSizeOfGlobalsInBytes(MachineFunction &MF) {
- unsigned Size = 0;
- SmallPtrSet<const GlobalVariable*, 8> SeenGlobals;
-
- for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
- MBB != E; ++MBB) {
- for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
- I != E; ++I) {
- const TargetInstrDesc &Desc = I->getDesc();
- const MachineInstr &MI = *I;
- unsigned NumOps = Desc.getNumOperands();
- for (unsigned CurOp = 0; CurOp < NumOps; CurOp++) {
- const MachineOperand &MO = MI.getOperand(CurOp);
- if (MO.isGlobal()) {
- const GlobalValue* V = MO.getGlobal();
- const GlobalVariable *GV = dyn_cast<const GlobalVariable>(V);
- if (!GV)
- continue;
- // If seen in previous function, it will have an entry here.
- if (TheJIT->getPointerToGlobalIfAvailable(
- const_cast<GlobalVariable *>(GV)))
- continue;
- // If seen earlier in this function, it will have an entry here.
- // FIXME: it should be possible to combine these tables, by
- // assuming the addresses of the new globals in this module
- // start at 0 (or something) and adjusting them after codegen
- // is complete. Another possibility is to grab a marker bit in GV.
- if (SeenGlobals.insert(GV))
- // A variable as yet unseen. Add in its size.
- Size = addSizeOfGlobal(GV, Size);
- }
- }
- }
- }
- DEBUG(dbgs() << "JIT: About to look through initializers\n");
- // Look for more globals that are referenced only from initializers.
- SmallVector<const GlobalVariable*, 8> Worklist(
- SeenGlobals.begin(), SeenGlobals.end());
- while (!Worklist.empty()) {
- const GlobalVariable* GV = Worklist.back();
- Worklist.pop_back();
- if (GV->hasInitializer())
- Size = addSizeOfGlobalsInInitializer(GV->getInitializer(), Size,
- SeenGlobals, Worklist);
- }
-
- return Size;
-}
-
 void JITEmitter::startFunction(MachineFunction &F) {
 DEBUG(dbgs() << "JIT: Starting CodeGen of Function "
 << F.getFunction()->getName() << "\n");
@@ -1044,43 +781,8 @@ void JITEmitter::startFunction(MachineFunction &F) {
 uintptr_t ActualSize = 0;
 // Set the memory writable, if it's not already
 MemMgr->setMemoryWritable();
- if (MemMgr->NeedsExactSize()) {
- DEBUG(dbgs() << "JIT: ExactSize\n");
- const TargetInstrInfo* TII = F.getTarget().getInstrInfo();
- MachineConstantPool *MCP = F.getConstantPool();
-
- // Ensure the constant pool/jump table info is at least 4-byte aligned.
- ActualSize = RoundUpToAlign(ActualSize, 16);
-
- // Add the alignment of the constant pool
- ActualSize = RoundUpToAlign(ActualSize, MCP->getConstantPoolAlignment());
-
- // Add the constant pool size
- ActualSize += GetConstantPoolSizeInBytes(MCP, TheJIT->getTargetData());
-
- if (MachineJumpTableInfo *MJTI = F.getJumpTableInfo()) {
- // Add the alignment of the jump table info
- ActualSize = RoundUpToAlign(ActualSize,
- MJTI->getEntryAlignment(*TheJIT->getTargetData()));
-
- // Add the jump table size
- ActualSize += GetJumpTableSizeInBytes(MJTI, TheJIT);
- }
-
- // Add the alignment for the function
- ActualSize = RoundUpToAlign(ActualSize,
- std::max(F.getFunction()->getAlignment(), 8U));
-
- // Add the function size
- ActualSize += TII->GetFunctionSizeInBytes(F);
-
- DEBUG(dbgs() << "JIT: ActualSize before globals " << ActualSize << "\n");
- // Add the size of the globals that will be allocated after this function.
- // These are all the ones referenced from this function that were not
- // previously allocated.
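One detail worth noting about the estimate above: the RoundUpToAlign it uses (the static helper removed earlier in this file) is deliberately pessimistic. Since the buffer's eventual address is unknown, it adds the full alignment rather than rounding, so RoundUpToAlign(13, 16) yields 29, an upper bound rather than a true round-up to 16. The whole NeedsExactSize path being removed here was therefore a worst-case size bound, not an exact size.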
- ActualSize += GetSizeOfGlobalsInBytes(F); - DEBUG(dbgs() << "JIT: ActualSize after globals " << ActualSize << "\n"); - } else if (SizeEstimate > 0) { + + if (SizeEstimate > 0) { // SizeEstimate will be non-zero on reallocation attempts. ActualSize = SizeEstimate; } @@ -1268,9 +970,6 @@ bool JITEmitter::finishFunction(MachineFunction &F) { SavedBufferEnd = BufferEnd; SavedCurBufferPtr = CurBufferPtr; - if (MemMgr->NeedsExactSize()) - ActualSize = DE->GetDwarfTableSizeInBytes(F, *this, FnStart, FnEnd); - BufferBegin = CurBufferPtr = MemMgr->startExceptionTable(F.getFunction(), ActualSize); BufferEnd = BufferBegin+ActualSize; diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 8487c83ce36ab..7e8245a9e3a6b 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/System/Path.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/ADT/DenseMap.h" using namespace llvm; @@ -96,15 +97,6 @@ public: return 0; } - /// erase - Remove the specified type, returning true if it was in the set. - bool erase(const Type *Ty) { - if (!TheMap.erase(Ty)) - return false; - if (Ty->isAbstract()) - Ty->removeAbstractTypeUser(this); - return true; - } - /// insert - This returns true if the pointer was new to the set, false if it /// was already in the set. bool insert(const Type *Src, const Type *Dst) { @@ -334,97 +326,6 @@ static bool LinkTypes(Module *Dest, const Module *Src, std::string *Err) { return false; } -#ifndef NDEBUG -static void PrintMap(const std::map<const Value*, Value*> &M) { - for (std::map<const Value*, Value*>::const_iterator I = M.begin(), E =M.end(); - I != E; ++I) { - dbgs() << " Fr: " << (void*)I->first << " "; - I->first->dump(); - dbgs() << " To: " << (void*)I->second << " "; - I->second->dump(); - dbgs() << "\n"; - } -} -#endif - - -// RemapOperand - Use ValueMap to convert constants from one module to another. -static Value *RemapOperand(const Value *In, - std::map<const Value*, Value*> &ValueMap) { - std::map<const Value*,Value*>::const_iterator I = ValueMap.find(In); - if (I != ValueMap.end()) - return I->second; - - // Check to see if it's a constant that we are interested in transforming. - Value *Result = 0; - if (const Constant *CPV = dyn_cast<Constant>(In)) { - if ((!isa<DerivedType>(CPV->getType()) && !isa<ConstantExpr>(CPV)) || - isa<ConstantInt>(CPV) || isa<ConstantAggregateZero>(CPV)) - return const_cast<Constant*>(CPV); // Simple constants stay identical. 
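The hand-rolled RemapOperand being removed below is superseded by the generic MapValue from llvm/Transforms/Utils/ValueMapper.h, which this patch adopts throughout. The replacement call sites reduce to the shape shown here (taken from the hunks later in this file):

// Map a constant from Src into Dest; ValueMap holds the Src-to-Dest
// correspondences built while linking globals and function prototypes.
Constant *SInit =
    cast<Constant>(MapValue(SGV->getInitializer(), ValueMap, true));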
-
- if (const ConstantArray *CPA = dyn_cast<ConstantArray>(CPV)) {
- std::vector<Constant*> Operands(CPA->getNumOperands());
- for (unsigned i = 0, e = CPA->getNumOperands(); i != e; ++i)
- Operands[i] = cast<Constant>(RemapOperand(CPA->getOperand(i), ValueMap));
- Result = ConstantArray::get(cast<ArrayType>(CPA->getType()), Operands);
- } else if (const ConstantStruct *CPS = dyn_cast<ConstantStruct>(CPV)) {
- std::vector<Constant*> Operands(CPS->getNumOperands());
- for (unsigned i = 0, e = CPS->getNumOperands(); i != e; ++i)
- Operands[i] = cast<Constant>(RemapOperand(CPS->getOperand(i), ValueMap));
- Result = ConstantStruct::get(cast<StructType>(CPS->getType()), Operands);
- } else if (isa<ConstantPointerNull>(CPV) || isa<UndefValue>(CPV)) {
- Result = const_cast<Constant*>(CPV);
- } else if (const ConstantVector *CP = dyn_cast<ConstantVector>(CPV)) {
- std::vector<Constant*> Operands(CP->getNumOperands());
- for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i)
- Operands[i] = cast<Constant>(RemapOperand(CP->getOperand(i), ValueMap));
- Result = ConstantVector::get(Operands);
- } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV)) {
- std::vector<Constant*> Ops;
- for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i)
- Ops.push_back(cast<Constant>(RemapOperand(CE->getOperand(i), ValueMap)));
- Result = CE->getWithOperands(Ops);
- } else if (const BlockAddress *CE = dyn_cast<BlockAddress>(CPV)) {
- Result = BlockAddress::get(
- cast<Function>(RemapOperand(CE->getFunction(), ValueMap)),
- CE->getBasicBlock());
- } else {
- assert(!isa<GlobalValue>(CPV) && "Unmapped global?");
- llvm_unreachable("Unknown type of derived type constant value!");
- }
- } else if (const MDNode *MD = dyn_cast<MDNode>(In)) {
- if (MD->isFunctionLocal()) {
- SmallVector<Value*, 4> Elts;
- for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) {
- if (MD->getOperand(i))
- Elts.push_back(RemapOperand(MD->getOperand(i), ValueMap));
- else
- Elts.push_back(NULL);
- }
- Result = MDNode::get(In->getContext(), Elts.data(), MD->getNumOperands());
- } else {
- Result = const_cast<Value*>(In);
- }
- } else if (isa<MDString>(In) || isa<InlineAsm>(In) || isa<Instruction>(In)) {
- Result = const_cast<Value*>(In);
- }
-
- // Cache the mapping in our local map structure
- if (Result) {
- ValueMap[In] = Result;
- return Result;
- }
-
-#ifndef NDEBUG
- dbgs() << "LinkModules ValueMap: \n";
- PrintMap(ValueMap);
-
- dbgs() << "Couldn't remap value: " << (void*)In << " " << *In << "\n";
- llvm_unreachable("Couldn't remap value!");
-#endif
- return 0;
-}
-
 /// ForceRenaming - The LLVM SymbolTable class autorenames globals that conflict
 /// in the symbol table. This is good for all clients except for us. Go
 /// through the trouble to force this back.
@@ -541,25 +442,24 @@ static bool GetLinkageResult(GlobalValue *Dest, const GlobalValue *Src,
 }
 
 // Insert all of the named mdnodes in Src into the Dest module.
-static void LinkNamedMDNodes(Module *Dest, Module *Src) {
+static void LinkNamedMDNodes(Module *Dest, Module *Src,
+ ValueToValueMapTy &ValueMap) {
 for (Module::const_named_metadata_iterator I = Src->named_metadata_begin(),
 E = Src->named_metadata_end(); I != E; ++I) {
 const NamedMDNode *SrcNMD = I;
- NamedMDNode *DestNMD = Dest->getNamedMetadata(SrcNMD->getName());
- if (!DestNMD)
- NamedMDNode::Create(SrcNMD, Dest);
- else {
- // Add Src elements into Dest node.
- for (unsigned i = 0, e = SrcNMD->getNumOperands(); i != e; ++i)
- DestNMD->addOperand(SrcNMD->getOperand(i));
- }
+ NamedMDNode *DestNMD = Dest->getOrInsertNamedMetadata(SrcNMD->getName());
+ // Add Src elements into Dest node.
+ for (unsigned i = 0, e = SrcNMD->getNumOperands(); i != e; ++i)
+ DestNMD->addOperand(cast<MDNode>(MapValue(SrcNMD->getOperand(i),
+ ValueMap,
+ true)));
 }
}

// LinkGlobals - Loop through the global variables in the src module and merge
// them into the dest module.
static bool LinkGlobals(Module *Dest, const Module *Src,
- std::map<const Value*, Value*> &ValueMap,
+ ValueToValueMapTy &ValueMap,
 std::multimap<std::string, GlobalVariable *> &AppendingVars,
 std::string *Err) {
 ValueSymbolTable &DestSymTab = Dest->getValueSymbolTable();
@@ -735,6 +635,12 @@ CalculateAliasLinkage(const GlobalValue *SGV, const GlobalValue *DGV) {
 else if (SL == GlobalValue::LinkerPrivateLinkage &&
 DL == GlobalValue::LinkerPrivateLinkage)
 return GlobalValue::LinkerPrivateLinkage;
+ else if (SL == GlobalValue::LinkerPrivateWeakLinkage &&
+ DL == GlobalValue::LinkerPrivateWeakLinkage)
+ return GlobalValue::LinkerPrivateWeakLinkage;
+ else if (SL == GlobalValue::LinkerPrivateWeakDefAutoLinkage &&
+ DL == GlobalValue::LinkerPrivateWeakDefAutoLinkage)
+ return GlobalValue::LinkerPrivateWeakDefAutoLinkage;
 else {
 assert (SL == GlobalValue::PrivateLinkage &&
 DL == GlobalValue::PrivateLinkage && "Unexpected linkage type");
@@ -746,7 +652,7 @@ CalculateAliasLinkage(const GlobalValue *SGV, const GlobalValue *DGV) {
// dest module. We're assuming that all functions/global variables were already
// linked in.
static bool LinkAlias(Module *Dest, const Module *Src,
- std::map<const Value*, Value*> &ValueMap,
+ ValueToValueMapTy &ValueMap,
 std::string *Err) {
 // Loop over all aliases in the src module
 for (Module::const_alias_iterator I = Src->alias_begin(),
@@ -757,7 +663,7 @@ static bool LinkAlias(Module *Dest, const Module *Src,
 
 // Globals were already linked, thus we can just query ValueMap for variant
 // of SAliasee in Dest.
- std::map<const Value*,Value*>::const_iterator VMI = ValueMap.find(SAliasee);
+ ValueToValueMapTy::const_iterator VMI = ValueMap.find(SAliasee);
 assert(VMI != ValueMap.end() && "Aliasee not linked");
 GlobalValue* DAliasee = cast<GlobalValue>(VMI->second);
 GlobalValue* DGV = NULL;
@@ -888,7 +794,7 @@ static bool LinkAlias(Module *Dest, const Module *Src,
 ForceRenaming(NewGA, SGA->getName());
 
 // Remember this mapping so uses in the source module get remapped
- // later by RemapOperand.
+ // later by MapValue.
 ValueMap[SGA] = NewGA;
 }
 
@@ -899,7 +805,7 @@ static bool LinkAlias(Module *Dest, const Module *Src,
// LinkGlobalInits - Update the initializers in the Dest module now that all
// globals that may be referenced are in Dest.
static bool LinkGlobalInits(Module *Dest, const Module *Src,
- std::map<const Value*, Value*> &ValueMap,
+ ValueToValueMapTy &ValueMap,
 std::string *Err) {
 // Loop over all of the globals in the src module, mapping them over as we go
 for (Module::const_global_iterator I = Src->global_begin(),
@@ -909,7 +815,7 @@ static bool LinkGlobalInits(Module *Dest, const Module *Src,
 if (SGV->hasInitializer()) { // Only process initialized GV's
 // Figure out what the initializer looks like in the dest module...
 Constant *SInit =
- cast<Constant>(RemapOperand(SGV->getInitializer(), ValueMap));
+ cast<Constant>(MapValue(SGV->getInitializer(), ValueMap, true));
 // Grab destination global variable or alias.
GlobalValue *DGV = cast<GlobalValue>(ValueMap[SGV]->stripPointerCasts()); @@ -954,7 +860,7 @@ static bool LinkGlobalInits(Module *Dest, const Module *Src, // to the Dest function... // static bool LinkFunctionProtos(Module *Dest, const Module *Src, - std::map<const Value*, Value*> &ValueMap, + ValueToValueMapTy &ValueMap, std::string *Err) { ValueSymbolTable &DestSymTab = Dest->getValueSymbolTable(); @@ -1039,7 +945,7 @@ static bool LinkFunctionProtos(Module *Dest, const Module *Src, ForceRenaming(NewDF, SF->getName()); // Remember this mapping so uses in the source module get remapped - // later by RemapOperand. + // later by MapValue. ValueMap[SF] = NewDF; continue; } @@ -1069,7 +975,7 @@ static bool LinkFunctionProtos(Module *Dest, const Module *Src, // fix up references to values. At this point we know that Dest is an external // function, and that Src is not. static bool LinkFunctionBody(Function *Dest, Function *Src, - std::map<const Value*, Value*> &ValueMap, + ValueToValueMapTy &ValueMap, std::string *Err) { assert(Src && Dest && Dest->isDeclaration() && !Src->isDeclaration()); @@ -1091,12 +997,30 @@ static bool LinkFunctionBody(Function *Dest, Function *Src, // the Source function as operands. Loop through all of the operands of the // functions and patch them up to point to the local versions... // + // This is the same as RemapInstruction, except that it avoids remapping + // instruction and basic block operands. + // for (Function::iterator BB = Dest->begin(), BE = Dest->end(); BB != BE; ++BB) - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + // Remap operands. for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) if (!isa<Instruction>(*OI) && !isa<BasicBlock>(*OI)) - *OI = RemapOperand(*OI, ValueMap); + *OI = MapValue(*OI, ValueMap, true); + + // Remap attached metadata. + SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; + I->getAllMetadata(MDs); + for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator + MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) { + Value *Old = MI->second; + if (!isa<Instruction>(Old) && !isa<BasicBlock>(Old)) { + Value *New = MapValue(Old, ValueMap, true); + if (New != Old) + I->setMetadata(MI->first, cast<MDNode>(New)); + } + } + } // There is no need to map the arguments anymore. for (Function::arg_iterator I = Src->arg_begin(), E = Src->arg_end(); @@ -1111,7 +1035,7 @@ static bool LinkFunctionBody(Function *Dest, Function *Src, // source module into the DestModule. This consists basically of copying the // function over and fixing up references to values. static bool LinkFunctionBodies(Module *Dest, Module *Src, - std::map<const Value*, Value*> &ValueMap, + ValueToValueMapTy &ValueMap, std::string *Err) { // Loop over all of the functions in the src module, mapping them over as we @@ -1319,8 +1243,10 @@ Linker::LinkModules(Module *Dest, Module *Src, std::string *ErrorMsg) { return true; // ValueMap - Mapping of values from what they used to be in Src, to what they - // are now in Dest. - std::map<const Value*, Value*> ValueMap; + // are now in Dest. ValueToValueMapTy is a ValueMap, which involves some + // overhead due to the use of Value handles which the Linker doesn't actually + // need, but this allows us to reuse the ValueMapper code. + ValueToValueMapTy ValueMap; // AppendingVars - Keep track of global variables in the destination module // with appending linkage. 
After the module is linked together, they are
@@ -1334,9 +1260,6 @@ Linker::LinkModules(Module *Dest, Module *Src, std::string *ErrorMsg) {
 AppendingVars.insert(std::make_pair(I->getName(), I));
 }
 
- // Insert all of the named mdnodes in Src into the Dest module.
- LinkNamedMDNodes(Dest, Src);
-
 // Insert all of the globals in src into the Dest module... without linking
 // initializers (which could refer to functions not yet mapped over).
 if (LinkGlobals(Dest, Src, ValueMap, AppendingVars, ErrorMsg))
@@ -1370,6 +1293,11 @@ Linker::LinkModules(Module *Dest, Module *Src, std::string *ErrorMsg) {
 // Resolve all uses of aliases with aliasees
 if (ResolveAliases(Dest)) return true;
 
+ // Remap all of the named mdnodes in Src into the Dest module. We do this
+ // after linking GlobalValues so that MDNodes that reference GlobalValues
+ // are properly remapped.
+ LinkNamedMDNodes(Dest, Src, ValueMap);
+
 // If the source library's module id is in the dependent library list of the
 // destination library, remove it since that module is now linked in.
 sys::Path modId;
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt index fc4f3c69482ac..60a3a3e3e3128 100644 --- a/lib/MC/CMakeLists.txt +++ b/lib/MC/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_llvm_library(LLVMMC
+ ELFObjectWriter.cpp
 MCAsmInfo.cpp
 MCAsmInfoCOFF.cpp
 MCAsmInfoDarwin.cpp
@@ -7,10 +8,12 @@ add_llvm_library(LLVMMC
 MCCodeEmitter.cpp
 MCContext.cpp
 MCDisassembler.cpp
+ MCELFStreamer.cpp
 MCExpr.cpp
 MCInst.cpp
 MCInstPrinter.cpp
 MCLabel.cpp
+ MCDwarf.cpp
 MCLoggingStreamer.cpp
 MCMachOStreamer.cpp
 MCNullStreamer.cpp
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp new file mode 100644 index 0000000000000..cf35b45715e1e --- /dev/null +++ b/lib/MC/ELFObjectWriter.cpp
@@ -0,0 +1,973 @@
+//===- lib/MC/ELFObjectWriter.cpp - ELF File Writer -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ELF object file writer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/ELFObjectWriter.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetAsmBackend.h"
+
+#include "../Target/X86/X86FixupKinds.h"
+
+#include <vector>
+using namespace llvm;
+
+namespace {
+
+ class ELFObjectWriterImpl {
+ static bool isFixupKindX86PCRel(unsigned Kind) {
+ switch (Kind) {
+ default:
+ return false;
+ case X86::reloc_pcrel_1byte:
+ case X86::reloc_pcrel_4byte:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ return true;
+ }
+ }
+
+ /*static bool isFixupKindX86RIPRel(unsigned Kind) {
+ return Kind == X86::reloc_riprel_4byte ||
+ Kind == X86::reloc_riprel_4byte_movq_load;
+ }*/
+
+
+ /// ELFSymbolData - Helper struct for containing some precomputed information
+ /// on symbols.
+ struct ELFSymbolData {
+ MCSymbolData *SymbolData;
+ uint64_t StringIndex;
+ uint32_t SectionIndex;
+
+ // Support lexicographic sorting.
+ bool operator<(const ELFSymbolData &RHS) const {
+ return SymbolData->getSymbol().getName() <
+ RHS.SymbolData->getSymbol().getName();
+ }
+ };
+
+ /// @name Relocation Data
+ /// @{
+
+ struct ELFRelocationEntry {
+ // Make these big enough for both 32-bit and 64-bit
+ uint64_t r_offset;
+ uint64_t r_info;
+ uint64_t r_addend;
+
+ // Sort in descending r_offset order; the writer emits the entries in
+ // reverse, i.e. in ascending r_offset order.
+ bool operator<(const ELFRelocationEntry &RE) const {
+ return RE.r_offset < r_offset;
+ }
+ };
+
+ llvm::DenseMap<const MCSectionData*,
+ std::vector<ELFRelocationEntry> > Relocations;
+ DenseMap<const MCSection*, uint64_t> SectionStringTableIndex;
+
+ /// @}
+ /// @name Symbol Table Data
+ /// @{
+
+ SmallString<256> StringTable;
+ std::vector<ELFSymbolData> LocalSymbolData;
+ std::vector<ELFSymbolData> ExternalSymbolData;
+ std::vector<ELFSymbolData> UndefinedSymbolData;
+
+ /// @}
+
+ ELFObjectWriter *Writer;
+
+ raw_ostream &OS;
+
+ // This holds the current offset into the object file.
+ size_t FileOff;
+
+ unsigned Is64Bit : 1;
+
+ bool HasRelocationAddend;
+
+ // This holds the symbol table index of the last local symbol.
+ unsigned LastLocalSymbolIndex;
+ // This holds the .strtab section index.
+ unsigned StringTableIndex;
+
+ unsigned ShstrtabIndex;
+
+ public:
+ ELFObjectWriterImpl(ELFObjectWriter *_Writer, bool _Is64Bit,
+ bool _HasRelAddend)
+ : Writer(_Writer), OS(Writer->getStream()), FileOff(0),
+ Is64Bit(_Is64Bit), HasRelocationAddend(_HasRelAddend) {
+ }
+
+ void Write8(uint8_t Value) { Writer->Write8(Value); }
+ void Write16(uint16_t Value) { Writer->Write16(Value); }
+ void Write32(uint32_t Value) { Writer->Write32(Value); }
+ //void Write64(uint64_t Value) { Writer->Write64(Value); }
+ void WriteZeros(unsigned N) { Writer->WriteZeros(N); }
+ //void WriteBytes(StringRef Str, unsigned ZeroFillSize = 0) {
+ // Writer->WriteBytes(Str, ZeroFillSize);
+ //}
+
+ void WriteWord(uint64_t W) {
+ if (Is64Bit)
+ Writer->Write64(W);
+ else
+ Writer->Write32(W);
+ }
+
+ void String8(char *buf, uint8_t Value) {
+ buf[0] = Value;
+ }
+
+ void StringLE16(char *buf, uint16_t Value) {
+ buf[0] = char(Value >> 0);
+ buf[1] = char(Value >> 8);
+ }
+
+ void StringLE32(char *buf, uint32_t Value) {
+ StringLE16(buf, uint16_t(Value >> 0));
+ StringLE16(buf + 2, uint16_t(Value >> 16));
+ }
+
+ void StringLE64(char *buf, uint64_t Value) {
+ StringLE32(buf, uint32_t(Value >> 0));
+ StringLE32(buf + 4, uint32_t(Value >> 32));
+ }
+
+ void StringBE16(char *buf, uint16_t Value) {
+ buf[0] = char(Value >> 8);
+ buf[1] = char(Value >> 0);
+ }
+
+ void StringBE32(char *buf, uint32_t Value) {
+ StringBE16(buf, uint16_t(Value >> 16));
+ StringBE16(buf + 2, uint16_t(Value >> 0));
+ }
+
+ void StringBE64(char *buf, uint64_t Value) {
+ StringBE32(buf, uint32_t(Value >> 32));
+ StringBE32(buf + 4, uint32_t(Value >> 0));
+ }
+
+ void String16(char *buf, uint16_t Value) {
+ if (Writer->isLittleEndian())
+ StringLE16(buf, Value);
+ else
+ StringBE16(buf, Value);
+ }
+
+ void String32(char *buf, uint32_t Value) {
+ if (Writer->isLittleEndian())
+ StringLE32(buf, Value);
+ else
+ StringBE32(buf, Value);
+ }
+
+ void String64(char *buf, uint64_t Value) {
+ if (Writer->isLittleEndian())
+ StringLE64(buf, Value);
+ else
+ StringBE64(buf, Value);
+ }
+
+ void WriteHeader(uint64_t SectionDataSize, unsigned NumberOfSections);
+
+ void WriteSymbolEntry(MCDataFragment *F, uint64_t name, uint8_t info,
+ uint64_t value,
uint64_t size,
+ uint8_t other, uint16_t shndx);
+
+ void WriteSymbol(MCDataFragment *F, ELFSymbolData &MSD,
+ const MCAsmLayout &Layout);
+
+ void WriteSymbolTable(MCDataFragment *F, const MCAssembler &Asm,
+ const MCAsmLayout &Layout);
+
+ void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, uint64_t &FixedValue);
+
+ uint64_t getSymbolIndexInSymbolTable(const MCAssembler &Asm,
+ const MCSymbol *S);
+
+ /// ComputeSymbolTable - Compute the symbol table data, filling in the
+ /// string table and the local, external, and undefined symbol lists.
+ void ComputeSymbolTable(MCAssembler &Asm);
+
+ void WriteRelocation(MCAssembler &Asm, MCAsmLayout &Layout,
+ const MCSectionData &SD);
+
+ void WriteRelocations(MCAssembler &Asm, MCAsmLayout &Layout) {
+ for (MCAssembler::const_iterator it = Asm.begin(),
+ ie = Asm.end(); it != ie; ++it) {
+ WriteRelocation(Asm, Layout, *it);
+ }
+ }
+
+ void CreateMetadataSections(MCAssembler &Asm, MCAsmLayout &Layout);
+
+ void ExecutePostLayoutBinding(MCAssembler &Asm) {
+ // Compute symbol table information.
+ ComputeSymbolTable(Asm);
+ }
+
+ void WriteSecHdrEntry(uint32_t Name, uint32_t Type, uint64_t Flags,
+ uint64_t Address, uint64_t Offset,
+ uint64_t Size, uint32_t Link, uint32_t Info,
+ uint64_t Alignment, uint64_t EntrySize);
+
+ void WriteRelocationsFragment(const MCAssembler &Asm, MCDataFragment *F,
+ const MCSectionData *SD);
+
+ void WriteObject(const MCAssembler &Asm, const MCAsmLayout &Layout);
+ };
+
+}
+
+// Emit the ELF header.
+void ELFObjectWriterImpl::WriteHeader(uint64_t SectionDataSize,
+ unsigned NumberOfSections) {
+ // ELF Header
+ // ----------
+ //
+ // Note
+ // ----
+ // The WriteWord method behaves differently for ELF32 and ELF64, writing
+ // 4 bytes in the former and 8 in the latter.
+
+ Write8(0x7f); // e_ident[EI_MAG0]
+ Write8('E'); // e_ident[EI_MAG1]
+ Write8('L'); // e_ident[EI_MAG2]
+ Write8('F'); // e_ident[EI_MAG3]
+
+ Write8(Is64Bit ? ELF::ELFCLASS64 : ELF::ELFCLASS32); // e_ident[EI_CLASS]
+
+ // e_ident[EI_DATA]
+ Write8(Writer->isLittleEndian() ? ELF::ELFDATA2LSB : ELF::ELFDATA2MSB);
+
+ Write8(ELF::EV_CURRENT); // e_ident[EI_VERSION]
+ Write8(ELF::ELFOSABI_LINUX); // e_ident[EI_OSABI]
+ Write8(0); // e_ident[EI_ABIVERSION]
+
+ WriteZeros(ELF::EI_NIDENT - ELF::EI_PAD);
+
+ Write16(ELF::ET_REL); // e_type
+
+ // FIXME: Make this configurable
+ Write16(Is64Bit ? ELF::EM_X86_64 : ELF::EM_386); // e_machine = target
+
+ Write32(ELF::EV_CURRENT); // e_version
+ WriteWord(0); // e_entry, no entry point in .o file
+ WriteWord(0); // e_phoff, no program header for .o
+ WriteWord(SectionDataSize + (Is64Bit ? sizeof(ELF::Elf64_Ehdr) :
+ sizeof(ELF::Elf32_Ehdr))); // e_shoff = sec hdr table off in bytes
+
+ // FIXME: Make this configurable.
+ Write32(0); // e_flags = whatever the target wants
+
+ // e_ehsize = ELF header size
+ Write16(Is64Bit ? sizeof(ELF::Elf64_Ehdr) : sizeof(ELF::Elf32_Ehdr));
+
+ Write16(0); // e_phentsize = prog header entry size
+ Write16(0); // e_phnum = # prog header entries = 0
+
+ // e_shentsize = Section header entry size
+ Write16(Is64Bit ?
sizeof(ELF::Elf64_Shdr) : sizeof(ELF::Elf32_Shdr)); + + // e_shnum = # of section header ents + Write16(NumberOfSections); + + // e_shstrndx = Section # of '.shstrtab' + Write16(ShstrtabIndex); +} + +void ELFObjectWriterImpl::WriteSymbolEntry(MCDataFragment *F, uint64_t name, + uint8_t info, uint64_t value, + uint64_t size, uint8_t other, + uint16_t shndx) { + if (Is64Bit) { + char buf[8]; + + String32(buf, name); + F->getContents() += StringRef(buf, 4); // st_name + + String8(buf, info); + F->getContents() += StringRef(buf, 1); // st_info + + String8(buf, other); + F->getContents() += StringRef(buf, 1); // st_other + + String16(buf, shndx); + F->getContents() += StringRef(buf, 2); // st_shndx + + String64(buf, value); + F->getContents() += StringRef(buf, 8); // st_value + + String64(buf, size); + F->getContents() += StringRef(buf, 8); // st_size + } else { + char buf[4]; + + String32(buf, name); + F->getContents() += StringRef(buf, 4); // st_name + + String32(buf, value); + F->getContents() += StringRef(buf, 4); // st_value + + String32(buf, size); + F->getContents() += StringRef(buf, 4); // st_size + + String8(buf, info); + F->getContents() += StringRef(buf, 1); // st_info + + String8(buf, other); + F->getContents() += StringRef(buf, 1); // st_other + + String16(buf, shndx); + F->getContents() += StringRef(buf, 2); // st_shndx + } +} + +void ELFObjectWriterImpl::WriteSymbol(MCDataFragment *F, ELFSymbolData &MSD, + const MCAsmLayout &Layout) { + MCSymbolData &Data = *MSD.SymbolData; + uint8_t Info = (Data.getFlags() & 0xff); + uint8_t Other = ((Data.getFlags() & 0xf00) >> ELF_STV_Shift); + uint64_t Value = 0; + uint64_t Size = 0; + const MCExpr *ESize; + + if (Data.isCommon() && Data.isExternal()) + Value = Data.getCommonAlignment(); + + if (!Data.isCommon()) + if (MCFragment *FF = Data.getFragment()) + Value = Layout.getSymbolAddress(&Data) - + Layout.getSectionAddress(FF->getParent()); + + ESize = Data.getSize(); + if (Data.getSize()) { + MCValue Res; + if (ESize->getKind() == MCExpr::Binary) { + const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(ESize); + + if (BE->EvaluateAsRelocatable(Res, &Layout)) { + MCSymbolData &A = + Layout.getAssembler().getSymbolData(Res.getSymA()->getSymbol()); + MCSymbolData &B = + Layout.getAssembler().getSymbolData(Res.getSymB()->getSymbol()); + + Size = Layout.getSymbolAddress(&A) - Layout.getSymbolAddress(&B); + } + } else if (ESize->getKind() == MCExpr::Constant) { + Size = static_cast<const MCConstantExpr *>(ESize)->getValue(); + } else { + assert(0 && "Unsupported size expression"); + } + } + + // Write out the symbol table entry + WriteSymbolEntry(F, MSD.StringIndex, Info, Value, + Size, Other, MSD.SectionIndex); +} + +void ELFObjectWriterImpl::WriteSymbolTable(MCDataFragment *F, + const MCAssembler &Asm, + const MCAsmLayout &Layout) { + // The string table must be emitted first because we need the index + // into the string table for all the symbol names. + assert(StringTable.size() && "Missing string table"); + + // FIXME: Make sure the start of the symbol table is aligned. + + // The first entry is the undefined symbol entry. + unsigned EntrySize = Is64Bit ? ELF::SYMENTRY_SIZE64 : ELF::SYMENTRY_SIZE32; + F->getContents().append(EntrySize, '\x00'); + + // Write the symbol table entries. + LastLocalSymbolIndex = LocalSymbolData.size() + 1; + for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i) { + ELFSymbolData &MSD = LocalSymbolData[i]; + WriteSymbol(F, MSD, Layout); + } + + // Write out a symbol table entry for each section. 
+ // leaving out the just-added .symtab, which is at
+ // the very end.
+ unsigned Index = 1;
+ for (MCAssembler::const_iterator it = Asm.begin(),
+ ie = Asm.end(); it != ie; ++it, ++Index) {
+ const MCSectionELF &Section =
+ static_cast<const MCSectionELF&>(it->getSection());
+ // Leave out relocation sections so the section symbol
+ // indexes aren't thrown off.
+ if (Section.getType() == ELF::SHT_RELA || Section.getType() == ELF::SHT_REL)
+ continue;
+ if (Index == Asm.size())
+ continue;
+ WriteSymbolEntry(F, 0, ELF::STT_SECTION, 0, 0, ELF::STV_DEFAULT, Index);
+ LastLocalSymbolIndex++;
+ }
+
+ for (unsigned i = 0, e = ExternalSymbolData.size(); i != e; ++i) {
+ ELFSymbolData &MSD = ExternalSymbolData[i];
+ MCSymbolData &Data = *MSD.SymbolData;
+ assert((Data.getFlags() & ELF_STB_Global) &&
+ "External symbol requires STB_GLOBAL flag");
+ WriteSymbol(F, MSD, Layout);
+ if (Data.getFlags() & ELF_STB_Local)
+ LastLocalSymbolIndex++;
+ }
+
+ for (unsigned i = 0, e = UndefinedSymbolData.size(); i != e; ++i) {
+ ELFSymbolData &MSD = UndefinedSymbolData[i];
+ MCSymbolData &Data = *MSD.SymbolData;
+ Data.setFlags(Data.getFlags() | ELF_STB_Global);
+ WriteSymbol(F, MSD, Layout);
+ if (Data.getFlags() & ELF_STB_Local)
+ LastLocalSymbolIndex++;
+ }
+}
+
+// FIXME: this is currently X86/X86_64 only
+void ELFObjectWriterImpl::RecordRelocation(const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup,
+ MCValue Target,
+ uint64_t &FixedValue) {
+ int64_t Addend = 0;
+ unsigned Index = 0;
+ int64_t Value = Target.getConstant();
+
+ if (!Target.isAbsolute()) {
+ const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+ MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+ const MCSymbolData *Base = Asm.getAtom(Layout, &SD);
+ MCFragment *F = SD.getFragment();
+
+ if (Base) {
+ if (F && (!Symbol->isInSection() || SD.isCommon()) && !SD.isExternal()) {
+ Index = F->getParent()->getOrdinal() + LocalSymbolData.size() + 1;
+ Value += Layout.getSymbolAddress(&SD);
+ } else
+ Index = getSymbolIndexInSymbolTable(Asm, Symbol);
+ if (Base != &SD)
+ Value += Layout.getSymbolAddress(&SD) - Layout.getSymbolAddress(Base);
+ Addend = Value;
+ // Compensate for the addend on i386.
+ if (Is64Bit)
+ Value = 0;
+ } else {
+ if (F) {
+ // Index in .symtab of the section against which this symbol
+ // is being relocated, plus 2 (empty section + absolute symbols).
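The symbol index and relocation type computed in this function end up packed into ELFRelocationEntry::r_info by the setSymbolAndType calls in the next hunk. The packing follows the standard ELF definitions (the spec's ELF64_R_INFO / ELF32_R_INFO macros), sketched here for reference:

#include <stdint.h>

// What Elf64_Rela::setSymbolAndType and Elf32_Rela::setSymbolAndType compute.
uint64_t PackRInfo64(uint32_t Sym, uint32_t Type) {
  return (uint64_t(Sym) << 32) | Type;   // high 32 bits: symbol index
}
uint32_t PackRInfo32(uint32_t Sym, uint8_t Type) {
  return (Sym << 8) | Type;              // low 8 bits: relocation type
}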
+ Index = F->getParent()->getOrdinal() + LocalSymbolData.size() + 1;
+
+ MCSectionData *FSD = F->getParent();
+ // Offset of the symbol in the section
+ Addend = Layout.getSymbolAddress(&SD) - Layout.getSectionAddress(FSD);
+ } else {
+ FixedValue = Value;
+ return;
+ }
+ }
+ }
+
+ FixedValue = Value;
+
+ // Determine the type of the relocation.
+ bool IsPCRel = isFixupKindX86PCRel(Fixup.getKind());
+ unsigned Type;
+ if (Is64Bit) {
+ if (IsPCRel) {
+ Type = ELF::R_X86_64_PC32;
+ } else {
+ switch ((unsigned)Fixup.getKind()) {
+ default: llvm_unreachable("invalid fixup kind!");
+ case FK_Data_8: Type = ELF::R_X86_64_64; break;
+ case X86::reloc_pcrel_4byte:
+ case FK_Data_4:
+ // Check that the constant fits in a signed 32-bit value.
+ if (isInt<32>(Target.getConstant()))
+ Type = ELF::R_X86_64_32S;
+ else
+ Type = ELF::R_X86_64_32;
+ break;
+ case FK_Data_2: Type = ELF::R_X86_64_16; break;
+ case X86::reloc_pcrel_1byte:
+ case FK_Data_1: Type = ELF::R_X86_64_8; break;
+ }
+ }
+ } else {
+ if (IsPCRel) {
+ Type = ELF::R_386_PC32;
+ } else {
+ switch ((unsigned)Fixup.getKind()) {
+ default: llvm_unreachable("invalid fixup kind!");
+ case X86::reloc_pcrel_4byte:
+ case FK_Data_4: Type = ELF::R_386_32; break;
+ case FK_Data_2: Type = ELF::R_386_16; break;
+ case X86::reloc_pcrel_1byte:
+ case FK_Data_1: Type = ELF::R_386_8; break;
+ }
+ }
+ }
+
+ ELFRelocationEntry ERE;
+
+ if (Is64Bit) {
+ struct ELF::Elf64_Rela ERE64;
+ ERE64.setSymbolAndType(Index, Type);
+ ERE.r_info = ERE64.r_info;
+ } else {
+ struct ELF::Elf32_Rela ERE32;
+ ERE32.setSymbolAndType(Index, Type);
+ ERE.r_info = ERE32.r_info;
+ }
+
+ ERE.r_offset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+
+ if (HasRelocationAddend)
+ ERE.r_addend = Addend;
+ else
+ ERE.r_addend = 0; // Silence compiler warning.
+
+ Relocations[Fragment->getParent()].push_back(ERE);
+}
+
+uint64_t
+ELFObjectWriterImpl::getSymbolIndexInSymbolTable(const MCAssembler &Asm,
+ const MCSymbol *S) {
+ MCSymbolData &SD = Asm.getSymbolData(*S);
+
+ // Local symbol.
+ if (!SD.isExternal() && !S->isUndefined())
+ return SD.getIndex() + /* empty symbol */ 1;
+
+ // External or undefined symbol.
+ return SD.getIndex() + Asm.size() + /* empty symbol */ 1;
+}
+
+void ELFObjectWriterImpl::ComputeSymbolTable(MCAssembler &Asm) {
+ // Build section lookup table.
+ DenseMap<const MCSection*, uint8_t> SectionIndexMap;
+ unsigned Index = 1;
+ for (MCAssembler::iterator it = Asm.begin(),
+ ie = Asm.end(); it != ie; ++it, ++Index)
+ SectionIndexMap[&it->getSection()] = Index;
+
+ // Index 0 is always the empty string.
+ StringMap<uint64_t> StringIndexMap;
+ StringTable += '\x00';
+
+ // Add the data for local symbols.
+ for (MCAssembler::symbol_iterator it = Asm.symbol_begin(),
+ ie = Asm.symbol_end(); it != ie; ++it) {
+ const MCSymbol &Symbol = it->getSymbol();
+
+ // Ignore non-linker visible symbols.
+ if (!Asm.isSymbolLinkerVisible(Symbol))
+ continue;
+
+ if (it->isExternal() || Symbol.isUndefined())
+ continue;
+
+ uint64_t &Entry = StringIndexMap[Symbol.getName()];
+ if (!Entry) {
+ Entry = StringTable.size();
+ StringTable += Symbol.getName();
+ StringTable += '\x00';
+ }
+
+ ELFSymbolData MSD;
+ MSD.SymbolData = it;
+ MSD.StringIndex = Entry;
+
+ if (Symbol.isAbsolute()) {
+ MSD.SectionIndex = ELF::SHN_ABS;
+ LocalSymbolData.push_back(MSD);
+ } else {
+ MSD.SectionIndex = SectionIndexMap.lookup(&Symbol.getSection());
+ assert(MSD.SectionIndex && "Invalid section index!");
+ LocalSymbolData.push_back(MSD);
+ }
+ }
+
+ // Now add non-local symbols.
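A worked example of the string-table interning above, with hypothetical names: after adding "foo" and then "bar", the table is laid out as

  offset 0: '\0'      (index 0 is reserved for the empty name)
  offset 1: "foo\0"   (StringIndexMap["foo"] == 1)
  offset 5: "bar\0"   (StringIndexMap["bar"] == 5)

so each symbol's st_name is simply the byte offset of its NUL-terminated name, and repeated names share a single entry.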
+ for (MCAssembler::symbol_iterator it = Asm.symbol_begin(),
+ ie = Asm.symbol_end(); it != ie; ++it) {
+ const MCSymbol &Symbol = it->getSymbol();
+
+ // Ignore non-linker visible symbols.
+ if (!Asm.isSymbolLinkerVisible(Symbol))
+ continue;
+
+ if (!it->isExternal() && !Symbol.isUndefined())
+ continue;
+
+ uint64_t &Entry = StringIndexMap[Symbol.getName()];
+ if (!Entry) {
+ Entry = StringTable.size();
+ StringTable += Symbol.getName();
+ StringTable += '\x00';
+ }
+
+ ELFSymbolData MSD;
+ MSD.SymbolData = it;
+ MSD.StringIndex = Entry;
+
+ if (Symbol.isUndefined()) {
+ MSD.SectionIndex = ELF::SHN_UNDEF;
+ // XXX: for some reason we don't Emit* this
+ it->setFlags(it->getFlags() | ELF_STB_Global);
+ UndefinedSymbolData.push_back(MSD);
+ } else if (Symbol.isAbsolute()) {
+ MSD.SectionIndex = ELF::SHN_ABS;
+ ExternalSymbolData.push_back(MSD);
+ } else if (it->isCommon()) {
+ MSD.SectionIndex = ELF::SHN_COMMON;
+ ExternalSymbolData.push_back(MSD);
+ } else {
+ MSD.SectionIndex = SectionIndexMap.lookup(&Symbol.getSection());
+ assert(MSD.SectionIndex && "Invalid section index!");
+ ExternalSymbolData.push_back(MSD);
+ }
+ }
+
+ // Symbols are required to be in lexicographic order.
+ array_pod_sort(LocalSymbolData.begin(), LocalSymbolData.end());
+ array_pod_sort(ExternalSymbolData.begin(), ExternalSymbolData.end());
+ array_pod_sort(UndefinedSymbolData.begin(), UndefinedSymbolData.end());
+
+ // Set the symbol indices. Local symbols must come before all other
+ // symbols with non-local bindings.
+ Index = 0;
+ for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i)
+ LocalSymbolData[i].SymbolData->setIndex(Index++);
+ for (unsigned i = 0, e = ExternalSymbolData.size(); i != e; ++i)
+ ExternalSymbolData[i].SymbolData->setIndex(Index++);
+ for (unsigned i = 0, e = UndefinedSymbolData.size(); i != e; ++i)
+ UndefinedSymbolData[i].SymbolData->setIndex(Index++);
+}
+
+void ELFObjectWriterImpl::WriteRelocation(MCAssembler &Asm, MCAsmLayout &Layout,
+ const MCSectionData &SD) {
+ if (!Relocations[&SD].empty()) {
+ MCContext &Ctx = Asm.getContext();
+ const MCSection *RelaSection;
+ const MCSectionELF &Section =
+ static_cast<const MCSectionELF&>(SD.getSection());
+
+ const StringRef SectionName = Section.getSectionName();
+ std::string RelaSectionName = HasRelocationAddend ? ".rela" : ".rel";
+ RelaSectionName += SectionName;
+
+ unsigned EntrySize;
+ if (HasRelocationAddend)
+ EntrySize = Is64Bit ? sizeof(ELF::Elf64_Rela) : sizeof(ELF::Elf32_Rela);
+ else
+ EntrySize = Is64Bit ? sizeof(ELF::Elf64_Rel) : sizeof(ELF::Elf32_Rel);
+
+ RelaSection = Ctx.getELFSection(RelaSectionName, HasRelocationAddend ?
+ ELF::SHT_RELA : ELF::SHT_REL, 0,
+ SectionKind::getReadOnly(),
+ false, EntrySize);
+
+ MCSectionData &RelaSD = Asm.getOrCreateSectionData(*RelaSection);
+ RelaSD.setAlignment(1);
+
+ MCDataFragment *F = new MCDataFragment(&RelaSD);
+
+ WriteRelocationsFragment(Asm, F, &SD);
+
+ Asm.AddSectionToTheEnd(RelaSD, Layout);
+ }
+}
+
+void ELFObjectWriterImpl::WriteSecHdrEntry(uint32_t Name, uint32_t Type,
+ uint64_t Flags, uint64_t Address,
+ uint64_t Offset, uint64_t Size,
+ uint32_t Link, uint32_t Info,
+ uint64_t Alignment,
+ uint64_t EntrySize) {
+ Write32(Name); // sh_name: index into string table
+ Write32(Type); // sh_type
+ WriteWord(Flags); // sh_flags
+ WriteWord(Address); // sh_addr
+ WriteWord(Offset); // sh_offset
+ WriteWord(Size); // sh_size
+ Write32(Link); // sh_link
+ Write32(Info); // sh_info
+ WriteWord(Alignment); // sh_addralign
+ WriteWord(EntrySize); // sh_entsize
+}
+
+void ELFObjectWriterImpl::WriteRelocationsFragment(const MCAssembler &Asm,
+ MCDataFragment *F,
+ const MCSectionData *SD) {
+ std::vector<ELFRelocationEntry> &Relocs = Relocations[SD];
+ // Sort by r_offset, just as GNU as does.
+ array_pod_sort(Relocs.begin(), Relocs.end());
+
+ for (unsigned i = 0, e = Relocs.size(); i != e; ++i) {
+ ELFRelocationEntry entry = Relocs[e - i - 1];
+
+ unsigned WordSize = Is64Bit ? 8 : 4;
+ F->getContents() += StringRef((const char *)&entry.r_offset, WordSize);
+ F->getContents() += StringRef((const char *)&entry.r_info, WordSize);
+
+ if (HasRelocationAddend)
+ F->getContents() += StringRef((const char *)&entry.r_addend, WordSize);
+ }
+}
+
+void ELFObjectWriterImpl::CreateMetadataSections(MCAssembler &Asm,
+ MCAsmLayout &Layout) {
+ MCContext &Ctx = Asm.getContext();
+ MCDataFragment *F;
+
+ WriteRelocations(Asm, Layout);
+
+ const MCSection *SymtabSection;
+ unsigned EntrySize = Is64Bit ? ELF::SYMENTRY_SIZE64 : ELF::SYMENTRY_SIZE32;
+
+ SymtabSection = Ctx.getELFSection(".symtab", ELF::SHT_SYMTAB, 0,
+ SectionKind::getReadOnly(),
+ false, EntrySize);
+
+ MCSectionData &SymtabSD = Asm.getOrCreateSectionData(*SymtabSection);
+
+ SymtabSD.setAlignment(Is64Bit ? 8 : 4);
+
+ F = new MCDataFragment(&SymtabSD);
+
+ // Symbol table
+ WriteSymbolTable(F, Asm, Layout);
+ Asm.AddSectionToTheEnd(SymtabSD, Layout);
+
+ const MCSection *StrtabSection;
+ StrtabSection = Ctx.getELFSection(".strtab", ELF::SHT_STRTAB, 0,
+ SectionKind::getReadOnly(), false);
+
+ MCSectionData &StrtabSD = Asm.getOrCreateSectionData(*StrtabSection);
+ StrtabSD.setAlignment(1);
+
+ // FIXME: This isn't right. If the sections get rearranged this will
+ // be wrong. We need a proper lookup.
+ StringTableIndex = Asm.size();
+
+ F = new MCDataFragment(&StrtabSD);
+ F->getContents().append(StringTable.begin(), StringTable.end());
+ Asm.AddSectionToTheEnd(StrtabSD, Layout);
+
+ const MCSection *ShstrtabSection;
+ ShstrtabSection = Ctx.getELFSection(".shstrtab", ELF::SHT_STRTAB, 0,
+ SectionKind::getReadOnly(), false);
+
+ MCSectionData &ShstrtabSD = Asm.getOrCreateSectionData(*ShstrtabSection);
+ ShstrtabSD.setAlignment(1);
+
+ F = new MCDataFragment(&ShstrtabSD);
+
+ // FIXME: This isn't right. If the sections get rearranged this will
+ // be wrong. We need a proper lookup.
+ ShstrtabIndex = Asm.size();
+
+ // Section header string table.
+ //
+ // The first entry of a string table holds a null character so skip
+ // section 0.
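The parameter order of WriteSecHdrEntry above mirrors the on-disk section header layout. For reference, a sketch of the 64-bit form as defined by the ELF specification (the struct name here is illustrative, not an LLVM type):

struct Elf64_Shdr_Layout {
  uint32_t sh_name;       // byte offset of the name in .shstrtab
  uint32_t sh_type;       // SHT_*
  uint64_t sh_flags;
  uint64_t sh_addr;
  uint64_t sh_offset;     // file offset of the section contents
  uint64_t sh_size;
  uint32_t sh_link;       // interpretation depends on sh_type
  uint32_t sh_info;       // interpretation depends on sh_type
  uint64_t sh_addralign;
  uint64_t sh_entsize;    // entry size for table-like sections
};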
+ uint64_t Index = 1; + F->getContents() += '\x00'; + + for (MCAssembler::const_iterator it = Asm.begin(), + ie = Asm.end(); it != ie; ++it) { + const MCSectionELF &Section = + static_cast<const MCSectionELF&>(it->getSection()); + + // Remember the index into the string table so we can write it + // into the sh_name field of the section header table. + SectionStringTableIndex[&it->getSection()] = Index; + + Index += Section.getSectionName().size() + 1; + F->getContents() += Section.getSectionName(); + F->getContents() += '\x00'; + } + + Asm.AddSectionToTheEnd(ShstrtabSD, Layout); +} + +void ELFObjectWriterImpl::WriteObject(const MCAssembler &Asm, + const MCAsmLayout &Layout) { + CreateMetadataSections(const_cast<MCAssembler&>(Asm), + const_cast<MCAsmLayout&>(Layout)); + + // Add 1 for the null section. + unsigned NumSections = Asm.size() + 1; + + uint64_t SectionDataSize = 0; + + for (MCAssembler::const_iterator it = Asm.begin(), + ie = Asm.end(); it != ie; ++it) { + const MCSectionData &SD = *it; + + // Get the size of the section in the output file (including padding). + uint64_t Size = Layout.getSectionFileSize(&SD); + SectionDataSize += Size; + } + + // Write out the ELF header ... + WriteHeader(SectionDataSize, NumSections); + FileOff = Is64Bit ? sizeof(ELF::Elf64_Ehdr) : sizeof(ELF::Elf32_Ehdr); + + // ... then all of the sections ... + DenseMap<const MCSection*, uint64_t> SectionOffsetMap; + + DenseMap<const MCSection*, uint8_t> SectionIndexMap; + + unsigned Index = 1; + for (MCAssembler::const_iterator it = Asm.begin(), + ie = Asm.end(); it != ie; ++it) { + // Remember the offset into the file for this section. + SectionOffsetMap[&it->getSection()] = FileOff; + + SectionIndexMap[&it->getSection()] = Index++; + + const MCSectionData &SD = *it; + FileOff += Layout.getSectionFileSize(&SD); + + Asm.WriteSectionData(it, Layout, Writer); + } + + // ... and then the section header table. + // Should we align the section header table? + // + // Null section first. + WriteSecHdrEntry(0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + + for (MCAssembler::const_iterator it = Asm.begin(), + ie = Asm.end(); it != ie; ++it) { + const MCSectionData &SD = *it; + const MCSectionELF &Section = + static_cast<const MCSectionELF&>(SD.getSection()); + + uint64_t sh_link = 0; + uint64_t sh_info = 0; + + switch(Section.getType()) { + case ELF::SHT_DYNAMIC: + sh_link = SectionStringTableIndex[&it->getSection()]; + sh_info = 0; + break; + + case ELF::SHT_REL: + case ELF::SHT_RELA: { + const MCSection *SymtabSection; + const MCSection *InfoSection; + + SymtabSection = Asm.getContext().getELFSection(".symtab", ELF::SHT_SYMTAB, 0, + SectionKind::getReadOnly(), + false); + sh_link = SectionIndexMap[SymtabSection]; + + // Remove ".rel" and ".rela" prefixes. + unsigned SecNameLen = (Section.getType() == ELF::SHT_REL) ? 4 : 5; + StringRef SectionName = Section.getSectionName().substr(SecNameLen); + + InfoSection = Asm.getContext().getELFSection(SectionName, + ELF::SHT_PROGBITS, 0, + SectionKind::getReadOnly(), + false); + sh_info = SectionIndexMap[InfoSection]; + break; + } + + case ELF::SHT_SYMTAB: + case ELF::SHT_DYNSYM: + sh_link = StringTableIndex; + sh_info = LastLocalSymbolIndex; + break; + + case ELF::SHT_PROGBITS: + case ELF::SHT_STRTAB: + case ELF::SHT_NOBITS: + case ELF::SHT_NULL: + // Nothing to do. 
+ break; + + case ELF::SHT_HASH: + case ELF::SHT_GROUP: + case ELF::SHT_SYMTAB_SHNDX: + default: + assert(0 && "FIXME: sh_type value not supported!"); + break; + } + + WriteSecHdrEntry(SectionStringTableIndex[&it->getSection()], + Section.getType(), Section.getFlags(), + Layout.getSectionAddress(&SD), + SectionOffsetMap.lookup(&SD.getSection()), + Layout.getSectionSize(&SD), sh_link, + sh_info, SD.getAlignment(), + Section.getEntrySize()); + } +} + +ELFObjectWriter::ELFObjectWriter(raw_ostream &OS, + bool Is64Bit, + bool IsLittleEndian, + bool HasRelocationAddend) + : MCObjectWriter(OS, IsLittleEndian) +{ + Impl = new ELFObjectWriterImpl(this, Is64Bit, HasRelocationAddend); +} + +ELFObjectWriter::~ELFObjectWriter() { + delete (ELFObjectWriterImpl*) Impl; +} + +void ELFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm) { + ((ELFObjectWriterImpl*) Impl)->ExecutePostLayoutBinding(Asm); +} + +void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { + ((ELFObjectWriterImpl*) Impl)->RecordRelocation(Asm, Layout, Fragment, Fixup, + Target, FixedValue); +} + +void ELFObjectWriter::WriteObject(const MCAssembler &Asm, + const MCAsmLayout &Layout) { + ((ELFObjectWriterImpl*) Impl)->WriteObject(Asm, Layout); +} diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp index a275be2c53c5f..670b2e9b292a5 100644 --- a/lib/MC/MCAsmInfo.cpp +++ b/lib/MC/MCAsmInfo.cpp @@ -68,7 +68,9 @@ MCAsmInfo::MCAsmInfo() { ExceptionsType = ExceptionHandling::None; DwarfRequiresFrameSection = true; DwarfUsesInlineInfoSection = false; + DwarfUsesAbsoluteLabelForStmtList = true; DwarfSectionOffsetDirective = 0; + DwarfUsesLabelOffsetForRanges = true; HasMicrosoftFastStdCallMangling = false; AsmTransCBE = 0; diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp index 0bd3b2d001e8c..e0e261a63c707 100644 --- a/lib/MC/MCAsmInfoDarwin.cpp +++ b/lib/MC/MCAsmInfoDarwin.cpp @@ -44,5 +44,8 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() { HasDotTypeDotSizeDirective = false; HasNoDeadStrip = true; + + DwarfUsesAbsoluteLabelForStmtList = false; + DwarfUsesLabelOffsetForRanges = false; } diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index e272b60c44759..1cc8fb0b54867 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -31,7 +31,7 @@ class MCAsmStreamer : public MCStreamer { formatted_raw_ostream &OS; const MCAsmInfo &MAI; OwningPtr<MCInstPrinter> InstPrinter; - MCCodeEmitter *Emitter; + OwningPtr<MCCodeEmitter> Emitter; SmallString<128> CommentToEmit; raw_svector_ostream CommentStream; @@ -217,6 +217,7 @@ static inline int64_t truncateToSize(int64_t Value, unsigned Bytes) { void MCAsmStreamer::SwitchSection(const MCSection *Section) { assert(Section && "Cannot switch to a null section!"); if (Section != CurSection) { + PrevSection = CurSection; CurSection = Section; Section->PrintSwitchToSection(MAI, OS); } diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 7d8455492780c..f0e1d7fbc21c5 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -178,8 +178,12 @@ uint64_t MCAsmLayout::getSectionSize(const MCSectionData *SD) const { MCFragment::MCFragment() : Kind(FragmentType(~0)) { } +MCFragment::~MCFragment() { +} + MCFragment::MCFragment(FragmentType _Kind, MCSectionData *_Parent) - : Kind(_Kind), Parent(_Parent), Atom(0), EffectiveSize(~UINT64_C(0)) + : Kind(_Kind), Parent(_Parent), Atom(0), Offset(~UINT64_C(0)), + 
EffectiveSize(~UINT64_C(0))
 {
   if (Parent)
     Parent->getFragmentList().push_back(this);
@@ -207,7 +211,8 @@ MCSymbolData::MCSymbolData(const MCSymbol &_Symbol, MCFragment *_Fragment,
                            uint64_t _Offset, MCAssembler *A)
   : Symbol(&_Symbol), Fragment(_Fragment), Offset(_Offset),
     IsExternal(false), IsPrivateExtern(false),
-    CommonSize(0), CommonAlign(0), Flags(0), Index(0)
+    CommonSize(0), SymbolSize(0), CommonAlign(0),
+    Flags(0), Index(0)
 {
   if (A)
     A->getSymbolList().push_back(this);
@@ -623,8 +628,23 @@ void MCAssembler::WriteSectionData(const MCSectionData *SD,
       switch (it->getKind()) {
       default:
         assert(0 && "Invalid fragment in virtual section!");
+      case MCFragment::FT_Data: {
+        // Check that we aren't trying to write non-zero contents (or fixups)
+        // into a virtual section. This is to support clients which use standard
+        // directives to fill the contents of virtual sections.
+        MCDataFragment &DF = cast<MCDataFragment>(*it);
+        assert(DF.fixup_begin() == DF.fixup_end() &&
+               "Cannot have fixups in virtual section!");
+        for (unsigned i = 0, e = DF.getContents().size(); i != e; ++i)
+          assert(DF.getContents()[i] == 0 &&
+                 "Invalid data value for virtual section!");
+        break;
+      }
       case MCFragment::FT_Align:
-        assert(!cast<MCAlignFragment>(it)->getValueSize() &&
+        // Check that we aren't trying to write a non-zero value into a virtual
+        // section.
+        assert((!cast<MCAlignFragment>(it)->getValueSize() ||
+                !cast<MCAlignFragment>(it)->getValue()) &&
                "Invalid align in virtual section!");
         break;
       case MCFragment::FT_Fill:
@@ -647,7 +667,41 @@ void MCAssembler::WriteSectionData(const MCSectionData *SD,
   assert(OW->getStream().tell() - Start == Layout.getSectionFileSize(SD));
 }
 
-void MCAssembler::Finish() {
+void MCAssembler::AddSectionToTheEnd(MCSectionData &SD, MCAsmLayout &Layout) {
+  // Create dummy fragments and assign section ordinals.
+  unsigned SectionIndex = 0;
+  for (MCAssembler::iterator it = begin(), ie = end(); it != ie; ++it)
+    SectionIndex++;
+
+  SD.setOrdinal(SectionIndex);
+
+  // Assign layout order indices to sections and fragments.
+  unsigned FragmentIndex = 0;
+  unsigned i = 0;
+  for (unsigned e = Layout.getSectionOrder().size(); i != e; ++i) {
+    MCSectionData *SD = Layout.getSectionOrder()[i];
+
+    for (MCSectionData::iterator it2 = SD->begin(),
+           ie2 = SD->end(); it2 != ie2; ++it2)
+      FragmentIndex++;
+  }
+
+  SD.setLayoutOrder(i);
+  for (MCSectionData::iterator it2 = SD.begin(),
+         ie2 = SD.end(); it2 != ie2; ++it2) {
+    it2->setLayoutOrder(FragmentIndex++);
+  }
+  Layout.getSectionOrder().push_back(&SD);
+
+  Layout.LayoutSection(&SD);
+
+  // Layout until everything fits.
+  while (LayoutOnce(Layout))
+    continue;
+
+}
+
+void MCAssembler::Finish(MCObjectWriter *Writer) {
   DEBUG_WITH_TYPE("mc-dump", {
       llvm::errs() << "assembler backend - pre-layout\n--\n";
       dump(); });
@@ -717,9 +771,15 @@ void MCAssembler::Finish() {
       dump(); });
 
   uint64_t StartOffset = OS.tell();
-  llvm::OwningPtr<MCObjectWriter> Writer(getBackend().createObjectWriter(OS));
-  if (!Writer)
-    report_fatal_error("unable to create object writer!");
+
+  llvm::OwningPtr<MCObjectWriter> OwnWriter(0);
+  if (Writer == 0) {
+    // No custom writer supplied: create the default one, lifetime-managed
+    // by the OwningPtr.
+    OwnWriter.reset(getBackend().createObjectWriter(OS));
+    Writer = OwnWriter.get();
+    if (!Writer)
+      report_fatal_error("unable to create object writer!");
+  }
 
   // Allow the object writer a chance to perform post-layout binding (for
   // example, to set the index fields in the symbol data).
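The Finish() change above is the hook the new ELF support relies on: a caller can now hand MCAssembler a specific MCObjectWriter instead of the backend default. A minimal sketch of how a client might combine it with the ELFObjectWriter constructor introduced in this patch; the assembler setup itself is assumed to exist elsewhere:

    // Sketch only: Asm is assumed to be a fully configured MCAssembler and
    // OS the stream that should receive the object file.
    void writeELFObject(llvm::MCAssembler &Asm, llvm::raw_ostream &OS) {
      // 64-bit, little-endian, RELA-style relocations (e.g. x86_64).
      llvm::ELFObjectWriter Writer(OS, /*Is64Bit=*/true,
                                   /*IsLittleEndian=*/true,
                                   /*HasRelocationAddend=*/true);
      // A non-null writer bypasses the getBackend().createObjectWriter(OS)
      // path; passing 0 preserves the old behavior.
      Asm.Finish(&Writer);
    }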
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index 11370642530a1..e5586a0d7c311 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -14,6 +14,7 @@
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCLabel.h"
+#include "llvm/MC/MCDwarf.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
 using namespace llvm;
@@ -23,7 +24,8 @@ typedef StringMap<const MCSectionELF*> ELFUniqueMapTy;
 typedef StringMap<const MCSectionCOFF*> COFFUniqueMapTy;
 
-MCContext::MCContext(const MCAsmInfo &mai) : MAI(mai), NextUniqueID(0) {
+MCContext::MCContext(const MCAsmInfo &mai) : MAI(mai), NextUniqueID(0),
+                                             CurrentDwarfLoc(0,0,0,0,0) {
   MachOUniquingMap = 0;
   ELFUniquingMap = 0;
   COFFUniquingMap = 0;
@@ -31,6 +33,8 @@ MCContext::MCContext(const MCAsmInfo &mai) : MAI(mai), NextUniqueID(0) {
   SecureLogFile = getenv("AS_SECURE_LOG_FILE");
   SecureLog = 0;
   SecureLogUsed = false;
+
+  DwarfLocSeen = false;
 }
 
 MCContext::~MCContext() {
@@ -147,7 +151,7 @@ getMachOSection(StringRef Segment, StringRef Section,
 
 const MCSection *MCContext::
 getELFSection(StringRef Section, unsigned Type, unsigned Flags,
-              SectionKind Kind, bool IsExplicit) {
+              SectionKind Kind, bool IsExplicit, unsigned EntrySize) {
   if (ELFUniquingMap == 0)
     ELFUniquingMap = new ELFUniqueMapTy();
   ELFUniqueMapTy &Map = *(ELFUniqueMapTy*)ELFUniquingMap;
@@ -157,7 +161,7 @@ getELFSection(StringRef Section, unsigned Type, unsigned Flags,
   if (Entry.getValue()) return Entry.getValue();
 
   MCSectionELF *Result = new (*this) MCSectionELF(Entry.getKey(), Type, Flags,
-                                                  Kind, IsExplicit);
+                                                  Kind, IsExplicit, EntrySize);
   Entry.setValue(Result);
   return Result;
 }
@@ -181,3 +185,81 @@ const MCSection *MCContext::getCOFFSection(StringRef Section,
   Entry.setValue(Result);
   return Result;
 }
+
+//===----------------------------------------------------------------------===//
+// Dwarf Management
+//===----------------------------------------------------------------------===//
+
+/// GetDwarfFile - takes a file name and number to place in the dwarf file and
+/// directory tables. If the file number has already been allocated, it is an
+/// error and zero is returned; the client reports the error. Otherwise the
+/// allocated file number is returned. The file numbers may be in any order.
+unsigned MCContext::GetDwarfFile(StringRef FileName, unsigned FileNumber) {
+  // TODO: a FileNumber of zero says to use the next available file number.
+  // Note: in GenericAsmParser::ParseDirectiveFile() FileNumber was checked
+  // to not be less than one. This needs to be changed to be not less than
+  // zero.
+
+  // Make space for this FileNumber in the MCDwarfFiles vector if needed.
+  if (FileNumber >= MCDwarfFiles.size()) {
+    MCDwarfFiles.resize(FileNumber + 1);
+  } else {
+    MCDwarfFile *&ExistingFile = MCDwarfFiles[FileNumber];
+    if (ExistingFile)
+      // It is an error to use the same number more than once.
+      return 0;
+  }
+
+  // Get the new MCDwarfFile slot for this FileNumber.
+  MCDwarfFile *&File = MCDwarfFiles[FileNumber];
+
+  // Separate the directory part from the basename of the FileName.
+  std::pair<StringRef, StringRef> Slash = FileName.rsplit('/');
+
+  // Find or make an entry in the MCDwarfDirs vector for this Directory.
+  StringRef Name;
+  unsigned DirIndex;
+  // Capture directory name.
+  if (Slash.second.empty()) {
+    Name = Slash.first;
+    DirIndex = 0; // For FileNames with no directories a DirIndex of 0 is used.
+ } else { + StringRef Directory = Slash.first; + Name = Slash.second; + for (DirIndex = 0; DirIndex < MCDwarfDirs.size(); DirIndex++) { + if (Directory == MCDwarfDirs[DirIndex]) + break; + } + if (DirIndex >= MCDwarfDirs.size()) { + char *Buf = static_cast<char *>(Allocate(Directory.size())); + memcpy(Buf, Directory.data(), Directory.size()); + MCDwarfDirs.push_back(StringRef(Buf, Directory.size())); + } + // The DirIndex is one based, as DirIndex of 0 is used for FileNames with + // no directories. MCDwarfDirs[] is unlike MCDwarfFiles[] in that the + // directory names are stored at MCDwarfDirs[DirIndex-1] where FileNames are + // stored at MCDwarfFiles[FileNumber].Name . + DirIndex++; + } + + // Now make the MCDwarfFile entry and place it in the slot in the MCDwarfFiles + // vector. + char *Buf = static_cast<char *>(Allocate(Name.size())); + memcpy(Buf, Name.data(), Name.size()); + File = new (*this) MCDwarfFile(StringRef(Buf, Name.size()), DirIndex); + + // return the allocated FileNumber. + return FileNumber; +} + +/// ValidateDwarfFileNumber - takes a dwarf file number and returns true if it +/// currently is assigned and false otherwise. +bool MCContext::ValidateDwarfFileNumber(unsigned FileNumber) { + if(FileNumber == 0 || FileNumber >= MCDwarfFiles.size()) + return false; + + MCDwarfFile *&ExistingFile = MCDwarfFiles[FileNumber]; + if (ExistingFile) + return true; + else + return false; +} diff --git a/lib/MC/MCDisassembler/CMakeLists.txt b/lib/MC/MCDisassembler/CMakeLists.txt new file mode 100644 index 0000000000000..5fa7b70194b24 --- /dev/null +++ b/lib/MC/MCDisassembler/CMakeLists.txt @@ -0,0 +1,7 @@ + +add_llvm_library(LLVMMCDisassembler + EDDisassembler.cpp + EDOperand.cpp + EDInst.cpp + EDToken.cpp + ) diff --git a/lib/MC/MCDisassembler/EDDisassembler.cpp b/lib/MC/MCDisassembler/EDDisassembler.cpp new file mode 100644 index 0000000000000..697b3d9c05153 --- /dev/null +++ b/lib/MC/MCDisassembler/EDDisassembler.cpp @@ -0,0 +1,402 @@ +//===-EDDisassembler.cpp - LLVM Enhanced Disassembler ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Enhanced Disassembly library's disassembler class. +// The disassembler is responsible for vending individual instructions according +// to a given architecture and disassembly syntax. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "EDDisassembler.h"
+#include "EDInst.h"
+#include "llvm/MC/EDInstInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCParser/AsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Target/TargetAsmLexer.h"
+#include "llvm/Target/TargetAsmParser.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSelect.h"
+using namespace llvm;
+
+bool EDDisassembler::sInitialized = false;
+EDDisassembler::DisassemblerMap_t EDDisassembler::sDisassemblers;
+
+struct TripleMap {
+  Triple::ArchType Arch;
+  const char *String;
+};
+
+static struct TripleMap triplemap[] = {
+  { Triple::x86, "i386-unknown-unknown" },
+  { Triple::x86_64, "x86_64-unknown-unknown" },
+  { Triple::arm, "arm-unknown-unknown" },
+  { Triple::thumb, "thumb-unknown-unknown" },
+  { Triple::InvalidArch, NULL, }
+};
+
+/// tripleFromArch - Returns the triple string corresponding to a given
+/// architecture, or NULL if there is an error
+///
+/// @arg arch - The Triple::ArchType for the desired architecture
+static const char *tripleFromArch(Triple::ArchType arch) {
+  unsigned int infoIndex;
+
+  for (infoIndex = 0; triplemap[infoIndex].String != NULL; ++infoIndex) {
+    if (arch == triplemap[infoIndex].Arch)
+      return triplemap[infoIndex].String;
+  }
+
+  return NULL;
+}
+
+/// getLLVMSyntaxVariant - gets the constant to use to get an assembly printer
+/// for the desired assembly syntax, suitable for passing to
+/// Target::createMCInstPrinter()
+///
+/// @arg arch   - The target architecture
+/// @arg syntax - The assembly syntax in sd form
+static int getLLVMSyntaxVariant(Triple::ArchType arch,
+                                EDDisassembler::AssemblySyntax syntax) {
+  switch (syntax) {
+  default:
+    return -1;
+  // Mappings below from X86AsmPrinter.cpp
+  case EDDisassembler::kEDAssemblySyntaxX86ATT:
+    if (arch == Triple::x86 || arch == Triple::x86_64)
+      return 0;
+    else
+      return -1;
+  case EDDisassembler::kEDAssemblySyntaxX86Intel:
+    if (arch == Triple::x86 || arch == Triple::x86_64)
+      return 1;
+    else
+      return -1;
+  case EDDisassembler::kEDAssemblySyntaxARMUAL:
+    if (arch == Triple::arm || arch == Triple::thumb)
+      return 0;
+    else
+      return -1;
+  }
+}
+
+void EDDisassembler::initialize() {
+  if (sInitialized)
+    return;
+
+  sInitialized = true;
+
+  InitializeAllTargetInfos();
+  InitializeAllTargets();
+  InitializeAllAsmPrinters();
+  InitializeAllAsmParsers();
+  InitializeAllDisassemblers();
+}
+
+#undef BRINGUP_TARGET
+
+EDDisassembler *EDDisassembler::getDisassembler(Triple::ArchType arch,
+                                                AssemblySyntax syntax) {
+  CPUKey key;
+  key.Arch = arch;
+  key.Syntax = syntax;
+
+  EDDisassembler::DisassemblerMap_t::iterator i = sDisassemblers.find(key);
+
+  if (i != sDisassemblers.end()) {
+    return i->second;
+  } else {
+    EDDisassembler* sdd = new EDDisassembler(key);
+    if (!sdd->valid()) {
+      delete sdd;
+      return NULL;
+    }
+
+    sDisassemblers[key] = sdd;
+
+    return sdd;
+  }
+
+  return NULL;
+}
+
+EDDisassembler *EDDisassembler::getDisassembler(StringRef str,
+                                                AssemblySyntax syntax) {
+  return
getDisassembler(Triple(str).getArch(), syntax); +} + +EDDisassembler::EDDisassembler(CPUKey &key) : + Valid(false), + HasSemantics(false), + ErrorStream(nulls()), + Key(key) { + const char *triple = tripleFromArch(key.Arch); + + if (!triple) + return; + + LLVMSyntaxVariant = getLLVMSyntaxVariant(key.Arch, key.Syntax); + + if (LLVMSyntaxVariant < 0) + return; + + std::string tripleString(triple); + std::string errorString; + + Tgt = TargetRegistry::lookupTarget(tripleString, + errorString); + + if (!Tgt) + return; + + std::string featureString; + + TargetMachine.reset(Tgt->createTargetMachine(tripleString, + featureString)); + + const TargetRegisterInfo *registerInfo = TargetMachine->getRegisterInfo(); + + if (!registerInfo) + return; + + initMaps(*registerInfo); + + AsmInfo.reset(Tgt->createAsmInfo(tripleString)); + + if (!AsmInfo) + return; + + Disassembler.reset(Tgt->createMCDisassembler()); + + if (!Disassembler) + return; + + InstInfos = Disassembler->getEDInfo(); + + InstString.reset(new std::string); + InstStream.reset(new raw_string_ostream(*InstString)); + InstPrinter.reset(Tgt->createMCInstPrinter(LLVMSyntaxVariant, *AsmInfo)); + + if (!InstPrinter) + return; + + GenericAsmLexer.reset(new AsmLexer(*AsmInfo)); + SpecificAsmLexer.reset(Tgt->createAsmLexer(*AsmInfo)); + SpecificAsmLexer->InstallLexer(*GenericAsmLexer); + + initMaps(*TargetMachine->getRegisterInfo()); + + Valid = true; +} + +EDDisassembler::~EDDisassembler() { + if (!valid()) + return; +} + +namespace { + /// EDMemoryObject - a subclass of MemoryObject that allows use of a callback + /// as provided by the sd interface. See MemoryObject. + class EDMemoryObject : public llvm::MemoryObject { + private: + EDByteReaderCallback Callback; + void *Arg; + public: + EDMemoryObject(EDByteReaderCallback callback, + void *arg) : Callback(callback), Arg(arg) { } + ~EDMemoryObject() { } + uint64_t getBase() const { return 0x0; } + uint64_t getExtent() const { return (uint64_t)-1; } + int readByte(uint64_t address, uint8_t *ptr) const { + if (!Callback) + return -1; + + if (Callback(ptr, address, Arg)) + return -1; + + return 0; + } + }; +} + +EDInst *EDDisassembler::createInst(EDByteReaderCallback byteReader, + uint64_t address, + void *arg) { + EDMemoryObject memoryObject(byteReader, arg); + + MCInst* inst = new MCInst; + uint64_t byteSize; + + if (!Disassembler->getInstruction(*inst, + byteSize, + memoryObject, + address, + ErrorStream)) { + delete inst; + return NULL; + } else { + const llvm::EDInstInfo *thisInstInfo; + + thisInstInfo = &InstInfos[inst->getOpcode()]; + + EDInst* sdInst = new EDInst(inst, byteSize, *this, thisInstInfo); + return sdInst; + } +} + +void EDDisassembler::initMaps(const TargetRegisterInfo ®isterInfo) { + unsigned numRegisters = registerInfo.getNumRegs(); + unsigned registerIndex; + + for (registerIndex = 0; registerIndex < numRegisters; ++registerIndex) { + const char* registerName = registerInfo.get(registerIndex).Name; + + RegVec.push_back(registerName); + RegRMap[registerName] = registerIndex; + } + + switch (Key.Arch) { + default: + break; + case Triple::x86: + case Triple::x86_64: + stackPointers.insert(registerIDWithName("SP")); + stackPointers.insert(registerIDWithName("ESP")); + stackPointers.insert(registerIDWithName("RSP")); + + programCounters.insert(registerIDWithName("IP")); + programCounters.insert(registerIDWithName("EIP")); + programCounters.insert(registerIDWithName("RIP")); + break; + case Triple::arm: + case Triple::thumb: + stackPointers.insert(registerIDWithName("SP")); + + 
programCounters.insert(registerIDWithName("PC")); + break; + } +} + +const char *EDDisassembler::nameWithRegisterID(unsigned registerID) const { + if (registerID >= RegVec.size()) + return NULL; + else + return RegVec[registerID].c_str(); +} + +unsigned EDDisassembler::registerIDWithName(const char *name) const { + regrmap_t::const_iterator iter = RegRMap.find(std::string(name)); + if (iter == RegRMap.end()) + return 0; + else + return (*iter).second; +} + +bool EDDisassembler::registerIsStackPointer(unsigned registerID) { + return (stackPointers.find(registerID) != stackPointers.end()); +} + +bool EDDisassembler::registerIsProgramCounter(unsigned registerID) { + return (programCounters.find(registerID) != programCounters.end()); +} + +int EDDisassembler::printInst(std::string &str, MCInst &inst) { + PrinterMutex.acquire(); + + InstPrinter->printInst(&inst, *InstStream); + InstStream->flush(); + str = *InstString; + InstString->clear(); + + PrinterMutex.release(); + + return 0; +} + +int EDDisassembler::parseInst(SmallVectorImpl<MCParsedAsmOperand*> &operands, + SmallVectorImpl<AsmToken> &tokens, + const std::string &str) { + int ret = 0; + + switch (Key.Arch) { + default: + return -1; + case Triple::x86: + case Triple::x86_64: + case Triple::arm: + case Triple::thumb: + break; + } + + const char *cStr = str.c_str(); + MemoryBuffer *buf = MemoryBuffer::getMemBuffer(cStr, cStr + strlen(cStr)); + + StringRef instName; + SMLoc instLoc; + + SourceMgr sourceMgr; + sourceMgr.AddNewSourceBuffer(buf, SMLoc()); // ownership of buf handed over + MCContext context(*AsmInfo); + OwningPtr<MCStreamer> streamer(createNullStreamer(context)); + OwningPtr<MCAsmParser> genericParser(createMCAsmParser(*Tgt, sourceMgr, + context, *streamer, + *AsmInfo)); + OwningPtr<TargetAsmParser> TargetParser(Tgt->createAsmParser(*genericParser, + *TargetMachine)); + + AsmToken OpcodeToken = genericParser->Lex(); + AsmToken NextToken = genericParser->Lex(); // consume next token, because specificParser expects us to + + if (OpcodeToken.is(AsmToken::Identifier)) { + instName = OpcodeToken.getString(); + instLoc = OpcodeToken.getLoc(); + + if (NextToken.isNot(AsmToken::Eof) && + TargetParser->ParseInstruction(instName, instLoc, operands)) + ret = -1; + } else { + ret = -1; + } + + ParserMutex.acquire(); + + if (!ret) { + GenericAsmLexer->setBuffer(buf); + + while (SpecificAsmLexer->Lex(), + SpecificAsmLexer->isNot(AsmToken::Eof) && + SpecificAsmLexer->isNot(AsmToken::EndOfStatement)) { + if (SpecificAsmLexer->is(AsmToken::Error)) { + ret = -1; + break; + } + tokens.push_back(SpecificAsmLexer->getTok()); + } + } + + ParserMutex.release(); + + return ret; +} + +int EDDisassembler::llvmSyntaxVariant() const { + return LLVMSyntaxVariant; +} diff --git a/lib/MC/MCDisassembler/EDDisassembler.h b/lib/MC/MCDisassembler/EDDisassembler.h new file mode 100644 index 0000000000000..e2f850bcdba97 --- /dev/null +++ b/lib/MC/MCDisassembler/EDDisassembler.h @@ -0,0 +1,271 @@ +//===-- EDDisassembler.h - LLVM Enhanced Disassembler -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interface for the Enhanced Disassembly library's +// disassembler class. The disassembler is responsible for vending individual +// instructions according to a given architecture and disassembly syntax. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EDDISASSEMBLER_H
+#define LLVM_EDDISASSEMBLER_H
+
+#include "EDInfo.h"
+
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/System/Mutex.h"
+
+#include <map>
+#include <set>
+#include <vector>
+
+namespace llvm {
+class AsmLexer;
+class AsmToken;
+class MCContext;
+class MCAsmInfo;
+class MCAsmLexer;
+class AsmParser;
+class TargetAsmLexer;
+class TargetAsmParser;
+class MCDisassembler;
+class MCInstPrinter;
+class MCInst;
+class MCParsedAsmOperand;
+class MCStreamer;
+template <typename T> class SmallVectorImpl;
+class SourceMgr;
+class Target;
+class TargetMachine;
+class TargetRegisterInfo;
+
+struct EDInstInfo;
+struct EDInst;
+struct EDOperand;
+struct EDToken;
+
+typedef int (*EDByteReaderCallback)(uint8_t *byte, uint64_t address, void *arg);
+
+/// EDDisassembler - Encapsulates a disassembler for a single architecture and
+/// disassembly syntax. Also manages the static disassembler registry.
+struct EDDisassembler {
+  typedef enum {
+    /*! @constant kEDAssemblySyntaxX86Intel Intel syntax for i386 and x86_64. */
+    kEDAssemblySyntaxX86Intel = 0,
+    /*! @constant kEDAssemblySyntaxX86ATT AT&T syntax for i386 and x86_64. */
+    kEDAssemblySyntaxX86ATT = 1,
+    kEDAssemblySyntaxARMUAL = 2
+  } AssemblySyntax;
+
+
+  ////////////////////
+  // Static members //
+  ////////////////////
+
+  /// CPUKey - Encapsulates the descriptor of an architecture/disassembly-syntax
+  /// pair
+  struct CPUKey {
+    /// The architecture type
+    llvm::Triple::ArchType Arch;
+
+    /// The assembly syntax
+    AssemblySyntax Syntax;
+
+    /// operator== - Equality operator
+    bool operator==(const CPUKey &key) const {
+      return (Arch == key.Arch &&
+              Syntax == key.Syntax);
+    }
+
+    /// operator< - Less-than operator; must be a strict weak ordering so that
+    /// distinct (Arch, Syntax) pairs never compare equivalent in the map
+    bool operator<(const CPUKey &key) const {
+      return Arch < key.Arch ||
+             (Arch == key.Arch && Syntax < key.Syntax);
+    }
+  };
+
+  typedef std::map<CPUKey, EDDisassembler*> DisassemblerMap_t;
+
+  /// True if the disassembler registry has been initialized; false if not
+  static bool sInitialized;
+  /// A map from disassembler specifications to disassemblers. Populated
+  /// lazily.
+  static DisassemblerMap_t sDisassemblers;
+
+  /// getDisassembler - Returns the specified disassembler, or NULL on failure
+  ///
+  /// @arg arch   - The desired architecture
+  /// @arg syntax - The desired disassembly syntax
+  static EDDisassembler *getDisassembler(llvm::Triple::ArchType arch,
+                                         AssemblySyntax syntax);
+
+  /// getDisassembler - Returns the disassembler for a given combination of
+  /// target triple and assembly syntax, or NULL on failure
+  ///
+  /// @arg str    - The string representation of the architecture triple, e.g.,
+  ///               "x86_64-apple-darwin"
+  /// @arg syntax - The disassembly syntax for the required disassembler
+  static EDDisassembler *getDisassembler(llvm::StringRef str,
+                                         AssemblySyntax syntax);
+
+  /// initialize - Initializes the disassembler registry and the LLVM backend
+  static void initialize();
+
+  ////////////////////////
+  // Per-object members //
+  ////////////////////////
+
+  /// True only if the object has been successfully initialized
+  bool Valid;
+  /// True if the disassembler can provide semantic information
+  bool HasSemantics;
+
+  /// The stream to write errors to
+  llvm::raw_ostream &ErrorStream;
+
+  /// The architecture/syntax pair for the current architecture
+  CPUKey Key;
+  /// The LLVM target corresponding to the disassembler
+  const llvm::Target *Tgt;
+  /// The target machine instance.
+  llvm::OwningPtr<llvm::TargetMachine> TargetMachine;
+  /// The assembly information for the target architecture
+  llvm::OwningPtr<const llvm::MCAsmInfo> AsmInfo;
+  /// The disassembler for the target architecture
+  llvm::OwningPtr<const llvm::MCDisassembler> Disassembler;
+  /// The output string for the instruction printer; must be guarded with
+  /// PrinterMutex
+  llvm::OwningPtr<std::string> InstString;
+  /// The output stream for the disassembler; must be guarded with
+  /// PrinterMutex
+  llvm::OwningPtr<llvm::raw_string_ostream> InstStream;
+  /// The instruction printer for the target architecture; must be guarded with
+  /// PrinterMutex when printing
+  llvm::OwningPtr<llvm::MCInstPrinter> InstPrinter;
+  /// The mutex that guards the instruction printer's printing functions, which
+  /// use a shared stream
+  llvm::sys::Mutex PrinterMutex;
+  /// The array of instruction information provided by the TableGen backend for
+  /// the target architecture
+  const llvm::EDInstInfo *InstInfos;
+  /// The target-specific lexer for use in tokenizing strings, in
+  /// target-independent and target-specific portions
+  llvm::OwningPtr<llvm::AsmLexer> GenericAsmLexer;
+  llvm::OwningPtr<llvm::TargetAsmLexer> SpecificAsmLexer;
+  /// The guard for the above
+  llvm::sys::Mutex ParserMutex;
+  /// The LLVM number used for the target disassembly syntax variant
+  int LLVMSyntaxVariant;
+
+  typedef std::vector<std::string> regvec_t;
+  typedef std::map<std::string, unsigned> regrmap_t;
+
+  /// A vector of registers for quick mapping from LLVM register IDs to names
+  regvec_t RegVec;
+  /// A map of registers for quick mapping from register names to LLVM IDs
+  regrmap_t RegRMap;
+
+  /// A set of register IDs for aliases of the stack pointer for the current
+  /// architecture
+  std::set<unsigned> stackPointers;
+  /// A set of register IDs for aliases of the program counter for the current
+  /// architecture
+  std::set<unsigned> programCounters;
+
+  /// Constructor - initializes a disassembler with all the necessary objects,
+  /// which come pre-allocated from the registry accessor function
+  ///
+  /// @arg key - the architecture and disassembly syntax for the
+  ///            disassembler
+  EDDisassembler(CPUKey& key);
+
+  /// valid - reports whether there was a failure in the constructor.
+  bool valid() {
+    return Valid;
+  }
+
+  /// hasSemantics - reports whether the disassembler can provide operands and
+  /// tokens.
+  bool hasSemantics() {
+    return HasSemantics;
+  }
+
+  ~EDDisassembler();
+
+  /// createInst - creates and returns an instruction given a callback and
+  /// memory address, or NULL on failure
+  ///
+  /// @arg byteReader - A callback function that provides machine code bytes
+  /// @arg address    - The address of the first byte of the instruction,
+  ///                   suitable for passing to byteReader
+  /// @arg arg        - An opaque argument for byteReader
+  EDInst *createInst(EDByteReaderCallback byteReader,
+                     uint64_t address,
+                     void *arg);
+
+  /// initMaps - initializes regVec and regRMap using the provided register
+  /// info
+  ///
+  /// @arg registerInfo - the register information to use as a source
+  void initMaps(const llvm::TargetRegisterInfo &registerInfo);
+  /// nameWithRegisterID - Returns the name (owned by the EDDisassembler) of a
+  /// register for a given register ID, or NULL on failure
+  ///
+  /// @arg registerID - the ID of the register to be queried
+  const char *nameWithRegisterID(unsigned registerID) const;
+  /// registerIDWithName - Returns the ID of a register for a given register
+  /// name, or 0 on failure
+  ///
+  /// @arg name - The name of the register
+  unsigned registerIDWithName(const char *name) const;
+
+  /// registerIsStackPointer - reports whether a register ID is an alias for the
+  /// stack pointer register
+  ///
+  /// @arg registerID - The LLVM register ID
+  bool registerIsStackPointer(unsigned registerID);
+  /// registerIsProgramCounter - reports whether a register ID is an alias for
+  /// the program counter register
+  ///
+  /// @arg registerID - The LLVM register ID
+  bool registerIsProgramCounter(unsigned registerID);
+
+  /// printInst - prints an MCInst to a string, returning 0 on success, or -1
+  /// otherwise
+  ///
+  /// @arg str  - A reference to a string which is filled in with the string
+  ///             representation of the instruction
+  /// @arg inst - A reference to the MCInst to be printed
+  int printInst(std::string& str,
+                llvm::MCInst& inst);
+
+  /// parseInst - extracts operands and tokens from a string for use in
+  /// tokenizing the string. Returns 0 on success, or -1 otherwise.
+  ///
+  /// @arg operands - A reference to a vector that will be filled in with the
+  ///                 parsed operands
+  /// @arg tokens   - A reference to a vector that will be filled in with the
+  ///                 tokens
+  /// @arg str      - The string representation of the instruction
+  int parseInst(llvm::SmallVectorImpl<llvm::MCParsedAsmOperand*> &operands,
+                llvm::SmallVectorImpl<llvm::AsmToken> &tokens,
+                const std::string &str);
+
+  /// llvmSyntaxVariant - returns the LLVM syntax variant for this disassembler
+  int llvmSyntaxVariant() const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/MC/MCDisassembler/EDInfo.h b/lib/MC/MCDisassembler/EDInfo.h
new file mode 100644
index 0000000000000..627c06641dbc1
--- /dev/null
+++ b/lib/MC/MCDisassembler/EDInfo.h
@@ -0,0 +1,73 @@
+//===-- EDInfo.h - LLVM Enhanced Disassembler -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EDINFO_H +#define LLVM_EDINFO_H + +enum { + EDIS_MAX_OPERANDS = 13, + EDIS_MAX_SYNTAXES = 2 +}; + +enum OperandTypes { + kOperandTypeNone, + kOperandTypeImmediate, + kOperandTypeRegister, + kOperandTypeX86Memory, + kOperandTypeX86EffectiveAddress, + kOperandTypeX86PCRelative, + kOperandTypeARMBranchTarget, + kOperandTypeARMSoReg, + kOperandTypeARMSoImm, + kOperandTypeARMSoImm2Part, + kOperandTypeARMPredicate, + kOperandTypeARMAddrMode2, + kOperandTypeARMAddrMode2Offset, + kOperandTypeARMAddrMode3, + kOperandTypeARMAddrMode3Offset, + kOperandTypeARMAddrMode4, + kOperandTypeARMAddrMode5, + kOperandTypeARMAddrMode6, + kOperandTypeARMAddrMode6Offset, + kOperandTypeARMAddrModePC, + kOperandTypeARMRegisterList, + kOperandTypeARMTBAddrMode, + kOperandTypeThumbITMask, + kOperandTypeThumbAddrModeS1, + kOperandTypeThumbAddrModeS2, + kOperandTypeThumbAddrModeS4, + kOperandTypeThumbAddrModeRR, + kOperandTypeThumbAddrModeSP, + kOperandTypeThumb2SoReg, + kOperandTypeThumb2SoImm, + kOperandTypeThumb2AddrModeImm8, + kOperandTypeThumb2AddrModeImm8Offset, + kOperandTypeThumb2AddrModeImm12, + kOperandTypeThumb2AddrModeSoReg, + kOperandTypeThumb2AddrModeImm8s4, + kOperandTypeThumb2AddrModeImm8s4Offset +}; + +enum OperandFlags { + kOperandFlagSource = 0x1, + kOperandFlagTarget = 0x2 +}; + +enum InstructionTypes { + kInstructionTypeNone, + kInstructionTypeMove, + kInstructionTypeBranch, + kInstructionTypePush, + kInstructionTypePop, + kInstructionTypeCall, + kInstructionTypeReturn +}; + + +#endif diff --git a/lib/MC/MCDisassembler/EDInst.cpp b/lib/MC/MCDisassembler/EDInst.cpp new file mode 100644 index 0000000000000..e22408f060b1f --- /dev/null +++ b/lib/MC/MCDisassembler/EDInst.cpp @@ -0,0 +1,207 @@ +//===-EDInst.cpp - LLVM Enhanced Disassembler -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Enhanced Disassembly library's instruction class. +// The instruction is responsible for vending the string representation, +// individual tokens, and operands for a single instruction. 
+// +//===----------------------------------------------------------------------===// + +#include "EDInst.h" +#include "EDDisassembler.h" +#include "EDOperand.h" +#include "EDToken.h" + +#include "llvm/MC/EDInstInfo.h" +#include "llvm/MC/MCInst.h" + +using namespace llvm; + +EDInst::EDInst(llvm::MCInst *inst, + uint64_t byteSize, + EDDisassembler &disassembler, + const llvm::EDInstInfo *info) : + Disassembler(disassembler), + Inst(inst), + ThisInstInfo(info), + ByteSize(byteSize), + BranchTarget(-1), + MoveSource(-1), + MoveTarget(-1) { + OperandOrder = ThisInstInfo->operandOrders[Disassembler.llvmSyntaxVariant()]; +} + +EDInst::~EDInst() { + unsigned int index; + unsigned int numOperands = Operands.size(); + + for (index = 0; index < numOperands; ++index) + delete Operands[index]; + + unsigned int numTokens = Tokens.size(); + + for (index = 0; index < numTokens; ++index) + delete Tokens[index]; + + delete Inst; +} + +uint64_t EDInst::byteSize() { + return ByteSize; +} + +int EDInst::stringify() { + if (StringifyResult.valid()) + return StringifyResult.result(); + + if (Disassembler.printInst(String, *Inst)) + return StringifyResult.setResult(-1); + + return StringifyResult.setResult(0); +} + +int EDInst::getString(const char*& str) { + if (stringify()) + return -1; + + str = String.c_str(); + + return 0; +} + +unsigned EDInst::instID() { + return Inst->getOpcode(); +} + +bool EDInst::isBranch() { + if (ThisInstInfo) + return + ThisInstInfo->instructionType == kInstructionTypeBranch || + ThisInstInfo->instructionType == kInstructionTypeCall; + else + return false; +} + +bool EDInst::isMove() { + if (ThisInstInfo) + return ThisInstInfo->instructionType == kInstructionTypeMove; + else + return false; +} + +int EDInst::parseOperands() { + if (ParseResult.valid()) + return ParseResult.result(); + + if (!ThisInstInfo) + return ParseResult.setResult(-1); + + unsigned int opIndex; + unsigned int mcOpIndex = 0; + + for (opIndex = 0; opIndex < ThisInstInfo->numOperands; ++opIndex) { + if (isBranch() && + (ThisInstInfo->operandFlags[opIndex] & kOperandFlagTarget)) { + BranchTarget = opIndex; + } + else if (isMove()) { + if (ThisInstInfo->operandFlags[opIndex] & kOperandFlagSource) + MoveSource = opIndex; + else if (ThisInstInfo->operandFlags[opIndex] & kOperandFlagTarget) + MoveTarget = opIndex; + } + + EDOperand *operand = new EDOperand(Disassembler, *this, opIndex, mcOpIndex); + + Operands.push_back(operand); + } + + return ParseResult.setResult(0); +} + +int EDInst::branchTargetID() { + if (parseOperands()) + return -1; + return BranchTarget; +} + +int EDInst::moveSourceID() { + if (parseOperands()) + return -1; + return MoveSource; +} + +int EDInst::moveTargetID() { + if (parseOperands()) + return -1; + return MoveTarget; +} + +int EDInst::numOperands() { + if (parseOperands()) + return -1; + return Operands.size(); +} + +int EDInst::getOperand(EDOperand *&operand, unsigned int index) { + if (parseOperands()) + return -1; + + if (index >= Operands.size()) + return -1; + + operand = Operands[index]; + return 0; +} + +int EDInst::tokenize() { + if (TokenizeResult.valid()) + return TokenizeResult.result(); + + if (stringify()) + return TokenizeResult.setResult(-1); + + return TokenizeResult.setResult(EDToken::tokenize(Tokens, + String, + OperandOrder, + Disassembler)); + +} + +int EDInst::numTokens() { + if (tokenize()) + return -1; + return Tokens.size(); +} + +int EDInst::getToken(EDToken *&token, unsigned int index) { + if (tokenize()) + return -1; + token = Tokens[index]; + return 0; +} + 
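Taken together with createInst() in EDDisassembler.cpp, the accessors above define the intended client flow: create an instruction from a byte-reader callback, then query its string, operands, and tokens, each computed lazily and cached. A minimal sketch against those interfaces; the buffer handling and target choice here are illustrative assumptions, not part of the library:

    // Byte reader over a caller-owned buffer; 'arg' carries the buffer.
    // Illustration only: no bounds checking is performed.
    static int bufferByteReader(uint8_t *byte, uint64_t address, void *arg) {
      *byte = static_cast<uint8_t *>(arg)[address];
      return 0;
    }

    void printOneInstruction(uint8_t *code) {
      using namespace llvm;
      EDDisassembler::initialize();
      EDDisassembler *disasm =
        EDDisassembler::getDisassembler(Triple::x86_64,
                                        EDDisassembler::kEDAssemblySyntaxX86ATT);
      if (!disasm)
        return;
      if (EDInst *inst = disasm->createInst(bufferByteReader, 0, code)) {
        const char *str = 0;
        if (!inst->getString(str))
          outs() << str << "\n";  // str remains owned by the EDInst
        delete inst;              // also deletes the wrapped MCInst
      }
    }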
+#ifdef __BLOCKS__ +int EDInst::visitTokens(EDTokenVisitor_t visitor) { + if (tokenize()) + return -1; + + tokvec_t::iterator iter; + + for (iter = Tokens.begin(); iter != Tokens.end(); ++iter) { + int ret = visitor(*iter); + if (ret == 1) + return 0; + if (ret != 0) + return -1; + } + + return 0; +} +#endif diff --git a/lib/MC/MCDisassembler/EDInst.h b/lib/MC/MCDisassembler/EDInst.h new file mode 100644 index 0000000000000..39d264fb7aadc --- /dev/null +++ b/lib/MC/MCDisassembler/EDInst.h @@ -0,0 +1,182 @@ +//===-- EDInst.h - LLVM Enhanced Disassembler -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interface for the Enhanced Disassembly library's +// instruction class. The instruction is responsible for vending the string +// representation, individual tokens and operands for a single instruction. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EDINST_H +#define LLVM_EDINST_H + +#include "llvm/System/DataTypes.h" +#include "llvm/ADT/SmallVector.h" +#include <string> +#include <vector> + +namespace llvm { + class MCInst; + struct EDInstInfo; + struct EDToken; + struct EDDisassembler; + struct EDOperand; + +#ifdef __BLOCKS__ + typedef int (^EDTokenVisitor_t)(EDToken *token); +#endif + +/// CachedResult - Encapsulates the result of a function along with the validity +/// of that result, so that slow functions don't need to run twice +struct CachedResult { + /// True if the result has been obtained by executing the function + bool Valid; + /// The result last obtained from the function + int Result; + + /// Constructor - Initializes an invalid result + CachedResult() : Valid(false) { } + /// valid - Returns true if the result has been obtained by executing the + /// function and false otherwise + bool valid() { return Valid; } + /// result - Returns the result of the function or an undefined value if + /// valid() is false + int result() { return Result; } + /// setResult - Sets the result of the function and declares it valid + /// returning the result (so that setResult() can be called from inside a + /// return statement) + /// @arg result - The result of the function + int setResult(int result) { Result = result; Valid = true; return result; } +}; + +/// EDInst - Encapsulates a single instruction, which can be queried for its +/// string representation, as well as its operands and tokens +struct EDInst { + /// The parent disassembler + EDDisassembler &Disassembler; + /// The containing MCInst + llvm::MCInst *Inst; + /// The instruction information provided by TableGen for this instruction + const llvm::EDInstInfo *ThisInstInfo; + /// The number of bytes for the machine code representation of the instruction + uint64_t ByteSize; + + /// The result of the stringify() function + CachedResult StringifyResult; + /// The string representation of the instruction + std::string String; + /// The order in which operands from the InstInfo's operand information appear + /// in String + const char* OperandOrder; + + /// The result of the parseOperands() function + CachedResult ParseResult; + typedef llvm::SmallVector<EDOperand*, 5> opvec_t; + /// The instruction's operands + opvec_t Operands; + /// The operand corresponding to the target, if the instruction is a branch + int BranchTarget; 
+  /// The operand corresponding to the source, if the instruction is a move
+  int MoveSource;
+  /// The operand corresponding to the target, if the instruction is a move
+  int MoveTarget;
+
+  /// The result of the tokenize() function
+  CachedResult TokenizeResult;
+  typedef std::vector<EDToken*> tokvec_t;
+  /// The instruction's tokens
+  tokvec_t Tokens;
+
+  /// Constructor - initializes an instruction given the output of the LLVM
+  /// C++ disassembler
+  ///
+  /// @arg inst         - The MCInst, which will now be owned by this object
+  /// @arg byteSize     - The size of the consumed instruction, in bytes
+  /// @arg disassembler - The parent disassembler
+  /// @arg instInfo     - The instruction information produced by the table
+  ///                     generator for this instruction
+  EDInst(llvm::MCInst *inst,
+         uint64_t byteSize,
+         EDDisassembler &disassembler,
+         const llvm::EDInstInfo *instInfo);
+  ~EDInst();
+
+  /// byteSize - returns the number of bytes consumed by the machine code
+  /// representation of the instruction
+  uint64_t byteSize();
+  /// instID - returns the LLVM instruction ID of the instruction
+  unsigned instID();
+
+  /// stringify - populates the String member of the instruction, returning 0
+  /// on success or -1 otherwise
+  int stringify();
+  /// getString - retrieves a pointer to the string representation of the
+  /// instruction, returning 0 on success or -1 otherwise
+  ///
+  /// @arg str - A reference to a pointer that, on success, is set to point to
+  /// the string representation of the instruction; this string is still owned
+  /// by the instruction and will be deleted when it is destroyed
+  int getString(const char *&str);
+
+  /// isBranch - Returns true if the instruction is a branch
+  bool isBranch();
+  /// isMove - Returns true if the instruction is a move
+  bool isMove();
+
+  /// parseOperands - populates the Operands member of the instruction,
+  /// returning 0 on success or -1 otherwise
+  int parseOperands();
+  /// branchTargetID - returns the ID (suitable for use with getOperand()) of
+  /// the target operand if the instruction is a branch, or -1 otherwise
+  int branchTargetID();
+  /// moveSourceID - returns the ID of the source operand if the instruction
+  /// is a move, or -1 otherwise
+  int moveSourceID();
+  /// moveTargetID - returns the ID of the target operand if the instruction
+  /// is a move, or -1 otherwise
+  int moveTargetID();
+
+  /// numOperands - returns the number of operands available to retrieve, or -1
+  /// on error
+  int numOperands();
+  /// getOperand - retrieves an operand from the instruction's operand list by
+  /// index, returning 0 on success or -1 on error
+  ///
+  /// @arg operand - A reference whose target is pointed at the operand on
+  ///                success, although the operand is still owned by the EDInst
+  /// @arg index   - The index of the operand in the instruction
+  int getOperand(EDOperand *&operand, unsigned int index);
+
+  /// tokenize - populates the Tokens member of the instruction, returning 0 on
+  /// success or -1 otherwise
+  int tokenize();
+  /// numTokens - returns the number of tokens in the instruction, or -1 on
+  /// error
+  int numTokens();
+  /// getToken - retrieves a token from the instruction's token list by index,
+  /// returning 0 on success or -1 on error
+  ///
+  /// @arg token - A reference whose target is pointed at the token on success,
+  ///              although the token is still owned by the EDInst
+  /// @arg index - The index of the token in the instruction
+  int getToken(EDToken *&token, unsigned int index);
+
+#ifdef __BLOCKS__
+  ///
visitTokens - Visits each token in turn and applies a block to it, + /// returning 0 if all blocks are visited and/or the block signals + /// termination by returning 1; returns -1 on error + /// + /// @arg visitor - The visitor block to apply to all tokens. + int visitTokens(EDTokenVisitor_t visitor); +#endif +}; + +} // end namespace llvm + +#endif diff --git a/lib/MC/MCDisassembler/EDOperand.cpp b/lib/MC/MCDisassembler/EDOperand.cpp new file mode 100644 index 0000000000000..2aed123368dad --- /dev/null +++ b/lib/MC/MCDisassembler/EDOperand.cpp @@ -0,0 +1,282 @@ +//===-- EDOperand.cpp - LLVM Enhanced Disassembler ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Enhanced Disassembly library's operand class. The +// operand is responsible for allowing evaluation given a particular register +// context. +// +//===----------------------------------------------------------------------===// + +#include "EDOperand.h" +#include "EDDisassembler.h" +#include "EDInst.h" +#include "llvm/MC/EDInstInfo.h" +#include "llvm/MC/MCInst.h" +using namespace llvm; + +EDOperand::EDOperand(const EDDisassembler &disassembler, + const EDInst &inst, + unsigned int opIndex, + unsigned int &mcOpIndex) : + Disassembler(disassembler), + Inst(inst), + OpIndex(opIndex), + MCOpIndex(mcOpIndex) { + unsigned int numMCOperands = 0; + + if (Disassembler.Key.Arch == Triple::x86 || + Disassembler.Key.Arch == Triple::x86_64) { + uint8_t operandType = inst.ThisInstInfo->operandTypes[opIndex]; + + switch (operandType) { + default: + break; + case kOperandTypeImmediate: + numMCOperands = 1; + break; + case kOperandTypeRegister: + numMCOperands = 1; + break; + case kOperandTypeX86Memory: + numMCOperands = 5; + break; + case kOperandTypeX86EffectiveAddress: + numMCOperands = 4; + break; + case kOperandTypeX86PCRelative: + numMCOperands = 1; + break; + } + } + else if (Disassembler.Key.Arch == Triple::arm || + Disassembler.Key.Arch == Triple::thumb) { + uint8_t operandType = inst.ThisInstInfo->operandTypes[opIndex]; + + switch (operandType) { + default: + case kOperandTypeARMRegisterList: + break; + case kOperandTypeImmediate: + case kOperandTypeRegister: + case kOperandTypeARMBranchTarget: + case kOperandTypeARMSoImm: + case kOperandTypeThumb2SoImm: + case kOperandTypeARMSoImm2Part: + case kOperandTypeARMPredicate: + case kOperandTypeThumbITMask: + case kOperandTypeThumb2AddrModeImm8Offset: + case kOperandTypeARMTBAddrMode: + case kOperandTypeThumb2AddrModeImm8s4Offset: + numMCOperands = 1; + break; + case kOperandTypeThumb2SoReg: + case kOperandTypeARMAddrMode2Offset: + case kOperandTypeARMAddrMode3Offset: + case kOperandTypeARMAddrMode4: + case kOperandTypeARMAddrMode5: + case kOperandTypeARMAddrModePC: + case kOperandTypeThumb2AddrModeImm8: + case kOperandTypeThumb2AddrModeImm12: + case kOperandTypeThumb2AddrModeImm8s4: + case kOperandTypeThumbAddrModeRR: + case kOperandTypeThumbAddrModeSP: + numMCOperands = 2; + break; + case kOperandTypeARMSoReg: + case kOperandTypeARMAddrMode2: + case kOperandTypeARMAddrMode3: + case kOperandTypeThumb2AddrModeSoReg: + case kOperandTypeThumbAddrModeS1: + case kOperandTypeThumbAddrModeS2: + case kOperandTypeThumbAddrModeS4: + case kOperandTypeARMAddrMode6Offset: + numMCOperands = 3; + break; + case kOperandTypeARMAddrMode6: + numMCOperands 
= 4;
+      break;
+    }
+  }
+
+  mcOpIndex += numMCOperands;
+}
+
+EDOperand::~EDOperand() {
+}
+
+int EDOperand::evaluate(uint64_t &result,
+                        EDRegisterReaderCallback callback,
+                        void *arg) {
+  uint8_t operandType = Inst.ThisInstInfo->operandTypes[OpIndex];
+
+  switch (Disassembler.Key.Arch) {
+  default:
+    return -1;
+  case Triple::x86:
+  case Triple::x86_64:
+    switch (operandType) {
+    default:
+      return -1;
+    case kOperandTypeImmediate:
+      result = Inst.Inst->getOperand(MCOpIndex).getImm();
+      return 0;
+    case kOperandTypeRegister:
+    {
+      unsigned reg = Inst.Inst->getOperand(MCOpIndex).getReg();
+      return callback(&result, reg, arg);
+    }
+    case kOperandTypeX86PCRelative:
+    {
+      int64_t displacement = Inst.Inst->getOperand(MCOpIndex).getImm();
+
+      uint64_t ripVal;
+
+      // TODO fix how we do this
+
+      if (callback(&ripVal, Disassembler.registerIDWithName("RIP"), arg))
+        return -1;
+
+      result = ripVal + displacement;
+      return 0;
+    }
+    case kOperandTypeX86Memory:
+    case kOperandTypeX86EffectiveAddress:
+    {
+      unsigned baseReg = Inst.Inst->getOperand(MCOpIndex).getReg();
+      uint64_t scaleAmount = Inst.Inst->getOperand(MCOpIndex+1).getImm();
+      unsigned indexReg = Inst.Inst->getOperand(MCOpIndex+2).getReg();
+      int64_t displacement = Inst.Inst->getOperand(MCOpIndex+3).getImm();
+      //unsigned segmentReg = Inst.Inst->getOperand(MCOpIndex+4).getReg();
+
+      uint64_t addr = 0;
+
+      if (baseReg) {
+        uint64_t baseVal;
+        if (callback(&baseVal, baseReg, arg))
+          return -1;
+        addr += baseVal;
+      }
+
+      if (indexReg) {
+        uint64_t indexVal;
+        if (callback(&indexVal, indexReg, arg))
+          return -1;
+        addr += (scaleAmount * indexVal);
+      }
+
+      addr += displacement;
+
+      result = addr;
+      return 0;
+    }
+    }
+    break;
+  case Triple::arm:
+  case Triple::thumb:
+    switch (operandType) {
+    default:
+      return -1;
+    case kOperandTypeImmediate:
+      result = Inst.Inst->getOperand(MCOpIndex).getImm();
+      return 0;
+    case kOperandTypeRegister:
+    {
+      unsigned reg = Inst.Inst->getOperand(MCOpIndex).getReg();
+      return callback(&result, reg, arg);
+    }
+    case kOperandTypeARMBranchTarget:
+    {
+      int64_t displacement = Inst.Inst->getOperand(MCOpIndex).getImm();
+
+      uint64_t pcVal;
+
+      if (callback(&pcVal, Disassembler.registerIDWithName("PC"), arg))
+        return -1;
+
+      result = pcVal + displacement;
+      return 0;
+    }
+    }
+  }
+
+  return -1;
+}
+
+int EDOperand::isRegister() {
+  return(Inst.ThisInstInfo->operandTypes[OpIndex] == kOperandTypeRegister);
+}
+
+unsigned EDOperand::regVal() {
+  return Inst.Inst->getOperand(MCOpIndex).getReg();
+}
+
+int EDOperand::isImmediate() {
+  return(Inst.ThisInstInfo->operandTypes[OpIndex] == kOperandTypeImmediate);
+}
+
+uint64_t EDOperand::immediateVal() {
+  return Inst.Inst->getOperand(MCOpIndex).getImm();
+}
+
+int EDOperand::isMemory() {
+  uint8_t operandType = Inst.ThisInstInfo->operandTypes[OpIndex];
+
+  switch (operandType) {
+  default:
+    return 0;
+  case kOperandTypeX86Memory:
+  case kOperandTypeX86PCRelative:
+  case kOperandTypeX86EffectiveAddress:
+  case kOperandTypeARMSoReg:
+  case kOperandTypeARMSoImm:
+  case kOperandTypeARMAddrMode2:
+  case kOperandTypeARMAddrMode2Offset:
+  case kOperandTypeARMAddrMode3:
+  case kOperandTypeARMAddrMode3Offset:
+  case kOperandTypeARMAddrMode4:
+  case kOperandTypeARMAddrMode5:
+  case kOperandTypeARMAddrMode6:
+  case kOperandTypeARMAddrModePC:
+  case kOperandTypeARMBranchTarget:
+  case kOperandTypeThumbAddrModeS1:
+  case kOperandTypeThumbAddrModeS2:
+  case kOperandTypeThumbAddrModeS4:
+  case kOperandTypeThumbAddrModeRR:
+  case kOperandTypeThumbAddrModeSP:
+  case kOperandTypeThumb2SoImm:
case kOperandTypeThumb2AddrModeImm8: + case kOperandTypeThumb2AddrModeImm8Offset: + case kOperandTypeThumb2AddrModeImm12: + case kOperandTypeThumb2AddrModeSoReg: + case kOperandTypeThumb2AddrModeImm8s4: + return 1; + } +} + +#ifdef __BLOCKS__ +struct RegisterReaderWrapper { + EDOperand::EDRegisterBlock_t regBlock; +}; + +int readerWrapperCallback(uint64_t *value, + unsigned regID, + void *arg) { + struct RegisterReaderWrapper *wrapper = (struct RegisterReaderWrapper *)arg; + return wrapper->regBlock(value, regID); +} + +int EDOperand::evaluate(uint64_t &result, + EDRegisterBlock_t regBlock) { + struct RegisterReaderWrapper wrapper; + wrapper.regBlock = regBlock; + return evaluate(result, + readerWrapperCallback, + (void*)&wrapper); +} +#endif diff --git a/lib/MC/MCDisassembler/EDOperand.h b/lib/MC/MCDisassembler/EDOperand.h new file mode 100644 index 0000000000000..6e695224318c5 --- /dev/null +++ b/lib/MC/MCDisassembler/EDOperand.h @@ -0,0 +1,91 @@ +//===-EDOperand.h - LLVM Enhanced Disassembler ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interface for the Enhanced Disassembly library's +// operand class. The operand is responsible for allowing evaluation given a +// particular register context. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EDOPERAND_H +#define LLVM_EDOPERAND_H + +#include "llvm/System/DataTypes.h" + +namespace llvm { + +struct EDDisassembler; +struct EDInst; + +typedef int (*EDRegisterReaderCallback)(uint64_t *value, unsigned regID, + void* arg); + + +/// EDOperand - Encapsulates a single operand, which can be evaluated by the +/// client +struct EDOperand { + /// The parent disassembler + const EDDisassembler &Disassembler; + /// The parent instruction + const EDInst &Inst; + + /// The index of the operand in the EDInst + unsigned int OpIndex; + /// The index of the first component of the operand in the MCInst + unsigned int MCOpIndex; + + /// Constructor - Initializes an EDOperand + /// + /// @arg disassembler - The disassembler responsible for the operand + /// @arg inst - The instruction containing this operand + /// @arg opIndex - The index of the operand in inst + /// @arg mcOpIndex - The index of the operand in the original MCInst + EDOperand(const EDDisassembler &disassembler, + const EDInst &inst, + unsigned int opIndex, + unsigned int &mcOpIndex); + ~EDOperand(); + + /// evaluate - Returns the numeric value of an operand to the extent possible, + /// returning 0 on success or -1 if there was some problem (such as a + /// register not being readable) + /// + /// @arg result - A reference whose target is filled in with the value of + /// the operand (the address if it is a memory operand) + /// @arg callback - A function to call to obtain register values + /// @arg arg - An opaque argument to pass to callback + int evaluate(uint64_t &result, + EDRegisterReaderCallback callback, + void *arg); + + /// isRegister - Returns 1 if the operand is a register or 0 otherwise + int isRegister(); + /// regVal - Returns the register value. + unsigned regVal(); + + /// isImmediate - Returns 1 if the operand is an immediate or 0 otherwise + int isImmediate(); + /// immediateVal - Returns the immediate value. 
+ uint64_t immediateVal(); + + /// isMemory - Returns 1 if the operand is a memory location or 0 otherwise + int isMemory(); + +#ifdef __BLOCKS__ + typedef int (^EDRegisterBlock_t)(uint64_t *value, unsigned regID); + + /// evaluate - Like evaluate for a callback, but uses a block instead + int evaluate(uint64_t &result, + EDRegisterBlock_t regBlock); +#endif +}; + +} // end namespace llvm + +#endif diff --git a/lib/MC/MCDisassembler/EDToken.cpp b/lib/MC/MCDisassembler/EDToken.cpp new file mode 100644 index 0000000000000..400e1649e9709 --- /dev/null +++ b/lib/MC/MCDisassembler/EDToken.cpp @@ -0,0 +1,206 @@ +//===-- EDToken.cpp - LLVM Enhanced Disassembler --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Enhanced Disassembler library's token class. The +// token is responsible for vending information about the token, such as its +// type and logical value. +// +//===----------------------------------------------------------------------===// + +#include "EDToken.h" +#include "EDDisassembler.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/ADT/SmallVector.h" +using namespace llvm; + +EDToken::EDToken(StringRef str, + enum tokenType type, + uint64_t localType, + EDDisassembler &disassembler) : + Disassembler(disassembler), + Str(str), + Type(type), + LocalType(localType), + OperandID(-1) { +} + +EDToken::~EDToken() { +} + +void EDToken::makeLiteral(bool sign, uint64_t absoluteValue) { + Type = kTokenLiteral; + LiteralSign = sign; + LiteralAbsoluteValue = absoluteValue; +} + +void EDToken::makeRegister(unsigned registerID) { + Type = kTokenRegister; + RegisterID = registerID; +} + +void EDToken::setOperandID(int operandID) { + OperandID = operandID; +} + +enum EDToken::tokenType EDToken::type() const { + return Type; +} + +uint64_t EDToken::localType() const { + return LocalType; +} + +StringRef EDToken::string() const { + return Str; +} + +int EDToken::operandID() const { + return OperandID; +} + +int EDToken::literalSign() const { + if (Type != kTokenLiteral) + return -1; + return (LiteralSign ? 
1 : 0); +} + +int EDToken::literalAbsoluteValue(uint64_t &value) const { + if (Type != kTokenLiteral) + return -1; + value = LiteralAbsoluteValue; + return 0; +} + +int EDToken::registerID(unsigned ®isterID) const { + if (Type != kTokenRegister) + return -1; + registerID = RegisterID; + return 0; +} + +int EDToken::tokenize(std::vector<EDToken*> &tokens, + std::string &str, + const char *operandOrder, + EDDisassembler &disassembler) { + SmallVector<MCParsedAsmOperand*, 5> parsedOperands; + SmallVector<AsmToken, 10> asmTokens; + + if (disassembler.parseInst(parsedOperands, asmTokens, str)) + return -1; + + SmallVectorImpl<MCParsedAsmOperand*>::iterator operandIterator; + unsigned int operandIndex; + SmallVectorImpl<AsmToken>::iterator tokenIterator; + + operandIterator = parsedOperands.begin(); + operandIndex = 0; + + bool readOpcode = false; + + const char *wsPointer = asmTokens.begin()->getLoc().getPointer(); + + for (tokenIterator = asmTokens.begin(); + tokenIterator != asmTokens.end(); + ++tokenIterator) { + SMLoc tokenLoc = tokenIterator->getLoc(); + + const char *tokenPointer = tokenLoc.getPointer(); + + if (tokenPointer > wsPointer) { + unsigned long wsLength = tokenPointer - wsPointer; + + EDToken *whitespaceToken = new EDToken(StringRef(wsPointer, wsLength), + EDToken::kTokenWhitespace, + 0, + disassembler); + + tokens.push_back(whitespaceToken); + } + + wsPointer = tokenPointer + tokenIterator->getString().size(); + + while (operandIterator != parsedOperands.end() && + tokenLoc.getPointer() > + (*operandIterator)->getEndLoc().getPointer()) { + ++operandIterator; + ++operandIndex; + } + + EDToken *token; + + switch (tokenIterator->getKind()) { + case AsmToken::Identifier: + if (!readOpcode) { + token = new EDToken(tokenIterator->getString(), + EDToken::kTokenOpcode, + (uint64_t)tokenIterator->getKind(), + disassembler); + readOpcode = true; + break; + } + // any identifier that isn't an opcode is mere punctuation; so we fall + // through + default: + token = new EDToken(tokenIterator->getString(), + EDToken::kTokenPunctuation, + (uint64_t)tokenIterator->getKind(), + disassembler); + break; + case AsmToken::Integer: + { + token = new EDToken(tokenIterator->getString(), + EDToken::kTokenLiteral, + (uint64_t)tokenIterator->getKind(), + disassembler); + + int64_t intVal = tokenIterator->getIntVal(); + + if (intVal < 0) + token->makeLiteral(true, -intVal); + else + token->makeLiteral(false, intVal); + break; + } + case AsmToken::Register: + { + token = new EDToken(tokenIterator->getString(), + EDToken::kTokenLiteral, + (uint64_t)tokenIterator->getKind(), + disassembler); + + token->makeRegister((unsigned)tokenIterator->getRegVal()); + break; + } + } + + if (operandIterator != parsedOperands.end() && + tokenLoc.getPointer() >= + (*operandIterator)->getStartLoc().getPointer()) { + /// operandIndex == 0 means the operand is the instruction (which the + /// AsmParser treats as an operand but edis does not). We therefore skip + /// operandIndex == 0 and subtract 1 from all other operand indices. 
+ + if (operandIndex > 0) + token->setOperandID(operandOrder[operandIndex - 1]); + } + + tokens.push_back(token); + } + + return 0; +} + +int EDToken::getString(const char*& buf) { + if (PermStr.length() == 0) { + PermStr = Str.str(); + } + buf = PermStr.c_str(); + return 0; +} diff --git a/lib/MC/MCDisassembler/EDToken.h b/lib/MC/MCDisassembler/EDToken.h new file mode 100644 index 0000000000000..6b2aeac60ba51 --- /dev/null +++ b/lib/MC/MCDisassembler/EDToken.h @@ -0,0 +1,139 @@ +//===-EDToken.h - LLVM Enhanced Disassembler --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interface for the Enhanced Disassembly library's token +// class. The token is responsible for vending information about the token, +// such as its type and logical value. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EDTOKEN_H +#define LLVM_EDTOKEN_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/System/DataTypes.h" +#include <string> +#include <vector> + +namespace llvm { + +struct EDDisassembler; + +/// EDToken - Encapsulates a single token, which can provide a string +/// representation of itself or interpret itself in various ways, depending +/// on the token type. +struct EDToken { + enum tokenType { + kTokenWhitespace, + kTokenOpcode, + kTokenLiteral, + kTokenRegister, + kTokenPunctuation + }; + + /// The parent disassembler + EDDisassembler &Disassembler; + + /// The token's string representation + llvm::StringRef Str; + /// The token's string representation, but in a form suitable for export + std::string PermStr; + /// The type of the token, as exposed through the external API + enum tokenType Type; + /// The type of the token, as recorded by the syntax-specific tokenizer + uint64_t LocalType; + /// The operand corresponding to the token, or (unsigned int)-1 if not + /// part of an operand. 
+ int OperandID; + + /// The sign if the token is a literal (1 if negative, 0 otherwise) + bool LiteralSign; + /// The absolute value if the token is a literal + uint64_t LiteralAbsoluteValue; + /// The LLVM register ID if the token is a register name + unsigned RegisterID; + + /// Constructor - Initializes an EDToken with the information common to all + /// tokens + /// + /// @arg str - The string corresponding to the token + /// @arg type - The token's type as exposed through the public API + /// @arg localType - The token's type as recorded by the tokenizer + /// @arg disassembler - The disassembler responsible for the token + EDToken(llvm::StringRef str, + enum tokenType type, + uint64_t localType, + EDDisassembler &disassembler); + + /// makeLiteral - Adds the information specific to a literal + /// @arg sign - The sign of the literal (1 if negative, 0 + /// otherwise) + /// + /// @arg absoluteValue - The absolute value of the literal + void makeLiteral(bool sign, uint64_t absoluteValue); + /// makeRegister - Adds the information specific to a register + /// + /// @arg registerID - The LLVM register ID + void makeRegister(unsigned registerID); + + /// setOperandID - Links the token to a numbered operand + /// + /// @arg operandID - The operand ID to link to + void setOperandID(int operandID); + + ~EDToken(); + + /// type - Returns the public type of the token + enum tokenType type() const; + /// localType - Returns the tokenizer-specific type of the token + uint64_t localType() const; + /// string - Returns the string representation of the token + llvm::StringRef string() const; + /// operandID - Returns the operand ID of the token + int operandID() const; + + /// literalSign - Returns the sign of the token + /// (1 if negative, 0 if positive or unsigned, -1 if it is not a literal) + int literalSign() const; + /// literalAbsoluteValue - Retrieves the absolute value of the token, and + /// returns -1 if the token is not a literal + /// @arg value - A reference to a value that is filled in with the absolute + /// value, if it is valid + int literalAbsoluteValue(uint64_t &value) const; + /// registerID - Retrieves the register ID of the token, and returns -1 if the + /// token is not a register + /// + /// @arg registerID - A reference to a value that is filled in with the + /// register ID, if it is valid + int registerID(unsigned ®isterID) const; + + /// tokenize - Tokenizes a string using the platform- and syntax-specific + /// tokenizer, and returns 0 on success (-1 on failure) + /// + /// @arg tokens - A vector that will be filled in with pointers to + /// allocated tokens + /// @arg str - The string, as outputted by the AsmPrinter + /// @arg operandOrder - The order of the operands from the operandFlags array + /// as they appear in str + /// @arg disassembler - The disassembler for the desired target and + // assembly syntax + static int tokenize(std::vector<EDToken*> &tokens, + std::string &str, + const char *operandOrder, + EDDisassembler &disassembler); + + /// getString - Directs a character pointer to the string, returning 0 on + /// success (-1 on failure) + /// @arg buf - A reference to a pointer that is set to point to the string. + /// The string is still owned by the token. 
+ int getString(const char*& buf); +}; + +} // end namespace llvm +#endif diff --git a/lib/Target/MSIL/Makefile b/lib/MC/MCDisassembler/Makefile index 70eadb32e3606..7d71cd381a7c3 100644 --- a/lib/Target/MSIL/Makefile +++ b/lib/MC/MCDisassembler/Makefile @@ -1,16 +1,14 @@ -##===- lib/Target/MSIL/Makefile ----------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure +##===- lib/MC/MCDisassembler/Makefile ----------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure # # This file is distributed under the University of Illinois Open Source # License. See LICENSE.TXT for details. -# +# ##===----------------------------------------------------------------------===## LEVEL = ../../.. -LIBRARYNAME = LLVMMSIL -DIRS = TargetInfo +LIBRARYNAME = LLVMMCDisassembler include $(LEVEL)/Makefile.common -CompileCommonOpts := $(CompileCommonOpts) -Wno-format diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp new file mode 100644 index 0000000000000..2da71f96c676a --- /dev/null +++ b/lib/MC/MCDwarf.cpp @@ -0,0 +1,21 @@ +//===- lib/MC/MCDwarf.cpp - MCDwarf implementation ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCDwarf.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +void MCDwarfFile::print(raw_ostream &OS) const { + OS << '"' << getName() << '"'; +} + +void MCDwarfFile::dump() const { + print(dbgs()); +} diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp new file mode 100644 index 0000000000000..570c3917ab46f --- /dev/null +++ b/lib/MC/MCELFStreamer.cpp @@ -0,0 +1,408 @@ +//===- lib/MC/MCELFStreamer.cpp - ELF Object Output ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file assembles .s files and emits ELF .o object files. 
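+//
+// A rough usage sketch: the streamer is normally obtained through the
+// createELFStreamer() factory defined at the bottom of this file, then
+// driven like any MCStreamer. The names Ctx, TAB, OS, Emitter, TextSec,
+// and Inst are assumed to be in scope:
+//
+//   MCStreamer *S = createELFStreamer(Ctx, TAB, OS, Emitter, false);
+//   S->SwitchSection(TextSec);
+//   S->EmitInstruction(Inst);
+//   S->Finish();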
+// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCStreamer.h" + +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCELFSymbolFlags.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetAsmBackend.h" + +using namespace llvm; + +namespace { + +class MCELFStreamer : public MCObjectStreamer { + void EmitInstToFragment(const MCInst &Inst); + void EmitInstToData(const MCInst &Inst); +public: + MCELFStreamer(MCContext &Context, TargetAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter) + : MCObjectStreamer(Context, TAB, OS, Emitter) {} + + ~MCELFStreamer() {} + + /// @name MCStreamer Interface + /// @{ + + virtual void EmitLabel(MCSymbol *Symbol); + virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); + virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); + virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute); + virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) { + assert(0 && "ELF doesn't support this directive"); + } + virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment); + virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol) { + assert(0 && "ELF doesn't support this directive"); + } + + virtual void EmitCOFFSymbolStorageClass(int StorageClass) { + assert(0 && "ELF doesn't support this directive"); + } + + virtual void EmitCOFFSymbolType(int Type) { + assert(0 && "ELF doesn't support this directive"); + } + + virtual void EndCOFFSymbolDef() { + assert(0 && "ELF doesn't support this directive"); + } + + virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) { + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); + SD.setSize(Value); + } + + virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size) { + assert(0 && "ELF doesn't support this directive"); + } + virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, + unsigned Size = 0, unsigned ByteAlignment = 0) { + assert(0 && "ELF doesn't support this directive"); + } + virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, + uint64_t Size, unsigned ByteAlignment = 0) { + assert(0 && "ELF doesn't support this directive"); + } + virtual void EmitBytes(StringRef Data, unsigned AddrSpace); + virtual void EmitValue(const MCExpr *Value, unsigned Size,unsigned AddrSpace); + virtual void EmitGPRel32Value(const MCExpr *Value) { + assert(0 && "ELF doesn't support this directive"); + } + virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0, + unsigned ValueSize = 1, + unsigned MaxBytesToEmit = 0); + virtual void EmitCodeAlignment(unsigned ByteAlignment, + unsigned MaxBytesToEmit = 0); + virtual void EmitValueToOffset(const MCExpr *Offset, + unsigned char Value = 0); + + virtual void EmitFileDirective(StringRef Filename); + virtual void EmitDwarfFileDirective(unsigned FileNo, StringRef Filename) { + DEBUG(dbgs() << "FIXME: MCELFStreamer:EmitDwarfFileDirective not implemented\n"); + } + + virtual void EmitInstruction(const MCInst &Inst); + virtual void Finish(); + + /// @} +}; + +} // end anonymous namespace. 
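+
+// Aside: of the ELF-specific directives declared above, .size is one of the
+// few actually implemented rather than asserted on. A hedged sketch of how a
+// client might drive it, with Sym, End, and Ctx as hypothetical names for a
+// symbol, its end label, and the MCContext:
+//
+//   const MCExpr *SizeExpr = MCBinaryExpr::CreateSub(
+//       MCSymbolRefExpr::Create(End, Ctx),
+//       MCSymbolRefExpr::Create(Sym, Ctx), Ctx);
+//   S->EmitELFSize(Sym, SizeExpr);   // equivalent to: .size Sym, End-Sym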
+
+void MCELFStreamer::EmitLabel(MCSymbol *Symbol) {
+  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+
+  // FIXME: This is wasteful, we don't necessarily need to create a data
+  // fragment. Instead, we should mark the symbol as pointing into the data
+  // fragment if it exists, otherwise we should just queue the label and set its
+  // fragment pointer when we emit the next fragment.
+  MCDataFragment *F = getOrCreateDataFragment();
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+  assert(!SD.getFragment() && "Unexpected fragment on symbol data!");
+  SD.setFragment(F);
+  SD.setOffset(F->getContents().size());
+
+  Symbol->setSection(*CurSection);
+}
+
+void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+  switch (Flag) {
+  case MCAF_SubsectionsViaSymbols:
+    getAssembler().setSubsectionsViaSymbols(true);
+    return;
+  }
+
+  assert(0 && "invalid assembler flag!");
+}
+
+void MCELFStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+  // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
+  // MCObjectStreamer.
+  // FIXME: Lift context changes into super class.
+  getAssembler().getOrCreateSymbolData(*Symbol);
+  Symbol->setVariableValue(AddValueSymbols(Value));
+}
+
+void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+                                        MCSymbolAttr Attribute) {
+  // Indirect symbols are handled differently, to match how 'as' handles
+  // them. This makes writing matching .o files easier.
+  if (Attribute == MCSA_IndirectSymbol) {
+    // Note that we intentionally cannot use the symbol data here; this is
+    // important for matching the string table that 'as' generates.
+    IndirectSymbolData ISD;
+    ISD.Symbol = Symbol;
+    ISD.SectionData = getCurrentSectionData();
+    getAssembler().getIndirectSymbols().push_back(ISD);
+    return;
+  }
+
+  // Adding a symbol attribute always introduces the symbol; note that an
+  // important side effect of calling getOrCreateSymbolData here is to register
+  // the symbol with the assembler.
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+
+  // The implementation of symbol attributes is designed to match 'as', but it
+  // leaves much to be desired. It doesn't really make sense to arbitrarily add
+  // and remove flags, but 'as' allows this (in particular, see .desc).
+  //
+  // In the future it might be worth trying to make these operations better
+  // defined.
+ switch (Attribute) { + case MCSA_LazyReference: + case MCSA_Reference: + case MCSA_NoDeadStrip: + case MCSA_PrivateExtern: + case MCSA_WeakDefinition: + case MCSA_WeakDefAutoPrivate: + case MCSA_Invalid: + case MCSA_ELF_TypeIndFunction: + case MCSA_IndirectSymbol: + assert(0 && "Invalid symbol attribute for ELF!"); + break; + + case MCSA_Global: + SD.setFlags(SD.getFlags() | ELF_STB_Global); + SD.setExternal(true); + break; + + case MCSA_WeakReference: + case MCSA_Weak: + SD.setFlags(SD.getFlags() | ELF_STB_Weak); + break; + + case MCSA_Local: + SD.setFlags(SD.getFlags() | ELF_STB_Local); + break; + + case MCSA_ELF_TypeFunction: + SD.setFlags(SD.getFlags() | ELF_STT_Func); + break; + + case MCSA_ELF_TypeObject: + SD.setFlags(SD.getFlags() | ELF_STT_Object); + break; + + case MCSA_ELF_TypeTLS: + SD.setFlags(SD.getFlags() | ELF_STT_Tls); + break; + + case MCSA_ELF_TypeCommon: + SD.setFlags(SD.getFlags() | ELF_STT_Common); + break; + + case MCSA_ELF_TypeNoType: + SD.setFlags(SD.getFlags() | ELF_STT_Notype); + break; + + case MCSA_Protected: + SD.setFlags(SD.getFlags() | ELF_STV_Protected); + break; + + case MCSA_Hidden: + SD.setFlags(SD.getFlags() | ELF_STV_Hidden); + break; + + case MCSA_Internal: + SD.setFlags(SD.getFlags() | ELF_STV_Internal); + break; + } +} + +void MCELFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) { + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); + + if ((SD.getFlags() & (0xf << ELF_STB_Shift)) == ELF_STB_Local) { + const MCSection *Section = getAssembler().getContext().getELFSection(".bss", + MCSectionELF::SHT_NOBITS, + MCSectionELF::SHF_WRITE | + MCSectionELF::SHF_ALLOC, + SectionKind::getBSS()); + + MCSectionData &SectData = getAssembler().getOrCreateSectionData(*Section); + MCFragment *F = new MCFillFragment(0, 0, Size, &SectData); + SD.setFragment(F); + Symbol->setSection(*Section); + SD.setSize(MCConstantExpr::Create(Size, getContext())); + } + + SD.setFlags(SD.getFlags() | ELF_STB_Global); + SD.setExternal(true); + + SD.setCommon(Size, ByteAlignment); +} + +void MCELFStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) { + // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into + // MCObjectStreamer. + getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end()); +} + +void MCELFStreamer::EmitValue(const MCExpr *Value, unsigned Size, + unsigned AddrSpace) { + // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into + // MCObjectStreamer. + MCDataFragment *DF = getOrCreateDataFragment(); + + // Avoid fixups when possible. + int64_t AbsValue; + if (AddValueSymbols(Value)->EvaluateAsAbsolute(AbsValue)) { + // FIXME: Endianness assumption. + for (unsigned i = 0; i != Size; ++i) + DF->getContents().push_back(uint8_t(AbsValue >> (i * 8))); + } else { + DF->addFixup(MCFixup::Create(DF->getContents().size(), AddValueSymbols(Value), + MCFixup::getKindForSize(Size))); + DF->getContents().resize(DF->getContents().size() + Size, 0); + } +} + +void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment, + int64_t Value, unsigned ValueSize, + unsigned MaxBytesToEmit) { + // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into + // MCObjectStreamer. + if (MaxBytesToEmit == 0) + MaxBytesToEmit = ByteAlignment; + new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit, + getCurrentSectionData()); + + // Update the maximum alignment on the current section if necessary. 
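+  // (The MCAlignFragment created above only reserves padding bytes inside
+  // the section; the section's own alignment attribute is tracked separately
+  // on the MCSectionData, which is why it is raised explicitly below.)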
+ if (ByteAlignment > getCurrentSectionData()->getAlignment()) + getCurrentSectionData()->setAlignment(ByteAlignment); +} + +void MCELFStreamer::EmitCodeAlignment(unsigned ByteAlignment, + unsigned MaxBytesToEmit) { + // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into + // MCObjectStreamer. + if (MaxBytesToEmit == 0) + MaxBytesToEmit = ByteAlignment; + MCAlignFragment *F = new MCAlignFragment(ByteAlignment, 0, 1, MaxBytesToEmit, + getCurrentSectionData()); + F->setEmitNops(true); + + // Update the maximum alignment on the current section if necessary. + if (ByteAlignment > getCurrentSectionData()->getAlignment()) + getCurrentSectionData()->setAlignment(ByteAlignment); +} + +void MCELFStreamer::EmitValueToOffset(const MCExpr *Offset, + unsigned char Value) { + // TODO: This is exactly the same as MCMachOStreamer. Consider merging into + // MCObjectStreamer. + new MCOrgFragment(*Offset, Value, getCurrentSectionData()); +} + +// Add a symbol for the file name of this module. This is the second +// entry in the module's symbol table (the first being the null symbol). +void MCELFStreamer::EmitFileDirective(StringRef Filename) { + MCSymbol *Symbol = getAssembler().getContext().GetOrCreateSymbol(Filename); + Symbol->setSection(*CurSection); + Symbol->setAbsolute(); + + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); + + SD.setFlags(ELF_STT_File | ELF_STB_Local | ELF_STV_Default); +} + +void MCELFStreamer::EmitInstToFragment(const MCInst &Inst) { + MCInstFragment *IF = new MCInstFragment(Inst, getCurrentSectionData()); + + // Add the fixups and data. + // + // FIXME: Revisit this design decision when relaxation is done, we may be + // able to get away with not storing any extra data in the MCInst. + SmallVector<MCFixup, 4> Fixups; + SmallString<256> Code; + raw_svector_ostream VecOS(Code); + getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, Fixups); + VecOS.flush(); + + IF->getCode() = Code; + IF->getFixups() = Fixups; +} + +void MCELFStreamer::EmitInstToData(const MCInst &Inst) { + MCDataFragment *DF = getOrCreateDataFragment(); + + SmallVector<MCFixup, 4> Fixups; + SmallString<256> Code; + raw_svector_ostream VecOS(Code); + getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, Fixups); + VecOS.flush(); + + // Add the fixups and data. + for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { + Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size()); + DF->addFixup(Fixups[i]); + } + DF->getContents().append(Code.begin(), Code.end()); +} + +void MCELFStreamer::EmitInstruction(const MCInst &Inst) { + // Scan for values. + for (unsigned i = 0; i != Inst.getNumOperands(); ++i) + if (Inst.getOperand(i).isExpr()) + AddValueSymbols(Inst.getOperand(i).getExpr()); + + getCurrentSectionData()->setHasInstructions(true); + + // If this instruction doesn't need relaxation, just emit it as data. + if (!getAssembler().getBackend().MayNeedRelaxation(Inst)) { + EmitInstToData(Inst); + return; + } + + // Otherwise, if we are relaxing everything, relax the instruction as much as + // possible and emit it as data. + if (getAssembler().getRelaxAll()) { + MCInst Relaxed; + getAssembler().getBackend().RelaxInstruction(Inst, Relaxed); + while (getAssembler().getBackend().MayNeedRelaxation(Relaxed)) + getAssembler().getBackend().RelaxInstruction(Relaxed, Relaxed); + EmitInstToData(Relaxed); + return; + } + + // Otherwise emit to a separate fragment. 
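+  // (An MCInstFragment keeps the MCInst together with its current encoding
+  // and fixups, so the assembler can re-encode it during relaxation; a plain
+  // data fragment would freeze the bytes.)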
+ EmitInstToFragment(Inst); +} + +void MCELFStreamer::Finish() { + getAssembler().Finish(); +} + +MCStreamer *llvm::createELFStreamer(MCContext &Context, TargetAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *CE, + bool RelaxAll) { + MCELFStreamer *S = new MCELFStreamer(Context, TAB, OS, CE); + if (RelaxAll) + S->getAssembler().setRelaxAll(true); + return S; +} diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index 44bc267c11c29..671874df2c69e 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -18,6 +18,8 @@ #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCMachOSymbolFlags.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetAsmBackend.h" @@ -28,58 +30,19 @@ namespace { class MCMachOStreamer : public MCObjectStreamer { private: - MCFragment *getCurrentFragment() const { - assert(getCurrentSectionData() && "No current section!"); - - if (!getCurrentSectionData()->empty()) - return &getCurrentSectionData()->getFragmentList().back(); - - return 0; - } - - /// Get a data fragment to write into, creating a new one if the current - /// fragment is not a data fragment. - MCDataFragment *getOrCreateDataFragment() const { - MCDataFragment *F = dyn_cast_or_null<MCDataFragment>(getCurrentFragment()); - if (!F) - F = new MCDataFragment(getCurrentSectionData()); - return F; - } - void EmitInstToFragment(const MCInst &Inst); void EmitInstToData(const MCInst &Inst); + // FIXME: These will likely moved to a better place. + void MakeLineEntryForSection(const MCSection *Section); + const MCExpr * MakeStartMinusEndExpr(MCSymbol *Start, MCSymbol *End, + int IntVal); + void EmitDwarfFileTable(void); public: MCMachOStreamer(MCContext &Context, TargetAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter) : MCObjectStreamer(Context, TAB, OS, Emitter) {} - const MCExpr *AddValueSymbols(const MCExpr *Value) { - switch (Value->getKind()) { - case MCExpr::Target: assert(0 && "Can't handle target exprs yet!"); - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value); - AddValueSymbols(BE->getLHS()); - AddValueSymbols(BE->getRHS()); - break; - } - - case MCExpr::SymbolRef: - getAssembler().getOrCreateSymbolData( - cast<MCSymbolRefExpr>(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbols(cast<MCUnaryExpr>(Value)->getSubExpr()); - break; - } - - return Value; - } - /// @name MCStreamer Interface /// @{ @@ -126,10 +89,16 @@ public: unsigned char Value = 0); virtual void EmitFileDirective(StringRef Filename) { - report_fatal_error("unsupported directive: '.file'"); + // FIXME: Just ignore the .file; it isn't important enough to fail the + // entire assembly. + + //report_fatal_error("unsupported directive: '.file'"); } virtual void EmitDwarfFileDirective(unsigned FileNo, StringRef Filename) { - report_fatal_error("unsupported directive: '.file'"); + // FIXME: Just ignore the .file; it isn't important enough to fail the + // entire assembly. + + //report_fatal_error("unsupported directive: '.file'"); } virtual void EmitInstruction(const MCInst &Inst); @@ -142,6 +111,8 @@ public: } // end anonymous namespace. void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) { + // TODO: This is almost exactly the same as WinCOFFStreamer. Consider merging + // into MCObjectStreamer. 
   assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
   assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
   assert(CurSection && "Cannot emit before setting section!");
@@ -185,6 +156,8 @@ void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
 }
 
 void MCMachOStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+  // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
+  // MCObjectStreamer.
   // FIXME: Lift context changes into super class.
   getAssembler().getOrCreateSymbolData(*Symbol);
   Symbol->setVariableValue(AddValueSymbols(Value));
@@ -335,11 +308,15 @@ void MCMachOStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
 }
 
 void MCMachOStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
+  // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
+  // MCObjectStreamer.
   getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end());
 }
 
 void MCMachOStreamer::EmitValue(const MCExpr *Value, unsigned Size,
                                 unsigned AddrSpace) {
+  // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
+  // MCObjectStreamer.
   MCDataFragment *DF = getOrCreateDataFragment();
 
   // Avoid fixups when possible.
@@ -359,6 +336,8 @@ void MCMachOStreamer::EmitValue(const MCExpr *Value, unsigned Size,
 void MCMachOStreamer::EmitValueToAlignment(unsigned ByteAlignment,
                                            int64_t Value, unsigned ValueSize,
                                            unsigned MaxBytesToEmit) {
+  // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
+  // MCObjectStreamer.
   if (MaxBytesToEmit == 0)
     MaxBytesToEmit = ByteAlignment;
   new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit,
@@ -371,6 +350,8 @@ void MCMachOStreamer::EmitValueToAlignment(unsigned ByteAlignment,
 void MCMachOStreamer::EmitCodeAlignment(unsigned ByteAlignment,
                                         unsigned MaxBytesToEmit) {
+  // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
+  // MCObjectStreamer.
   if (MaxBytesToEmit == 0)
     MaxBytesToEmit = ByteAlignment;
   MCAlignFragment *F = new MCAlignFragment(ByteAlignment, 0, 1, MaxBytesToEmit,
@@ -429,6 +410,10 @@ void MCMachOStreamer::EmitInstruction(const MCInst &Inst) {
 
   getCurrentSectionData()->setHasInstructions(true);
 
+  // Now that a machine instruction has been assembled into this section, make
+  // a line entry for any .loc directive that has been seen.
+  MakeLineEntryForSection(getCurrentSection());
+
   // If this instruction doesn't need relaxation, just emit it as data.
   if (!getAssembler().getBackend().MayNeedRelaxation(Inst)) {
     EmitInstToData(Inst);
@@ -450,7 +435,207 @@ void MCMachOStreamer::EmitInstruction(const MCInst &Inst) {
   EmitInstToFragment(Inst);
 }
 
+//
+// This is called when an instruction is assembled into the specified section.
+// If the last .loc directive has information for which no line entry has yet
+// been made, a line entry is made for it here.
+//
+void MCMachOStreamer::MakeLineEntryForSection(const MCSection *Section) {
+  if (!getContext().getDwarfLocSeen())
+    return;
+
+  // Create a symbol in the current section for use in the line entry.
+  MCSymbol *LineSym = getContext().CreateTempSymbol();
+  // Set the value of the symbol to use for the MCLineEntry.
+  EmitLabel(LineSym);
+
+  // Get the current .loc info saved in the context.
+  const MCDwarfLoc &DwarfLoc = getContext().getCurrentDwarfLoc();
+
+  // Create a (local) line entry with the symbol and the current .loc info.
+  MCLineEntry LineEntry(LineSym, DwarfLoc);
+
+  // Clear DwarfLocSeen, noting that the current .loc info has now been used.
+  getContext().clearDwarfLocSeen();
+
+  // Get the MCLineSection for this section; if one does not exist for this
+  // section, create it.
+  DenseMap<const MCSection *, MCLineSection *> &MCLineSections =
+    getContext().getMCLineSections();
+  MCLineSection *LineSection = MCLineSections[Section];
+  if (!LineSection) {
+    // Create a new MCLineSection. This will be deleted after the dwarf line
+    // table is created using it by iterating through the MCLineSections
+    // DenseMap.
+    LineSection = new MCLineSection;
+    // Save a pointer to the new LineSection into the MCLineSections DenseMap.
+    MCLineSections[Section] = LineSection;
+  }
+
+  // Add the line entry to this section's entries.
+  LineSection->addLineEntry(LineEntry);
+}
+
+//
+// This helper routine returns an expression of End - Start - IntVal for use
+// by EmitDwarfFileTable() below.
+//
+const MCExpr * MCMachOStreamer::MakeStartMinusEndExpr(MCSymbol *Start,
+                                                      MCSymbol *End,
+                                                      int IntVal) {
+  MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+  const MCExpr *Res =
+    MCSymbolRefExpr::Create(End, Variant, getContext());
+  const MCExpr *RHS =
+    MCSymbolRefExpr::Create(Start, Variant, getContext());
+  const MCExpr *Res1 =
+    MCBinaryExpr::Create(MCBinaryExpr::Sub, Res, RHS, getContext());
+  const MCExpr *Res2 =
+    MCConstantExpr::Create(IntVal, getContext());
+  const MCExpr *Res3 =
+    MCBinaryExpr::Create(MCBinaryExpr::Sub, Res1, Res2, getContext());
+  return Res3;
+}
+
+//
+// This emits the Dwarf file (and eventually the line) table.
+//
+void MCMachOStreamer::EmitDwarfFileTable(void) {
+  // For now make sure we don't put out the Dwarf file table if no .file
+  // directives were seen.
+  const std::vector<MCDwarfFile *> &MCDwarfFiles =
+    getContext().getMCDwarfFiles();
+  if (MCDwarfFiles.size() == 0)
+    return;
+
+  // This is the Mach-O section; for ELF it is the .debug_line section.
+  SwitchSection(getContext().getMachOSection("__DWARF", "__debug_line",
+                                             MCSectionMachO::S_ATTR_DEBUG,
+                                             0, SectionKind::getDataRelLocal()));
+
+  // Create a symbol at the beginning of this section.
+  MCSymbol *LineStartSym = getContext().CreateTempSymbol();
+  // Set the value of the symbol, as we are at the start of the section.
+  EmitLabel(LineStartSym);
+
+  // Create a symbol for the end of the section (to be set when we get there).
+  MCSymbol *LineEndSym = getContext().CreateTempSymbol();
+
+  // The first 4 bytes are the total length of the information for this
+  // compilation unit (not including these 4 bytes for the length).
+  EmitValue(MakeStartMinusEndExpr(LineStartSym, LineEndSym, 4), 4, 0);
+
+  // The next 2 bytes are the version, which is Dwarf 2.
+  EmitIntValue(2, 2);
+
+  // Create a symbol for the end of the prologue (to be set when we get there).
+  MCSymbol *ProEndSym = getContext().CreateTempSymbol(); // Lprologue_end
+
+  // The next 4 bytes are the length of the prologue: from the start of the
+  // section to the end of the prologue, not including the 4 bytes for the
+  // total length, the 2 bytes for the version, or these 4 bytes for the
+  // length of the prologue itself.
+  EmitValue(MakeStartMinusEndExpr(LineStartSym, ProEndSym, (4 + 2 + 4)), 4, 0);
+
+  // The parameters of the state machine are next.
+  // Define the architecture-dependent minimum instruction length (in
+  // bytes). This value should rather be too small than too big.
+  // DWARF2_LINE_MIN_INSN_LENGTH
+  EmitIntValue(1, 1);
+  // Flag that indicates the initial value of the is_stmt flag.
+  // DWARF2_LINE_DEFAULT_IS_STMT
+  EmitIntValue(1, 1);
+  // Minimum line offset in a special line info opcode. This value
+  // was chosen to give a reasonable range of values.
+  // DWARF2_LINE_BASE
+  EmitIntValue(uint64_t(-5), 1);
+  // Range of line offsets in a special line info opcode.
+  // DWARF2_LINE_RANGE
+  EmitIntValue(14, 1);
+  // First special line opcode - leave room for the standard opcodes.
+  // DWARF2_LINE_OPCODE_BASE
+  EmitIntValue(13, 1);
+
+  // Standard opcode lengths
+  EmitIntValue(0, 1); // length of DW_LNS_copy
+  EmitIntValue(1, 1); // length of DW_LNS_advance_pc
+  EmitIntValue(1, 1); // length of DW_LNS_advance_line
+  EmitIntValue(1, 1); // length of DW_LNS_set_file
+  EmitIntValue(1, 1); // length of DW_LNS_set_column
+  EmitIntValue(0, 1); // length of DW_LNS_negate_stmt
+  EmitIntValue(0, 1); // length of DW_LNS_set_basic_block
+  EmitIntValue(0, 1); // length of DW_LNS_const_add_pc
+  EmitIntValue(1, 1); // length of DW_LNS_fixed_advance_pc
+  EmitIntValue(0, 1); // length of DW_LNS_set_prologue_end
+  EmitIntValue(0, 1); // length of DW_LNS_set_epilogue_begin
+  EmitIntValue(1, 1); // length of DW_LNS_set_isa
+
+  // Put out the directory and file tables.
+
+  // First, the directory table.
+  const std::vector<StringRef> &MCDwarfDirs =
+    getContext().getMCDwarfDirs();
+  for (unsigned i = 0; i < MCDwarfDirs.size(); i++) {
+    EmitBytes(MCDwarfDirs[i], 0); // the DirectoryName
+    EmitBytes(StringRef("\0", 1), 0); // the null termination of the string
+  }
+  EmitIntValue(0, 1); // Terminate the directory list
+
+  // Second, the file table.
+  for (unsigned i = 1; i < MCDwarfFiles.size(); i++) {
+    EmitBytes(MCDwarfFiles[i]->getName(), 0); // FileName
+    EmitBytes(StringRef("\0", 1), 0); // the null termination of the string
+    // FIXME: the directory number should be a .uleb128, not a .byte
+    EmitIntValue(MCDwarfFiles[i]->getDirIndex(), 1);
+    EmitIntValue(0, 1); // last modification timestamp (always 0)
+    EmitIntValue(0, 1); // filesize (always 0)
+  }
+  EmitIntValue(0, 1); // Terminate the file list
+
+  // This is the end of the prologue, so set the value of the symbol at the
+  // end of the prologue (that was used in a previous expression).
+  EmitLabel(ProEndSym);
+
+  // TODO: This is the point where the line tables would be emitted.
+
+  // Delete the MCLineSections that were created in
+  // MCMachOStreamer::MakeLineEntryForSection() and used to emit the line
+  // tables.
+  DenseMap<const MCSection *, MCLineSection *> &MCLineSections =
+    getContext().getMCLineSections();
+  for (DenseMap<const MCSection *, MCLineSection *>::iterator it =
+       MCLineSections.begin(), ie = MCLineSections.end(); it != ie; ++it) {
+    delete it->second;
+  }
+
+  // If no line tables were emitted, emit the following DW_LNE_set_address
+  // sequence to set the address to zero.
+  // TODO: test for 32-bit or 64-bit output.
+  // This is the sequence for 32-bit code.
+  EmitIntValue(0, 1);
+  EmitIntValue(5, 1);
+  EmitIntValue(2, 1);
+  EmitIntValue(0, 1);
+  EmitIntValue(0, 1);
+  EmitIntValue(0, 1);
+  EmitIntValue(0, 1);
+
+  // Lastly, emit the DW_LNE_end_sequence, which consists of the 3 bytes
+  // '00 01 01': 00 is the code for extended opcodes, followed by a ULEB128
+  // length of the extended opcode (01), and the DW_LNE_end_sequence opcode
+  // itself (01).
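+  // (For reference, every extended opcode shares this framing: a 0x00 escape
+  // byte, a ULEB128 payload length, then the sub-opcode and its operands.
+  // The DW_LNE_set_address sequence above is 00 05 02 plus four zero address
+  // bytes; the end-of-sequence marker below is the 00 01 01 just described.)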
+ EmitIntValue(0, 1); // DW_LNS_extended_op + EmitIntValue(1, 1); // ULEB128 length of the extended opcode + EmitIntValue(1, 1); // DW_LNE_end_sequence + + // This is the end of the section, so set the value of the symbol at the end + // of this section (that was used in a previous expression). + EmitLabel(LineEndSym); +} + void MCMachOStreamer::Finish() { + // Dump out the dwarf file and directory tables (soon to include line table) + EmitDwarfFileTable(); + // We have to set the fragment atom associations so we can relax properly for // Mach-O. diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp index 5332ade211535..f7a2f20ca4bc3 100644 --- a/lib/MC/MCNullStreamer.cpp +++ b/lib/MC/MCNullStreamer.cpp @@ -26,6 +26,7 @@ namespace { /// @{ virtual void SwitchSection(const MCSection *Section) { + PrevSection = CurSection; CurSection = Section; } diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index d3f7f7783ffa7..2b2385ef9156d 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -9,7 +9,11 @@ #include "llvm/MC/MCObjectStreamer.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/Target/TargetAsmBackend.h" using namespace llvm; MCObjectStreamer::MCObjectStreamer(MCContext &Context, TargetAsmBackend &TAB, @@ -21,15 +25,59 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context, TargetAsmBackend &TAB, } MCObjectStreamer::~MCObjectStreamer() { + delete &Assembler->getBackend(); + delete &Assembler->getEmitter(); delete Assembler; } +MCFragment *MCObjectStreamer::getCurrentFragment() const { + assert(getCurrentSectionData() && "No current section!"); + + if (!getCurrentSectionData()->empty()) + return &getCurrentSectionData()->getFragmentList().back(); + + return 0; +} + +MCDataFragment *MCObjectStreamer::getOrCreateDataFragment() const { + MCDataFragment *F = dyn_cast_or_null<MCDataFragment>(getCurrentFragment()); + if (!F) + F = new MCDataFragment(getCurrentSectionData()); + return F; +} + +const MCExpr *MCObjectStreamer::AddValueSymbols(const MCExpr *Value) { + switch (Value->getKind()) { + case MCExpr::Target: llvm_unreachable("Can't handle target exprs yet!"); + case MCExpr::Constant: + break; + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value); + AddValueSymbols(BE->getLHS()); + AddValueSymbols(BE->getRHS()); + break; + } + + case MCExpr::SymbolRef: + Assembler->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol()); + break; + + case MCExpr::Unary: + AddValueSymbols(cast<MCUnaryExpr>(Value)->getSubExpr()); + break; + } + + return Value; +} + void MCObjectStreamer::SwitchSection(const MCSection *Section) { assert(Section && "Cannot switch to a null section!"); // If already in this section, then this is a noop. 
if (Section == CurSection) return; + PrevSection = CurSection; CurSection = Section; CurSectionData = &getAssembler().getOrCreateSectionData(*Section); } diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp index 465d98382877b..086df081a938f 100644 --- a/lib/MC/MCParser/AsmLexer.cpp +++ b/lib/MC/MCParser/AsmLexer.cpp @@ -117,6 +117,13 @@ AsmToken AsmLexer::LexLineComment() { return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); } +static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { + if (CurPtr[0] == 'L' && CurPtr[1] == 'L') + CurPtr += 2; + if (CurPtr[0] == 'U' && CurPtr[1] == 'L' && CurPtr[2] == 'L') + CurPtr += 3; +} + /// LexDigit: First character is [0-9]. /// Local Label: [0-9][:] @@ -133,7 +140,7 @@ AsmToken AsmLexer::LexDigit() { ++CurPtr; StringRef Result(TokStart, CurPtr - TokStart); - + long long Value; if (Result.getAsInteger(10, Value)) { // We have to handle minint_as_a_positive_value specially, because @@ -143,6 +150,11 @@ AsmToken AsmLexer::LexDigit() { else return ReturnError(TokStart, "Invalid decimal number"); } + + // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL + // suffixes on integer literals. + SkipIgnoredIntegerSuffix(CurPtr); + return AsmToken(AsmToken::Integer, Result, Value); } @@ -165,9 +177,13 @@ AsmToken AsmLexer::LexDigit() { StringRef Result(TokStart, CurPtr - TokStart); long long Value; - if (Result.getAsInteger(2, Value)) + if (Result.substr(2).getAsInteger(2, Value)) return ReturnError(TokStart, "Invalid binary number"); + // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL + // suffixes on integer literals. + SkipIgnoredIntegerSuffix(CurPtr); + return AsmToken(AsmToken::Integer, Result, Value); } @@ -185,6 +201,10 @@ AsmToken AsmLexer::LexDigit() { if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) return ReturnError(TokStart, "Invalid hexadecimal number"); + // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL + // suffixes on integer literals. + SkipIgnoredIntegerSuffix(CurPtr); + return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart), (int64_t)Result); } @@ -198,6 +218,10 @@ AsmToken AsmLexer::LexDigit() { if (Result.getAsInteger(8, Value)) return ReturnError(TokStart, "Invalid octal number"); + // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL + // suffixes on integer literals. 
+  SkipIgnoredIntegerSuffix(CurPtr);
+
   return AsmToken(AsmToken::Integer, Result, Value);
 }
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index e0949bd2856f7..f83cd5eb2a160 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -11,47 +11,237 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCParser/AsmParser.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/AsmCond.h"
+#include "llvm/MC/MCParser/AsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCDwarf.h"
 #include "llvm/Support/Compiler.h"
-#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetAsmParser.h"
+#include <vector>
 using namespace llvm;
 
 namespace {
 
+/// \brief Helper class for tracking macro definitions.
+struct Macro {
+  StringRef Name;
+  StringRef Body;
+
+public:
+  Macro(StringRef N, StringRef B) : Name(N), Body(B) {}
+};
+
+/// \brief Helper class for storing information about an active macro
+/// instantiation.
+struct MacroInstantiation {
+  /// The macro being instantiated.
+  const Macro *TheMacro;
+
+  /// The macro instantiation with substitutions.
+  MemoryBuffer *Instantiation;
+
+  /// The location of the instantiation.
+  SMLoc InstantiationLoc;
+
+  /// The location where parsing should resume upon instantiation completion.
+  SMLoc ExitLoc;
+
+public:
+  MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
+                     const std::vector<std::vector<AsmToken> > &A);
+};
+
+/// \brief The concrete assembly parser instance.
+class AsmParser : public MCAsmParser {
+  friend class GenericAsmParser;
+
+  AsmParser(const AsmParser &);   // DO NOT IMPLEMENT
+  void operator=(const AsmParser &);  // DO NOT IMPLEMENT
+private:
+  AsmLexer Lexer;
+  MCContext &Ctx;
+  MCStreamer &Out;
+  SourceMgr &SrcMgr;
+  MCAsmParserExtension *GenericParser;
+  MCAsmParserExtension *PlatformParser;
+
+  /// This is the current buffer index we're lexing from as managed by the
+  /// SourceMgr object.
+  int CurBuffer;
+
+  AsmCond TheCondState;
+  std::vector<AsmCond> TheCondStack;
+
+  /// DirectiveMap - This is a table of handlers for directives. Each handler
+  /// is invoked after the directive identifier is read and is responsible for
+  /// parsing and validating the rest of the directive. The handler is passed
+  /// in the directive name and the location of the directive keyword.
+  StringMap<std::pair<MCAsmParserExtension*, DirectiveHandler> > DirectiveMap;
+
+  /// MacroMap - Map of currently defined macros.
+  StringMap<Macro*> MacroMap;
+
+  /// ActiveMacros - Stack of active macro instantiations.
+  std::vector<MacroInstantiation*> ActiveMacros;
+
+  /// Boolean tracking whether macro substitution is enabled.
+  unsigned MacrosEnabled : 1;
+
+public:
+  AsmParser(const Target &T, SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
+            const MCAsmInfo &MAI);
+  ~AsmParser();
+
+  virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false);
+
+  void AddDirectiveHandler(MCAsmParserExtension *Object,
+                           StringRef Directive,
+                           DirectiveHandler Handler) {
+    DirectiveMap[Directive] = std::make_pair(Object, Handler);
+  }
+
+public:
+  /// @name MCAsmParser Interface
+  /// {
+
+  virtual SourceMgr &getSourceManager() { return SrcMgr; }
+  virtual MCAsmLexer &getLexer() { return Lexer; }
+  virtual MCContext &getContext() { return Ctx; }
+  virtual MCStreamer &getStreamer() { return Out; }
+
+  virtual void Warning(SMLoc L, const Twine &Msg);
+  virtual bool Error(SMLoc L, const Twine &Msg);
+
+  const AsmToken &Lex();
+
+  bool ParseExpression(const MCExpr *&Res);
+  virtual bool ParseExpression(const MCExpr *&Res, SMLoc &EndLoc);
+  virtual bool ParseParenExpression(const MCExpr *&Res, SMLoc &EndLoc);
+  virtual bool ParseAbsoluteExpression(int64_t &Res);
+
+  /// }
+
+private:
+  bool ParseStatement();
+
+  bool HandleMacroEntry(StringRef Name, SMLoc NameLoc, const Macro *M);
+  void HandleMacroExit();
+
+  void PrintMacroInstantiations();
+  void PrintMessage(SMLoc Loc, const std::string &Msg, const char *Type) const;
+
+  /// EnterIncludeFile - Enter the specified file. This returns true on failure.
+  bool EnterIncludeFile(const std::string &Filename);
+
+  /// \brief Reset the current lexer position to that given by \arg Loc. The
+  /// current token is not set; clients should ensure Lex() is called
+  /// subsequently.
+  void JumpToLoc(SMLoc Loc);
+
+  void EatToEndOfStatement();
+
+  /// \brief Parse up to the end of statement and return the contents from the
+  /// current token until the end of the statement; the current token on exit
+  /// will be either the EndOfStatement or EOF.
+  StringRef ParseStringToEndOfStatement();
+
+  bool ParseAssignment(StringRef Name);
+
+  bool ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc);
+  bool ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
+  bool ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc);
+
+  /// ParseIdentifier - Parse an identifier or string (as a quoted identifier)
+  /// and set \arg Res to the identifier contents.
+  bool ParseIdentifier(StringRef &Res);
+
+  // Directive Parsing.
+  bool ParseDirectiveAscii(bool ZeroTerminated); // ".ascii", ".asciiz"
+  bool ParseDirectiveValue(unsigned Size); // ".byte", ".long", ...
+  bool ParseDirectiveFill(); // ".fill"
+  bool ParseDirectiveSpace(); // ".space"
+  bool ParseDirectiveSet(); // ".set"
+  bool ParseDirectiveOrg(); // ".org"
+  // ".align{,32}", ".p2align{,w,l}"
+  bool ParseDirectiveAlign(bool IsPow2, unsigned ValueSize);
+
+  /// ParseDirectiveSymbolAttribute - Parse a directive like ".globl" which
+  /// accepts a single symbol (which should be a label or an external).
+  bool ParseDirectiveSymbolAttribute(MCSymbolAttr Attr);
+  bool ParseDirectiveELFType(); // ELF specific ".type"
+
+  bool ParseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"
+
+  bool ParseDirectiveAbort(); // ".abort"
+  bool ParseDirectiveInclude(); // ".include"
+
+  bool ParseDirectiveIf(SMLoc DirectiveLoc); // ".if"
+  bool ParseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif"
+  bool ParseDirectiveElse(SMLoc DirectiveLoc); // ".else"
+  bool ParseDirectiveEndIf(SMLoc DirectiveLoc); // ".endif"
+
+  /// ParseEscapedString - Parse the current token as a string which may include
+  /// escaped characters and return the string contents.
+  bool ParseEscapedString(std::string &Data);
+};
+
 /// \brief Generic implementations of directive handling, etc. which is shared
 /// (or the default, at least) for all assembler parsers.
 class GenericAsmParser : public MCAsmParserExtension {
+  template<bool (GenericAsmParser::*Handler)(StringRef, SMLoc)>
+  void AddDirectiveHandler(StringRef Directive) {
+    getParser().AddDirectiveHandler(this, Directive,
+                                    HandleDirective<GenericAsmParser, Handler>);
+  }
+
 public:
   GenericAsmParser() {}
 
+  AsmParser &getParser() {
+    return (AsmParser&) this->MCAsmParserExtension::getParser();
+  }
+
   virtual void Initialize(MCAsmParser &Parser) {
     // Call the base implementation.
     this->MCAsmParserExtension::Initialize(Parser);
 
     // Debugging directives.
-    Parser.AddDirectiveHandler(this, ".file", MCAsmParser::DirectiveHandler(
-                                 &GenericAsmParser::ParseDirectiveFile));
-    Parser.AddDirectiveHandler(this, ".line", MCAsmParser::DirectiveHandler(
-                                 &GenericAsmParser::ParseDirectiveLine));
-    Parser.AddDirectiveHandler(this, ".loc", MCAsmParser::DirectiveHandler(
-                                 &GenericAsmParser::ParseDirectiveLoc));
+    AddDirectiveHandler<&GenericAsmParser::ParseDirectiveFile>(".file");
+    AddDirectiveHandler<&GenericAsmParser::ParseDirectiveLine>(".line");
+    AddDirectiveHandler<&GenericAsmParser::ParseDirectiveLoc>(".loc");
+
+    // Macro directives.
+    AddDirectiveHandler<&GenericAsmParser::ParseDirectiveMacrosOnOff>(
+      ".macros_on");
+    AddDirectiveHandler<&GenericAsmParser::ParseDirectiveMacrosOnOff>(
+      ".macros_off");
+    AddDirectiveHandler<&GenericAsmParser::ParseDirectiveMacro>(".macro");
+    AddDirectiveHandler<&GenericAsmParser::ParseDirectiveEndMacro>(".endm");
+    AddDirectiveHandler<&GenericAsmParser::ParseDirectiveEndMacro>(".endmacro");
   }
 
-  bool ParseDirectiveFile(StringRef, SMLoc DirectiveLoc); // ".file"
-  bool ParseDirectiveLine(StringRef, SMLoc DirectiveLoc); // ".line"
-  bool ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc); // ".loc"
+  bool ParseDirectiveFile(StringRef, SMLoc DirectiveLoc);
+  bool ParseDirectiveLine(StringRef, SMLoc DirectiveLoc);
+  bool ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc);
+
+  bool ParseDirectiveMacrosOnOff(StringRef, SMLoc DirectiveLoc);
+  bool ParseDirectiveMacro(StringRef, SMLoc DirectiveLoc);
+  bool ParseDirectiveEndMacro(StringRef, SMLoc DirectiveLoc);
 };
 
 }
@@ -69,7 +259,7 @@ AsmParser::AsmParser(const Target &T, SourceMgr &_SM, MCContext &_Ctx,
                      MCStreamer &_Out, const MCAsmInfo &_MAI)
   : Lexer(_MAI), Ctx(_Ctx), Out(_Out), SrcMgr(_SM),
     GenericParser(new GenericAsmParser), PlatformParser(0),
-    TargetParser(0), CurBuffer(0) {
+    CurBuffer(0), MacrosEnabled(true) {
   Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
 
   // Initialize the generic parser.
@@ -89,22 +279,33 @@ AsmParser::AsmParser(const Target &T, SourceMgr &_SM, MCContext &_Ctx, } AsmParser::~AsmParser() { + assert(ActiveMacros.empty() && "Unexpected active macro instantiation!"); + + // Destroy any macros. + for (StringMap<Macro*>::iterator it = MacroMap.begin(), + ie = MacroMap.end(); it != ie; ++it) + delete it->getValue(); + delete PlatformParser; delete GenericParser; } -void AsmParser::setTargetParser(TargetAsmParser &P) { - assert(!TargetParser && "Target parser is already initialized!"); - TargetParser = &P; - TargetParser->Initialize(*this); +void AsmParser::PrintMacroInstantiations() { + // Print the active macro instantiation stack. + for (std::vector<MacroInstantiation*>::const_reverse_iterator + it = ActiveMacros.rbegin(), ie = ActiveMacros.rend(); it != ie; ++it) + PrintMessage((*it)->InstantiationLoc, "while in macro instantiation", + "note"); } void AsmParser::Warning(SMLoc L, const Twine &Msg) { PrintMessage(L, Msg.str(), "warning"); + PrintMacroInstantiations(); } bool AsmParser::Error(SMLoc L, const Twine &Msg) { PrintMessage(L, Msg.str(), "error"); + PrintMacroInstantiations(); return true; } @@ -124,7 +325,12 @@ bool AsmParser::EnterIncludeFile(const std::string &Filename) { return false; } - + +void AsmParser::JumpToLoc(SMLoc Loc) { + CurBuffer = SrcMgr.FindBufferContainingLoc(Loc); + Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer), Loc.getPointer()); +} + const AsmToken &AsmParser::Lex() { const AsmToken *tok = &Lexer.Lex(); @@ -133,15 +339,13 @@ const AsmToken &AsmParser::Lex() { // include stack. SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); if (ParentIncludeLoc != SMLoc()) { - CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); - Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer), - ParentIncludeLoc.getPointer()); + JumpToLoc(ParentIncludeLoc); tok = &Lexer.Lex(); } } if (tok->is(AsmToken::Error)) - PrintMessage(Lexer.getErrLoc(), Lexer.getErr(), "error"); + Error(Lexer.getErrLoc(), Lexer.getErr()); return *tok; } @@ -174,6 +378,16 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { if (TheCondState.TheCond != StartingCondState.TheCond || TheCondState.Ignore != StartingCondState.Ignore) return TokError("unmatched .ifs or .elses"); + + // Check to see there are no empty DwarfFile slots. + const std::vector<MCDwarfFile *> &MCDwarfFiles = + getContext().getMCDwarfFiles(); + for (unsigned i = 1; i < MCDwarfFiles.size(); i++) { + if (!MCDwarfFiles[i]){ + TokError("unassigned file number: " + Twine(i) + " for .file directives"); + HadError = true; + } + } // Finalize the output stream if there are no errors and if the client wants // us to. @@ -194,6 +408,16 @@ void AsmParser::EatToEndOfStatement() { Lex(); } +StringRef AsmParser::ParseStringToEndOfStatement() { + const char *Start = getTok().getLoc().getPointer(); + + while (Lexer.isNot(AsmToken::EndOfStatement) && + Lexer.isNot(AsmToken::Eof)) + Lex(); + + const char *End = getTok().getLoc().getPointer(); + return StringRef(Start, End - Start); +} /// ParseParenExpr - Parse a paren expression and return it. /// NOTE: This assumes the leading '(' has already been consumed. @@ -225,10 +449,17 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { return true; Res = MCUnaryExpr::CreateLNot(Res, getContext()); return false; + case AsmToken::Dollar: case AsmToken::String: case AsmToken::Identifier: { + EndLoc = Lexer.getLoc(); + + StringRef Identifier; + if (ParseIdentifier(Identifier)) + return false; + // This is a symbol reference. 
- std::pair<StringRef, StringRef> Split = getTok().getIdentifier().split('@'); + std::pair<StringRef, StringRef> Split = Identifier.split('@'); MCSymbol *Sym = getContext().GetOrCreateSymbol(Split.first); // Mark the symbol as used in an expression. @@ -236,12 +467,9 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { // Lookup the symbol variant if used. MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; - if (Split.first.size() != getTok().getIdentifier().size()) + if (Split.first.size() != Identifier.size()) Variant = MCSymbolRefExpr::getVariantKindForName(Split.second); - EndLoc = Lexer.getLoc(); - Lex(); // Eat identifier. - // If this is an absolute variable reference, substitute it now to preserve // semantics in the face of reassignment. if (Sym->isVariable() && isa<MCConstantExpr>(Sym->getVariableValue())) { @@ -568,7 +796,12 @@ bool AsmParser::ParseStatement() { default: // Normal instruction or directive. break; } - + + // If macros are enabled, check to see if this is a macro instantiation. + if (MacrosEnabled) + if (const Macro *M = MacroMap.lookup(IDVal)) + return HandleMacroEntry(IDVal, IDLoc, M); + // Otherwise, we have a normal instruction or directive. if (IDVal[0] == '.') { // Assembler features @@ -591,11 +824,14 @@ bool AsmParser::ParseStatement() { if (IDVal == ".quad") return ParseDirectiveValue(8); - // FIXME: Target hooks for IsPow2. - if (IDVal == ".align") - return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/1); - if (IDVal == ".align32") - return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4); + if (IDVal == ".align") { + bool IsPow2 = !getContext().getAsmInfo().getAlignmentIsInBytes(); + return ParseDirectiveAlign(IsPow2, /*ExprSize=*/1); + } + if (IDVal == ".align32") { + bool IsPow2 = !getContext().getAsmInfo().getAlignmentIsInBytes(); + return ParseDirectiveAlign(IsPow2, /*ExprSize=*/4); + } if (IDVal == ".balign") return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/1); if (IDVal == ".balignw") @@ -662,7 +898,7 @@ bool AsmParser::ParseStatement() { std::pair<MCAsmParserExtension*, DirectiveHandler> Handler = DirectiveMap.lookup(IDVal); if (Handler.first) - return (Handler.first->*Handler.second)(IDVal, IDLoc); + return (*Handler.second)(Handler.first, IDVal, IDLoc); // Target hook for parsing target specific directives. if (!getTargetParser().ParseDirective(ID)) @@ -684,20 +920,29 @@ bool AsmParser::ParseStatement() { if (!HadError && Lexer.isNot(AsmToken::EndOfStatement)) HadError = TokError("unexpected token in argument list"); + // Dump the parsed representation, if requested. + if (getShowParsedOperands()) { + SmallString<256> Str; + raw_svector_ostream OS(Str); + OS << "parsed instruction: ["; + for (unsigned i = 0; i != ParsedOperands.size(); ++i) { + if (i != 0) + OS << ", "; + ParsedOperands[i]->dump(OS); + } + OS << "]"; + + PrintMessage(IDLoc, OS.str(), "note"); + } + // If parsing succeeded, match the instruction. if (!HadError) { MCInst Inst; - if (!getTargetParser().MatchInstruction(ParsedOperands, Inst)) { + if (!getTargetParser().MatchInstruction(IDLoc, ParsedOperands, Inst)) { // Emit the instruction on success. Out.EmitInstruction(Inst); - } else { - // Otherwise emit a diagnostic about the match failure and set the error - // flag. - // - // FIXME: We should give nicer diagnostics about the exact failure. - Error(IDLoc, "unrecognized instruction"); + } else HadError = true; - } } // If there was no error, consume the end-of-statement token. 
Otherwise this
@@ -712,6 +957,132 @@ bool AsmParser::ParseStatement() {
   return HadError;
 }

+MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
+                                       const std::vector<std::vector<AsmToken> > &A)
+  : TheMacro(M), InstantiationLoc(IL), ExitLoc(EL)
+{
+  // Macro instantiation is lexical, unfortunately. We construct a new buffer
+  // to hold the macro body with substitutions.
+  SmallString<256> Buf;
+  raw_svector_ostream OS(Buf);
+
+  StringRef Body = M->Body;
+  while (!Body.empty()) {
+    // Scan for the next substitution.
+    std::size_t End = Body.size(), Pos = 0;
+    for (; Pos != End; ++Pos) {
+      // Check for a substitution or escape.
+      if (Body[Pos] != '$' || Pos + 1 == End)
+        continue;
+
+      char Next = Body[Pos + 1];
+      if (Next == '$' || Next == 'n' || isdigit(Next))
+        break;
+    }
+
+    // Add the prefix.
+    OS << Body.slice(0, Pos);
+
+    // Check if we reached the end.
+    if (Pos == End)
+      break;
+
+    switch (Body[Pos+1]) {
+    // $$ => $
+    case '$':
+      OS << '$';
+      break;
+
+    // $n => number of arguments
+    case 'n':
+      OS << A.size();
+      break;
+
+    // $[0-9] => argument
+    default: {
+      // Missing arguments are ignored.
+      unsigned Index = Body[Pos+1] - '0';
+      if (Index >= A.size())
+        break;
+
+      // Otherwise substitute with the token values, with spaces eliminated.
+      for (std::vector<AsmToken>::const_iterator it = A[Index].begin(),
+             ie = A[Index].end(); it != ie; ++it)
+        OS << it->getString();
+      break;
+    }
+    }
+
+    // Update the scan point.
+    Body = Body.substr(Pos + 2);
+  }
+
+  // We include the .endmacro in the buffer as our cue to exit the macro
+  // instantiation.
+  OS << ".endmacro\n";
+
+  Instantiation = MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+}
+
+bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc,
+                                 const Macro *M) {
+  // Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate
+  // this, although we should protect against infinite loops.
+  if (ActiveMacros.size() == 20)
+    return TokError("macros cannot be nested more than 20 levels deep");
+
+  // Parse the macro instantiation arguments.
+  std::vector<std::vector<AsmToken> > MacroArguments;
+  MacroArguments.push_back(std::vector<AsmToken>());
+  unsigned ParenLevel = 0;
+  for (;;) {
+    if (Lexer.is(AsmToken::Eof))
+      return TokError("unexpected token in macro instantiation");
+    if (Lexer.is(AsmToken::EndOfStatement))
+      break;
+
+    // If we aren't inside parentheses and this is a comma, start a new token
+    // list.
+    if (ParenLevel == 0 && Lexer.is(AsmToken::Comma)) {
+      MacroArguments.push_back(std::vector<AsmToken>());
+    } else {
+      // Adjust the current parentheses level.
+      if (Lexer.is(AsmToken::LParen))
+        ++ParenLevel;
+      else if (Lexer.is(AsmToken::RParen) && ParenLevel)
+        --ParenLevel;
+
+      // Append the token to the current argument list.
+      MacroArguments.back().push_back(getTok());
+    }
+    Lex();
+  }
+
+  // Create the macro instantiation object and add to the current macro
+  // instantiation stack.
+  MacroInstantiation *MI = new MacroInstantiation(M, NameLoc,
+                                                  getTok().getLoc(),
+                                                  MacroArguments);
+  ActiveMacros.push_back(MI);
+
+  // Jump to the macro instantiation and prime the lexer.
+  CurBuffer = SrcMgr.AddNewSourceBuffer(MI->Instantiation, SMLoc());
+  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
+  Lex();
+
+  return false;
+}
+
+void AsmParser::HandleMacroExit() {
+  // Jump to the EndOfStatement we should return to, and consume it.
+  JumpToLoc(ActiveMacros.back()->ExitLoc);
+  Lex();
+
+  // Pop the instantiation entry.
+  delete ActiveMacros.back();
+  ActiveMacros.pop_back();
+}
+
 bool AsmParser::ParseAssignment(StringRef Name) {
   // FIXME: Use better location, we should use proper tokens.
   SMLoc EqualLoc = Lexer.getLoc();

@@ -760,6 +1131,30 @@ bool AsmParser::ParseAssignment(StringRef Name) {
 ///  ::= identifier
 ///  ::= string
 bool AsmParser::ParseIdentifier(StringRef &Res) {
+  // The assembler has relaxed rules for accepting identifiers; in particular,
+  // we allow things like '.globl $foo', which would normally be separate
+  // tokens. At this level, we have already lexed, so we cannot (currently)
+  // handle this as a context-dependent token; instead we detect adjacent
+  // tokens and return the combined identifier.
+  if (Lexer.is(AsmToken::Dollar)) {
+    SMLoc DollarLoc = getLexer().getLoc();
+
+    // Consume the dollar sign, and check for a following identifier.
+    Lex();
+    if (Lexer.isNot(AsmToken::Identifier))
+      return true;
+
+    // We have a '$' followed by an identifier; make sure they are adjacent.
+    if (DollarLoc.getPointer() + 1 != getTok().getLoc().getPointer())
+      return true;
+
+    // Construct the joined identifier and consume the token.
+    Res = StringRef(DollarLoc.getPointer(),
+                    getTok().getIdentifier().size() + 1);
+    Lex();
+    return false;
+  }
+
   if (Lexer.isNot(AsmToken::Identifier) &&
       Lexer.isNot(AsmToken::String))
     return true;

@@ -1081,13 +1476,14 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
   bool UseCodeAlign = false;
   if (const MCSectionMachO *S = dyn_cast<MCSectionMachO>(
         getStreamer().getCurrentSection()))
-    UseCodeAlign = S->hasAttribute(MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS);
+    UseCodeAlign = S->hasAttribute(MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS);
   if ((!HasFillExpr || Lexer.getMAI().getTextAlignFillValue() == FillExpr) &&
       ValueSize == 1 && UseCodeAlign) {
     getStreamer().EmitCodeAlignment(Alignment, MaxBytesToFill);
   } else {
     // FIXME: Target specific behavior about how the "extra" bytes are filled.
-    getStreamer().EmitValueToAlignment(Alignment, FillExpr, ValueSize, MaxBytesToFill);
+    getStreamer().EmitValueToAlignment(Alignment, FillExpr, ValueSize,
+                                       MaxBytesToFill);
   }

   return false;

@@ -1238,31 +1634,22 @@ bool AsmParser::ParseDirectiveComm(bool IsLocal) {
 }

 /// ParseDirectiveAbort
-///  ::= .abort [ "abort_string" ]
+///  ::= .abort [... message ...]
 bool AsmParser::ParseDirectiveAbort() {
   // FIXME: Use loc from directive.
   SMLoc Loc = getLexer().getLoc();

-  StringRef Str = "";
-  if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    if (getLexer().isNot(AsmToken::String))
-      return TokError("expected string in '.abort' directive");
-
-    Str = getTok().getString();
-
-    Lex();
-  }
-
+  StringRef Str = ParseStringToEndOfStatement();
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.abort' directive");
-
+
   Lex();

-  // FIXME: Handle here.
   if (Str.empty())
     Error(Loc, ".abort detected. Assembly stopping.");
   else
     Error(Loc, ".abort '" + Str + "' detected. Assembly stopping.");
+  // FIXME: Actually abort assembly here.

   return false;
 }

@@ -1286,9 +1673,7 @@ bool AsmParser::ParseDirectiveInclude() {

   // Attempt to switch the lexer to the included file before consuming the end
   // of statement to avoid losing it when we switch.
   if (EnterIncludeFile(Filename)) {
-    PrintMessage(IncludeLoc,
-                 "Could not find include file '" + Filename + "'",
-                 "error");
+    Error(IncludeLoc, "Could not find include file '" + Filename + "'");
     return true;
   }

@@ -1401,6 +1786,7 @@ bool AsmParser::ParseDirectiveEndIf(SMLoc DirectiveLoc) {
 bool GenericAsmParser::ParseDirectiveFile(StringRef, SMLoc DirectiveLoc) {
   // FIXME: I'm not sure what this is.
   int64_t FileNumber = -1;
+  SMLoc FileNumberLoc = getLexer().getLoc();
   if (getLexer().is(AsmToken::Integer)) {
     FileNumber = getTok().getIntVal();
     Lex();

@@ -1421,8 +1807,11 @@ bool GenericAsmParser::ParseDirectiveFile(StringRef, SMLoc DirectiveLoc) {

   if (FileNumber == -1)
     getStreamer().EmitFileDirective(Filename);
-  else
+  else {
+    if (getContext().GetDwarfFile(Filename, FileNumber) == 0)
+      Error(FileNumberLoc, "file number already allocated");
     getStreamer().EmitDwarfFileDirective(FileNumber, Filename);
+  }

   return false;
 }

@@ -1449,40 +1838,193 @@ bool GenericAsmParser::ParseDirectiveLine(StringRef, SMLoc DirectiveLoc) {

 /// ParseDirectiveLoc
-///  ::= .loc number [number [number]]
+///  ::= .loc FileNumber [LineNumber] [ColumnPos] [basic_block] [prologue_end]
+///      [epilogue_begin] [is_stmt VALUE] [isa VALUE]
+/// The first number is a file number, which must have been previously assigned
+/// with a .file directive; the second number is the line number; and
+/// optionally the third number is a column position (zero if not specified).
+/// The remaining optional items are .loc sub-directives.
 bool GenericAsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) {
+
   if (getLexer().isNot(AsmToken::Integer))
     return TokError("unexpected token in '.loc' directive");
-
-  // FIXME: What are these fields?
   int64_t FileNumber = getTok().getIntVal();
-  (void) FileNumber;
-  // FIXME: Validate file.
-
+  if (FileNumber < 1)
+    return TokError("file number less than one in '.loc' directive");
+  if (!getContext().ValidateDwarfFileNumber(FileNumber))
+    return TokError("unassigned file number in '.loc' directive");
   Lex();

-  if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    if (getLexer().isNot(AsmToken::Integer))
-      return TokError("unexpected token in '.loc' directive");
-    int64_t Param2 = getTok().getIntVal();
-    (void) Param2;
+  int64_t LineNumber = 0;
+  if (getLexer().is(AsmToken::Integer)) {
+    LineNumber = getTok().getIntVal();
+    if (LineNumber < 1)
+      return TokError("line number less than one in '.loc' directive");
     Lex();
+  }

-    if (getLexer().isNot(AsmToken::EndOfStatement)) {
-      if (getLexer().isNot(AsmToken::Integer))
+  int64_t ColumnPos = 0;
+  if (getLexer().is(AsmToken::Integer)) {
+    ColumnPos = getTok().getIntVal();
+    if (ColumnPos < 0)
+      return TokError("column position less than zero in '.loc' directive");
+    Lex();
+  }
+
+  unsigned Flags = 0;
+  unsigned Isa = 0;
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      StringRef Name;
+      SMLoc Loc = getTok().getLoc();
+      if (getParser().ParseIdentifier(Name))
         return TokError("unexpected token in '.loc' directive");
-      int64_t Param3 = getTok().getIntVal();
-      (void) Param3;
-      Lex();

+      if (Name == "basic_block")
+        Flags |= DWARF2_FLAG_BASIC_BLOCK;
+      else if (Name == "prologue_end")
+        Flags |= DWARF2_FLAG_PROLOGUE_END;
+      else if (Name == "epilogue_begin")
+        Flags |= DWARF2_FLAG_EPILOGUE_BEGIN;
+      else if (Name == "is_stmt") {
+        SMLoc Loc = getTok().getLoc();
+        const MCExpr *Value;
+        if (getParser().ParseExpression(Value))
+          return true;
+        // The expression must be the constant 0 or 1.
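+        // (Illustrative, not from this patch: ".loc 1 7 2 is_stmt 1" reaches
+        // this point with Value folding to the constant 1, which sets
+        // DWARF2_FLAG_IS_STMT.)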
+        if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
+          int Value = MCE->getValue();
+          if (Value == 0)
+            Flags &= ~DWARF2_FLAG_IS_STMT;
+          else if (Value == 1)
+            Flags |= DWARF2_FLAG_IS_STMT;
+          else
+            return Error(Loc, "is_stmt value not 0 or 1");
+        }
+        else {
+          return Error(Loc, "is_stmt value not the constant value of 0 or 1");
+        }
+      }
+      else if (Name == "isa") {
+        SMLoc Loc = getTok().getLoc();
+        const MCExpr *Value;
+        if (getParser().ParseExpression(Value))
+          return true;
+        // The expression must be a constant greater than or equal to 0.
+        if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
+          int Value = MCE->getValue();
+          if (Value < 0)
+            return Error(Loc, "isa number less than zero");
+          Isa = Value;
+        }
+        else {
+          return Error(Loc, "isa number not a constant value");
+        }
+      }
+      else {
+        return Error(Loc, "unknown sub-directive in '.loc' directive");
+      }

-      // FIXME: Do something with the .loc.
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
     }
   }

+  getContext().setCurrentDwarfLoc(FileNumber, LineNumber, ColumnPos, Flags,Isa);
+
+  return false;
+}
+
+/// ParseDirectiveMacrosOnOff
+/// ::= .macros_on
+/// ::= .macros_off
+bool GenericAsmParser::ParseDirectiveMacrosOnOff(StringRef Directive,
+                                                 SMLoc DirectiveLoc) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
-    return TokError("unexpected token in '.file' directive");
+    return Error(getLexer().getLoc(),
+                 "unexpected token in '" + Directive + "' directive");
+
+  getParser().MacrosEnabled = Directive == ".macros_on";

   return false;
 }

+/// ParseDirectiveMacro
+/// ::= .macro name
+bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive,
+                                           SMLoc DirectiveLoc) {
+  StringRef Name;
+  if (getParser().ParseIdentifier(Name))
+    return TokError("expected identifier in directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.macro' directive");
+
+  // Eat the end of statement.
+  Lex();
+
+  AsmToken EndToken, StartToken = getTok();
+
+  // Lex the macro definition.
+  for (;;) {
+    // Check whether we have reached the end of the file.
+    if (getLexer().is(AsmToken::Eof))
+      return Error(DirectiveLoc, "no matching '.endmacro' in definition");
+
+    // Otherwise, check whether we have reached the .endmacro.
+    if (getLexer().is(AsmToken::Identifier) &&
+        (getTok().getIdentifier() == ".endm" ||
+         getTok().getIdentifier() == ".endmacro")) {
+      EndToken = getTok();
+      Lex();
+      if (getLexer().isNot(AsmToken::EndOfStatement))
+        return TokError("unexpected token in '" + EndToken.getIdentifier() +
+                        "' directive");
+      break;
+    }
+
+    // Otherwise, scan to the end of the statement.
+    getParser().EatToEndOfStatement();
+  }
+
+  if (getParser().MacroMap.lookup(Name)) {
+    return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
+  }
+
+  const char *BodyStart = StartToken.getLoc().getPointer();
+  const char *BodyEnd = EndToken.getLoc().getPointer();
+  StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
+  getParser().MacroMap[Name] = new Macro(Name, Body);
+  return false;
+}
+
+/// ParseDirectiveEndMacro
+/// ::= .endm
+/// ::= .endmacro
+bool GenericAsmParser::ParseDirectiveEndMacro(StringRef Directive,
+                                              SMLoc DirectiveLoc) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '" + Directive + "' directive");
+
+  // If we are inside a macro instantiation, terminate the current
+  // instantiation.
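+  // (Illustrative: after ".macro double" with body "addl $0, $0" and ".endm",
+  // the statement "double %eax" expands to "addl %eax, %eax", and the
+  // ".endmacro" appended to the instantiation buffer funnels control here.)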
+ if (!getParser().ActiveMacros.empty()) { + getParser().HandleMacroExit(); + return false; + } + + // Otherwise, this .endmacro is a stray entry in the file; well formed + // .endmacro directives are handled during the macro definition parsing. + return TokError("unexpected '" + Directive + "' in file, " + "no current macro definition"); +} + +/// \brief Create an MCAsmParser instance. +MCAsmParser *llvm::createMCAsmParser(const Target &T, SourceMgr &SM, + MCContext &C, MCStreamer &Out, + const MCAsmInfo &MAI) { + return new AsmParser(T, SM, C, Out, MAI); +} diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp index 7d8639ea4d81f..54ddb449b2859 100644 --- a/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/lib/MC/MCParser/DarwinAsmParser.cpp @@ -25,6 +25,12 @@ namespace { /// \brief Implementation of directive handling which is shared across all /// Darwin targets. class DarwinAsmParser : public MCAsmParserExtension { + template<bool (DarwinAsmParser::*Handler)(StringRef, SMLoc)> + void AddDirectiveHandler(StringRef Directive) { + getParser().AddDirectiveHandler(this, Directive, + HandleDirective<DarwinAsmParser, Handler>); + } + bool ParseSectionSwitch(const char *Segment, const char *Section, unsigned TAA = 0, unsigned ImplicitAlign = 0, unsigned StubSize = 0); @@ -36,168 +42,70 @@ public: // Call the base implementation. this->MCAsmParserExtension::Initialize(Parser); - Parser.AddDirectiveHandler(this, ".desc", MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseDirectiveDesc)); - Parser.AddDirectiveHandler(this, ".lsym", MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseDirectiveLsym)); - Parser.AddDirectiveHandler(this, ".subsections_via_symbols", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseDirectiveSubsectionsViaSymbols)); - Parser.AddDirectiveHandler(this, ".dump", MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseDirectiveDumpOrLoad)); - Parser.AddDirectiveHandler(this, ".load", MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseDirectiveDumpOrLoad)); - Parser.AddDirectiveHandler(this, ".section", MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseDirectiveSection)); - Parser.AddDirectiveHandler(this, ".secure_log_unique", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseDirectiveSecureLogUnique)); - Parser.AddDirectiveHandler(this, ".secure_log_reset", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseDirectiveSecureLogReset)); - Parser.AddDirectiveHandler(this, ".tbss", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseDirectiveTBSS)); - Parser.AddDirectiveHandler(this, ".zerofill", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseDirectiveZerofill)); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDesc>(".desc"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveLsym>(".lsym"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveSubsectionsViaSymbols>( + ".subsections_via_symbols"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDumpOrLoad>(".dump"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDumpOrLoad>(".load"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveSection>(".section"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveSecureLogUnique>( + ".secure_log_unique"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveSecureLogReset>( + ".secure_log_reset"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveTBSS>(".tbss"); + AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveZerofill>(".zerofill"); // Special section 
directives. - Parser.AddDirectiveHandler(this, ".const", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveConst)); - Parser.AddDirectiveHandler(this, ".const_data", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveConstData)); - Parser.AddDirectiveHandler(this, ".constructor", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveConstructor)); - Parser.AddDirectiveHandler(this, ".cstring", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveCString)); - Parser.AddDirectiveHandler(this, ".data", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveData)); - Parser.AddDirectiveHandler(this, ".destructor", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveDestructor)); - Parser.AddDirectiveHandler(this, ".dyld", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveDyld)); - Parser.AddDirectiveHandler(this, ".fvmlib_init0", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveFVMLibInit0)); - Parser.AddDirectiveHandler(this, ".fvmlib_init1", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveFVMLibInit1)); - Parser.AddDirectiveHandler(this, ".lazy_symbol_pointer", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveLazySymbolPointers)); - Parser.AddDirectiveHandler(this, ".literal16", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveLiteral16)); - Parser.AddDirectiveHandler(this, ".literal4", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveLiteral4)); - Parser.AddDirectiveHandler(this, ".literal8", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveLiteral8)); - Parser.AddDirectiveHandler(this, ".mod_init_func", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveModInitFunc)); - Parser.AddDirectiveHandler(this, ".mod_term_func", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveModTermFunc)); - Parser.AddDirectiveHandler(this, ".non_lazy_symbol_pointer", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveNonLazySymbolPointers)); - Parser.AddDirectiveHandler(this, ".objc_cat_cls_meth", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCCatClsMeth)); - Parser.AddDirectiveHandler(this, ".objc_cat_inst_meth", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCCatInstMeth)); - Parser.AddDirectiveHandler(this, ".objc_category", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCCategory)); - Parser.AddDirectiveHandler(this, ".objc_class", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCClass)); - Parser.AddDirectiveHandler(this, ".objc_class_names", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCClassNames)); - Parser.AddDirectiveHandler(this, ".objc_class_vars", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCClassVars)); - Parser.AddDirectiveHandler(this, ".objc_cls_meth", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCClsMeth)); - Parser.AddDirectiveHandler(this, ".objc_cls_refs", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCClsRefs)); - Parser.AddDirectiveHandler(this, ".objc_inst_meth", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCInstMeth)); - 
Parser.AddDirectiveHandler(this, ".objc_instance_vars", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCInstanceVars)); - Parser.AddDirectiveHandler(this, ".objc_message_refs", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCMessageRefs)); - Parser.AddDirectiveHandler(this, ".objc_meta_class", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCMetaClass)); - Parser.AddDirectiveHandler(this, ".objc_meth_var_names", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCMethVarNames)); - Parser.AddDirectiveHandler(this, ".objc_meth_var_types", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCMethVarTypes)); - Parser.AddDirectiveHandler(this, ".objc_module_info", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCModuleInfo)); - Parser.AddDirectiveHandler(this, ".objc_protocol", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCProtocol)); - Parser.AddDirectiveHandler(this, ".objc_selector_strs", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCSelectorStrs)); - Parser.AddDirectiveHandler(this, ".objc_string_object", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCStringObject)); - Parser.AddDirectiveHandler(this, ".objc_symbols", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveObjCSymbols)); - Parser.AddDirectiveHandler(this, ".picsymbol_stub", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectivePICSymbolStub)); - Parser.AddDirectiveHandler(this, ".static_const", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveStaticConst)); - Parser.AddDirectiveHandler(this, ".static_data", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveStaticData)); - Parser.AddDirectiveHandler(this, ".symbol_stub", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveSymbolStub)); - Parser.AddDirectiveHandler(this, ".tdata", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveTData)); - Parser.AddDirectiveHandler(this, ".text", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveText)); - Parser.AddDirectiveHandler(this, ".thread_init_func", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveThreadInitFunc)); - Parser.AddDirectiveHandler(this, ".tlv", - MCAsmParser::DirectiveHandler( - &DarwinAsmParser::ParseSectionDirectiveTLV)); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConst>(".const"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstData>(".const_data"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstructor>(".constructor"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveCString>(".cstring"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveData>(".data"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveDestructor>(".destructor"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveDyld>(".dyld"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveFVMLibInit0>(".fvmlib_init0"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveFVMLibInit1>(".fvmlib_init1"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLazySymbolPointers>(".lazy_symbol_pointer"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLiteral16>(".literal16"); + 
AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLiteral4>(".literal4"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLiteral8>(".literal8"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveModInitFunc>(".mod_init_func"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveModTermFunc>(".mod_term_func"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveNonLazySymbolPointers>(".non_lazy_symbol_pointer"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCCatClsMeth>(".objc_cat_cls_meth"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCCatInstMeth>(".objc_cat_inst_meth"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCCategory>(".objc_category"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClass>(".objc_class"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClassNames>(".objc_class_names"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClassVars>(".objc_class_vars"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClsMeth>(".objc_cls_meth"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClsRefs>(".objc_cls_refs"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCInstMeth>(".objc_inst_meth"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCInstanceVars>(".objc_instance_vars"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCMessageRefs>(".objc_message_refs"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCMetaClass>(".objc_meta_class"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCMethVarNames>(".objc_meth_var_names"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCMethVarTypes>(".objc_meth_var_types"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCModuleInfo>(".objc_module_info"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCProtocol>(".objc_protocol"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCSelectorStrs>(".objc_selector_strs"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCStringObject>(".objc_string_object"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCSymbols>(".objc_symbols"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectivePICSymbolStub>(".picsymbol_stub"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveStaticConst>(".static_const"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveStaticData>(".static_data"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveSymbolStub>(".symbol_stub"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveTData>(".tdata"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveText>(".text"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveThreadInitFunc>(".thread_init_func"); + AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveTLV>(".tlv"); } bool ParseDirectiveDesc(StringRef, SMLoc); bool ParseDirectiveDumpOrLoad(StringRef, SMLoc); bool ParseDirectiveLsym(StringRef, SMLoc); - bool ParseDirectiveSection(); + bool ParseDirectiveSection(StringRef, SMLoc); bool ParseDirectiveSecureLogReset(StringRef, SMLoc); bool ParseDirectiveSecureLogUnique(StringRef, SMLoc); bool ParseDirectiveSubsectionsViaSymbols(StringRef, SMLoc); @@ -493,7 +401,7 @@ bool DarwinAsmParser::ParseDirectiveLsym(StringRef, SMLoc) { /// ParseDirectiveSection: /// ::= .section identifier (',' 
identifier)* -bool DarwinAsmParser::ParseDirectiveSection() { +bool DarwinAsmParser::ParseDirectiveSection(StringRef, SMLoc) { SMLoc Loc = getLexer().getLoc(); StringRef SectionName; @@ -537,28 +445,22 @@ bool DarwinAsmParser::ParseDirectiveSection() { } /// ParseDirectiveSecureLogUnique -/// ::= .secure_log_unique "log message" +/// ::= .secure_log_unique ... message ... bool DarwinAsmParser::ParseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) { - std::string LogMessage; - - if (getLexer().isNot(AsmToken::String)) - LogMessage = ""; - else{ - LogMessage = getTok().getString(); - Lex(); - } - + StringRef LogMessage = getParser().ParseStringToEndOfStatement(); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.secure_log_unique' directive"); if (getContext().getSecureLogUsed() != false) return Error(IDLoc, ".secure_log_unique specified multiple times"); - char *SecureLogFile = getContext().getSecureLogFile(); + // Get the secure log path. + const char *SecureLogFile = getContext().getSecureLogFile(); if (SecureLogFile == NULL) return Error(IDLoc, ".secure_log_unique used but AS_SECURE_LOG_FILE " "environment variable unset."); + // Open the secure log file if we haven't already. raw_ostream *OS = getContext().getSecureLog(); if (OS == NULL) { std::string Err; @@ -571,6 +473,7 @@ bool DarwinAsmParser::ParseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) { getContext().setSecureLog(OS); } + // Write the message. int CurBuf = getSourceManager().FindBufferContainingLoc(IDLoc); *OS << getSourceManager().getBufferInfo(CurBuf).Buffer->getBufferIdentifier() << ":" << getSourceManager().FindLineNumber(IDLoc, CurBuf) << ":" diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp index 7a54dd39aa479..f982fdaecb123 100644 --- a/lib/MC/MCParser/ELFAsmParser.cpp +++ b/lib/MC/MCParser/ELFAsmParser.cpp @@ -8,15 +8,24 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCParser/MCAsmParserExtension.h" -#include "llvm/MC/MCSectionELF.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/ADT/Twine.h" using namespace llvm; namespace { class ELFAsmParser : public MCAsmParserExtension { + template<bool (ELFAsmParser::*Handler)(StringRef, SMLoc)> + void AddDirectiveHandler(StringRef Directive) { + getParser().AddDirectiveHandler(this, Directive, + HandleDirective<ELFAsmParser, Handler>); + } + bool ParseSectionSwitch(StringRef Section, unsigned Type, unsigned Flags, SectionKind Kind); @@ -27,10 +36,21 @@ public: // Call the base implementation. 
this->MCAsmParserExtension::Initialize(Parser); - Parser.AddDirectiveHandler(this, ".data", MCAsmParser::DirectiveHandler( - &ELFAsmParser::ParseSectionDirectiveData)); - Parser.AddDirectiveHandler(this, ".text", MCAsmParser::DirectiveHandler( - &ELFAsmParser::ParseSectionDirectiveText)); + AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveData>(".data"); + AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveText>(".text"); + AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveBSS>(".bss"); + AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveRoData>(".rodata"); + AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTData>(".tdata"); + AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTBSS>(".tbss"); + AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveDataRel>(".data.rel"); + AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveDataRelRo>(".data.rel.ro"); + AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveDataRelRoLocal>(".data.rel.ro.local"); + AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveEhFrame>(".eh_frame"); + AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSection>(".section"); + AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSize>(".size"); + AddDirectiveHandler<&ELFAsmParser::ParseDirectiveLEB128>(".sleb128"); + AddDirectiveHandler<&ELFAsmParser::ParseDirectiveLEB128>(".uleb128"); + AddDirectiveHandler<&ELFAsmParser::ParseDirectivePrevious>(".previous"); } bool ParseSectionDirectiveData(StringRef, SMLoc) { @@ -43,6 +63,56 @@ public: MCSectionELF::SHF_EXECINSTR | MCSectionELF::SHF_ALLOC, SectionKind::getText()); } + bool ParseSectionDirectiveBSS(StringRef, SMLoc) { + return ParseSectionSwitch(".bss", MCSectionELF::SHT_NOBITS, + MCSectionELF::SHF_WRITE | + MCSectionELF::SHF_ALLOC, SectionKind::getBSS()); + } + bool ParseSectionDirectiveRoData(StringRef, SMLoc) { + return ParseSectionSwitch(".rodata", MCSectionELF::SHT_PROGBITS, + MCSectionELF::SHF_ALLOC, + SectionKind::getReadOnly()); + } + bool ParseSectionDirectiveTData(StringRef, SMLoc) { + return ParseSectionSwitch(".tdata", MCSectionELF::SHT_PROGBITS, + MCSectionELF::SHF_ALLOC | + MCSectionELF::SHF_TLS | MCSectionELF::SHF_WRITE, + SectionKind::getThreadData()); + } + bool ParseSectionDirectiveTBSS(StringRef, SMLoc) { + return ParseSectionSwitch(".tbss", MCSectionELF::SHT_NOBITS, + MCSectionELF::SHF_ALLOC | + MCSectionELF::SHF_TLS | MCSectionELF::SHF_WRITE, + SectionKind::getThreadBSS()); + } + bool ParseSectionDirectiveDataRel(StringRef, SMLoc) { + return ParseSectionSwitch(".data.rel", MCSectionELF::SHT_PROGBITS, + MCSectionELF::SHF_ALLOC | + MCSectionELF::SHF_WRITE, + SectionKind::getDataRel()); + } + bool ParseSectionDirectiveDataRelRo(StringRef, SMLoc) { + return ParseSectionSwitch(".data.rel.ro", MCSectionELF::SHT_PROGBITS, + MCSectionELF::SHF_ALLOC | + MCSectionELF::SHF_WRITE, + SectionKind::getReadOnlyWithRel()); + } + bool ParseSectionDirectiveDataRelRoLocal(StringRef, SMLoc) { + return ParseSectionSwitch(".data.rel.ro.local", MCSectionELF::SHT_PROGBITS, + MCSectionELF::SHF_ALLOC | + MCSectionELF::SHF_WRITE, + SectionKind::getReadOnlyWithRelLocal()); + } + bool ParseSectionDirectiveEhFrame(StringRef, SMLoc) { + return ParseSectionSwitch(".eh_frame", MCSectionELF::SHT_PROGBITS, + MCSectionELF::SHF_ALLOC | + MCSectionELF::SHF_WRITE, + SectionKind::getDataRel()); + } + bool ParseDirectiveLEB128(StringRef, SMLoc); + bool ParseDirectiveSection(StringRef, SMLoc); + bool ParseDirectiveSize(StringRef, SMLoc); + bool ParseDirectivePrevious(StringRef, SMLoc); }; } @@ -59,6 
+129,159 @@ bool ELFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Type,
   return false;
 }

+bool ELFAsmParser::ParseDirectiveSize(StringRef, SMLoc) {
+  StringRef Name;
+  if (getParser().ParseIdentifier(Name))
+    return TokError("expected identifier in directive");
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  const MCExpr *Expr;
+  if (getParser().ParseExpression(Expr))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  getStreamer().EmitELFSize(Sym, Expr);
+  return false;
+}
+
+// FIXME: This is a work in progress.
+bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc) {
+  StringRef SectionName;
+  // FIXME: This doesn't parse section names like ".note.GNU-stack" correctly.
+  if (getParser().ParseIdentifier(SectionName))
+    return TokError("expected identifier in directive");
+
+  std::string FlagsStr;
+  StringRef TypeName;
+  int64_t Size = 0;
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+
+    if (getLexer().isNot(AsmToken::String))
+      return TokError("expected string in directive");
+
+    FlagsStr = getTok().getStringContents();
+    Lex();
+
+    AsmToken::TokenKind TypeStartToken;
+    if (getContext().getAsmInfo().getCommentString()[0] == '@')
+      TypeStartToken = AsmToken::Percent;
+    else
+      TypeStartToken = AsmToken::At;
+
+    if (getLexer().is(AsmToken::Comma)) {
+      Lex();
+      if (getLexer().is(TypeStartToken)) {
+        Lex();
+        if (getParser().ParseIdentifier(TypeName))
+          return TokError("expected identifier in directive");
+
+        if (getLexer().is(AsmToken::Comma)) {
+          Lex();
+
+          if (getParser().ParseAbsoluteExpression(Size))
+            return true;
+
+          if (Size <= 0)
+            return TokError("section size must be positive");
+        }
+      }
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  unsigned Flags = 0;
+  for (unsigned i = 0; i < FlagsStr.size(); i++) {
+    switch (FlagsStr[i]) {
+    case 'a':
+      Flags |= MCSectionELF::SHF_ALLOC;
+      break;
+    case 'x':
+      Flags |= MCSectionELF::SHF_EXECINSTR;
+      break;
+    case 'w':
+      Flags |= MCSectionELF::SHF_WRITE;
+      break;
+    case 'M':
+      Flags |= MCSectionELF::SHF_MERGE;
+      break;
+    case 'S':
+      Flags |= MCSectionELF::SHF_STRINGS;
+      break;
+    case 'T':
+      Flags |= MCSectionELF::SHF_TLS;
+      break;
+    case 'c':
+      Flags |= MCSectionELF::XCORE_SHF_CP_SECTION;
+      break;
+    case 'd':
+      Flags |= MCSectionELF::XCORE_SHF_DP_SECTION;
+      break;
+    default:
+      return TokError("unknown flag");
+    }
+  }
+
+  unsigned Type = MCSectionELF::SHT_NULL;
+  if (!TypeName.empty()) {
+    if (TypeName == "init_array")
+      Type = MCSectionELF::SHT_INIT_ARRAY;
+    else if (TypeName == "fini_array")
+      Type = MCSectionELF::SHT_FINI_ARRAY;
+    else if (TypeName == "preinit_array")
+      Type = MCSectionELF::SHT_PREINIT_ARRAY;
+    else if (TypeName == "nobits")
+      Type = MCSectionELF::SHT_NOBITS;
+    else if (TypeName == "progbits")
+      Type = MCSectionELF::SHT_PROGBITS;
+    else
+      return TokError("unknown section type");
+  }
+
+  SectionKind Kind = (Flags & MCSectionELF::SHF_EXECINSTR)
+                       ?
SectionKind::getText() + : SectionKind::getDataRel(); + getStreamer().SwitchSection(getContext().getELFSection(SectionName, Type, + Flags, Kind, false)); + return false; +} + +bool ELFAsmParser::ParseDirectiveLEB128(StringRef DirName, SMLoc) { + int64_t Value; + if (getParser().ParseAbsoluteExpression(Value)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + // FIXME: Add proper MC support. + if (getContext().getAsmInfo().hasLEB128()) { + if (DirName[1] == 's') + getStreamer().EmitRawText("\t.sleb128\t" + Twine(Value)); + else + getStreamer().EmitRawText("\t.uleb128\t" + Twine(Value)); + return false; + } + // FIXME: This shouldn't be an error! + return TokError("LEB128 not supported yet"); +} + +bool ELFAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) { + const MCSection *PreviousSection = getStreamer().getPreviousSection(); + if (PreviousSection != NULL) + getStreamer().SwitchSection(PreviousSection); + + return false; +} + namespace llvm { MCAsmParserExtension *createELFAsmParser() { diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp index bee30641c7fcd..70295efc613ca 100644 --- a/lib/MC/MCParser/MCAsmParser.cpp +++ b/lib/MC/MCParser/MCAsmParser.cpp @@ -12,19 +12,26 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Target/TargetAsmParser.h" using namespace llvm; -MCAsmParser::MCAsmParser() { +MCAsmParser::MCAsmParser() : TargetParser(0), ShowParsedOperands(0) { } MCAsmParser::~MCAsmParser() { } +void MCAsmParser::setTargetParser(TargetAsmParser &P) { + assert(!TargetParser && "Target parser is already initialized!"); + TargetParser = &P; + TargetParser->Initialize(*this); +} + const AsmToken &MCAsmParser::getTok() { return getLexer().getTok(); } -bool MCAsmParser::TokError(const char *Msg) { +bool MCAsmParser::TokError(const Twine &Msg) { Error(getLexer().getLoc(), Msg); return true; } @@ -34,8 +41,4 @@ bool MCAsmParser::ParseExpression(const MCExpr *&Res) { return ParseExpression(Res, L); } -/// getStartLoc - Get the location of the first token of this operand. 
-SMLoc MCParsedAsmOperand::getStartLoc() const { return SMLoc(); } -SMLoc MCParsedAsmOperand::getEndLoc() const { return SMLoc(); } - diff --git a/lib/MC/MCParser/TargetAsmParser.cpp b/lib/MC/MCParser/TargetAsmParser.cpp index 05760c96cc658..8d43c21f4bc96 100644 --- a/lib/MC/MCParser/TargetAsmParser.cpp +++ b/lib/MC/MCParser/TargetAsmParser.cpp @@ -11,7 +11,7 @@ using namespace llvm; TargetAsmParser::TargetAsmParser(const Target &T) - : TheTarget(T) + : TheTarget(T), AvailableFeatures(0) { } diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 573f2a3530ee9..3e9d02ea5ae73 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -15,7 +15,8 @@ #include <cstdlib> using namespace llvm; -MCStreamer::MCStreamer(MCContext &_Context) : Context(_Context), CurSection(0) { +MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx), CurSection(0), + PrevSection(0) { } MCStreamer::~MCStreamer() { diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index 7ca09511bdebd..cffabfadb3165 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -769,7 +769,7 @@ public: IsPCRel = 1; FixedValue = (FixupAddress - Layout.getSymbolAddress(SD_B) + Target.getConstant()); - FixedValue += 1 << Log2Size; + FixedValue += 1ULL << Log2Size; } else { FixedValue = 0; } diff --git a/lib/MC/Makefile b/lib/MC/Makefile index a661fa6f40804..bf8b7c0e78318 100644 --- a/lib/MC/Makefile +++ b/lib/MC/Makefile @@ -10,7 +10,7 @@ LEVEL = ../.. LIBRARYNAME = LLVMMC BUILD_ARCHIVE := 1 -PARALLEL_DIRS := MCParser +PARALLEL_DIRS := MCParser MCDisassembler include $(LEVEL)/Makefile.common diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp index 6804766b28957..eeb2b9675f4b5 100644 --- a/lib/MC/WinCOFFObjectWriter.cpp +++ b/lib/MC/WinCOFFObjectWriter.cpp @@ -12,41 +12,552 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "WinCOFFObjectWriter" + #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCSectionCOFF.h" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" + +#include "llvm/Support/COFF.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" + +#include "llvm/System/TimeValue.h" + +#include "../Target/X86/X86FixupKinds.h" + +#include <cstdio> + using namespace llvm; namespace { +typedef llvm::SmallString<COFF::NameSize> name; + +enum AuxiliaryType { + ATFunctionDefinition, + ATbfAndefSymbol, + ATWeakExternal, + ATFile, + ATSectionDefinition +}; + +struct AuxSymbol { + AuxiliaryType AuxType; + COFF::Auxiliary Aux; +}; + +class COFFSymbol { +public: + COFF::symbol Data; + + typedef llvm::SmallVector<AuxSymbol, 1> AuxiliarySymbols; + + name Name; + size_t Index; + AuxiliarySymbols Aux; + COFFSymbol *Other; + + MCSymbolData const *MCData; + + COFFSymbol(llvm::StringRef name, size_t index); + size_t size() const; + void set_name_offset(uint32_t Offset); +}; + +// This class contains staging data for a COFF relocation entry. 
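+// On disk each entry is 10 bytes (COFF::RelocationSize): a 4 byte virtual
+// address, a 4 byte symbol table index, and a 2 byte type; WriteRelocation
+// below emits exactly those fields.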
+struct COFFRelocation { + COFF::relocation Data; + COFFSymbol *Symb; + + COFFRelocation() : Symb(NULL) {} + static size_t size() { return COFF::RelocationSize; } +}; + +typedef std::vector<COFFRelocation> relocations; + +class COFFSection { +public: + COFF::section Header; + + std::string Name; + size_t Number; + MCSectionData const *MCData; + COFFSymbol *Symb; + relocations Relocations; + + COFFSection(llvm::StringRef name, size_t Index); + static size_t size(); +}; + +// This class holds the COFF string table. +class StringTable { + typedef llvm::StringMap<size_t> map; + map Map; + + void update_length(); +public: + std::vector<char> Data; + + StringTable(); + size_t size() const; + size_t insert(llvm::StringRef String); +}; + +class WinCOFFObjectWriter : public MCObjectWriter { +public: + + typedef std::vector<COFFSymbol*> symbols; + typedef std::vector<COFFSection*> sections; + + typedef StringMap<COFFSymbol *> name_symbol_map; + typedef StringMap<COFFSection *> name_section_map; + + typedef DenseMap<MCSymbolData const *, COFFSymbol *> symbol_map; + typedef DenseMap<MCSectionData const *, COFFSection *> section_map; + + // Root level file contents. + bool Is64Bit; + COFF::header Header; + sections Sections; + symbols Symbols; + StringTable Strings; + + // Maps used during object file creation. + section_map SectionMap; + symbol_map SymbolMap; + + WinCOFFObjectWriter(raw_ostream &OS, bool is64Bit); + ~WinCOFFObjectWriter(); + + COFFSymbol *createSymbol(llvm::StringRef Name); + COFFSection *createSection(llvm::StringRef Name); + + void InitCOFFEntity(COFFSymbol &Symbol); + void InitCOFFEntity(COFFSection &Section); + + template <typename object_t, typename list_t> + object_t *createCOFFEntity(llvm::StringRef Name, list_t &List); + + void DefineSection(MCSectionData const &SectionData); + void DefineSymbol(MCSymbolData const &SymbolData, MCAssembler &Assembler); + + bool ExportSection(COFFSection *S); + bool ExportSymbol(MCSymbolData const &SymbolData, MCAssembler &Asm); + + // Entity writing methods. + + void WriteFileHeader(const COFF::header &Header); + void WriteSymbol(const COFFSymbol *S); + void WriteAuxiliarySymbols(const COFFSymbol::AuxiliarySymbols &S); + void WriteSectionHeader(const COFF::section &S); + void WriteRelocation(const COFF::relocation &R); + + // MCObjectWriter interface implementation. 
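+  // (The assembler drives these in order: ExecutePostLayoutBinding once
+  // layout is final, RecordRelocation for each fixup that needs a relocation,
+  // and finally WriteObject to emit the file.)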
+ + void ExecutePostLayoutBinding(MCAssembler &Asm); + + void RecordRelocation(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue); + + void WriteObject(const MCAssembler &Asm, const MCAsmLayout &Layout); +}; +} + +static inline void write_uint32_le(void *Data, uint32_t const &Value) { + uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data); + Ptr[0] = (Value & 0x000000FF) >> 0; + Ptr[1] = (Value & 0x0000FF00) >> 8; + Ptr[2] = (Value & 0x00FF0000) >> 16; + Ptr[3] = (Value & 0xFF000000) >> 24; +} + +static inline void write_uint16_le(void *Data, uint16_t const &Value) { + uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data); + Ptr[0] = (Value & 0x00FF) >> 0; + Ptr[1] = (Value & 0xFF00) >> 8; +} + +static inline void write_uint8_le(void *Data, uint8_t const &Value) { + uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data); + Ptr[0] = (Value & 0xFF) >> 0; +} - class WinCOFFObjectWriter : public MCObjectWriter { - public: - WinCOFFObjectWriter(raw_ostream &OS); +//------------------------------------------------------------------------------ +// Symbol class implementation + +COFFSymbol::COFFSymbol(llvm::StringRef name, size_t index) + : Name(name.begin(), name.end()), Index(-1) + , Other(NULL), MCData(NULL) { + memset(&Data, 0, sizeof(Data)); +} + +size_t COFFSymbol::size() const { + return COFF::SymbolSize + (Data.NumberOfAuxSymbols * COFF::SymbolSize); +} + +// In the case that the name does not fit within 8 bytes, the offset +// into the string table is stored in the last 4 bytes instead, leaving +// the first 4 bytes as 0. +void COFFSymbol::set_name_offset(uint32_t Offset) { + write_uint32_le(Data.Name + 0, 0); + write_uint32_le(Data.Name + 4, Offset); +} + +//------------------------------------------------------------------------------ +// Section class implementation + +COFFSection::COFFSection(llvm::StringRef name, size_t Index) + : Name(name), Number(Index + 1) + , MCData(NULL), Symb(NULL) { + memset(&Header, 0, sizeof(Header)); +} + +size_t COFFSection::size() { + return COFF::SectionSize; +} + +//------------------------------------------------------------------------------ +// StringTable class implementation + +/// Write the length of the string table into Data. +/// The length of the string table includes uint32 length header. +void StringTable::update_length() { + write_uint32_le(&Data.front(), Data.size()); +} + +StringTable::StringTable() { + // The string table data begins with the length of the entire string table + // including the length header. Allocate space for this header. + Data.resize(4); +} + +size_t StringTable::size() const { + return Data.size(); +} + +/// Add String to the table iff it is not already there. +/// @returns the index into the string table where the string is now located. +size_t StringTable::insert(llvm::StringRef String) { + map::iterator i = Map.find(String); + + if (i != Map.end()) + return i->second; + + size_t Offset = Data.size(); - // MCObjectWriter interface implementation. + // Insert string data into string table. + Data.insert(Data.end(), String.begin(), String.end()); + Data.push_back('\0'); - void ExecutePostLayoutBinding(MCAssembler &Asm); + // Put a reference to it in the map. + Map[String] = Offset; - void RecordRelocation(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFragment *Fragment, - const MCFixup &Fixup, - MCValue Target, - uint64_t &FixedValue); + // Update the internal length field. 
+ update_length(); - void WriteObject(const MCAssembler &Asm, const MCAsmLayout &Layout); - }; + return Offset; } -WinCOFFObjectWriter::WinCOFFObjectWriter(raw_ostream &OS) - : MCObjectWriter(OS, true) { +//------------------------------------------------------------------------------ +// WinCOFFObjectWriter class implementation + +WinCOFFObjectWriter::WinCOFFObjectWriter(raw_ostream &OS, bool is64Bit) + : MCObjectWriter(OS, true) + , Is64Bit(is64Bit) { + memset(&Header, 0, sizeof(Header)); + + Is64Bit ? Header.Machine = COFF::IMAGE_FILE_MACHINE_AMD64 + : Header.Machine = COFF::IMAGE_FILE_MACHINE_I386; +} + +WinCOFFObjectWriter::~WinCOFFObjectWriter() { + for (symbols::iterator I = Symbols.begin(), E = Symbols.end(); I != E; ++I) + delete *I; + for (sections::iterator I = Sections.begin(), E = Sections.end(); I != E; ++I) + delete *I; +} + +COFFSymbol *WinCOFFObjectWriter::createSymbol(llvm::StringRef Name) { + return createCOFFEntity<COFFSymbol>(Name, Symbols); +} + +COFFSection *WinCOFFObjectWriter::createSection(llvm::StringRef Name) { + return createCOFFEntity<COFFSection>(Name, Sections); +} + +/// This function initializes a symbol by entering its name into the string +/// table if it is too long to fit in the symbol table header. +void WinCOFFObjectWriter::InitCOFFEntity(COFFSymbol &S) { + if (S.Name.size() > COFF::NameSize) { + size_t StringTableEntry = Strings.insert(S.Name.c_str()); + + S.set_name_offset(StringTableEntry); + } else + memcpy(S.Data.Name, S.Name.c_str(), S.Name.size()); +} + +/// This function initializes a section by entering its name into the string +/// table if it is too long to fit in the section table header. +void WinCOFFObjectWriter::InitCOFFEntity(COFFSection &S) { + if (S.Name.size() > COFF::NameSize) { + size_t StringTableEntry = Strings.insert(S.Name.c_str()); + + // FIXME: Why is this number 999999? This number is never mentioned in the + // spec. I'm assuming this is due to the printed value needing to fit into + // the S.Header.Name field. In which case why not 9999999 (7 9's instead of + // 6)? The spec does not state if this entry should be null terminated in + // this case, and thus this seems to be the best way to do it. I think I + // just solved my own FIXME... + if (StringTableEntry > 999999) + report_fatal_error("COFF string table is greater than 999999 bytes."); + + sprintf(S.Header.Name, "/%d", (unsigned)StringTableEntry); + } else + memcpy(S.Header.Name, S.Name.c_str(), S.Name.size()); +} + +/// A template used to lookup or create a symbol/section, and initialize it if +/// needed. +template <typename object_t, typename list_t> +object_t *WinCOFFObjectWriter::createCOFFEntity(llvm::StringRef Name, + list_t &List) { + object_t *Object = new object_t(Name, List.size()); + + InitCOFFEntity(*Object); + + List.push_back(Object); + + return Object; +} + +/// This function takes a section data object from the assembler +/// and creates the associated COFF section staging object. +void WinCOFFObjectWriter::DefineSection(MCSectionData const &SectionData) { + // FIXME: Not sure how to verify this (at least in a debug build). 
+  MCSectionCOFF const &Sec =
+    static_cast<MCSectionCOFF const &>(SectionData.getSection());
+
+  COFFSection *coff_section = createSection(Sec.getSectionName());
+  COFFSymbol *coff_symbol = createSymbol(Sec.getSectionName());
+
+  coff_section->Symb = coff_symbol;
+  coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
+  coff_symbol->Data.SectionNumber = coff_section->Number;
+
+  // In this case the auxiliary symbol is a Section Definition.
+  coff_symbol->Aux.resize(1);
+  memset(&coff_symbol->Aux[0], 0, sizeof(coff_symbol->Aux[0]));
+  coff_symbol->Aux[0].AuxType = ATSectionDefinition;
+  coff_symbol->Aux[0].Aux.SectionDefinition.Number = coff_section->Number;
+  coff_symbol->Aux[0].Aux.SectionDefinition.Selection = Sec.getSelection();
+
+  coff_section->Header.Characteristics = Sec.getCharacteristics();
+
+  uint32_t &Characteristics = coff_section->Header.Characteristics;
+  switch (SectionData.getAlignment()) {
+  case 1:    Characteristics |= COFF::IMAGE_SCN_ALIGN_1BYTES;    break;
+  case 2:    Characteristics |= COFF::IMAGE_SCN_ALIGN_2BYTES;    break;
+  case 4:    Characteristics |= COFF::IMAGE_SCN_ALIGN_4BYTES;    break;
+  case 8:    Characteristics |= COFF::IMAGE_SCN_ALIGN_8BYTES;    break;
+  case 16:   Characteristics |= COFF::IMAGE_SCN_ALIGN_16BYTES;   break;
+  case 32:   Characteristics |= COFF::IMAGE_SCN_ALIGN_32BYTES;   break;
+  case 64:   Characteristics |= COFF::IMAGE_SCN_ALIGN_64BYTES;   break;
+  case 128:  Characteristics |= COFF::IMAGE_SCN_ALIGN_128BYTES;  break;
+  case 256:  Characteristics |= COFF::IMAGE_SCN_ALIGN_256BYTES;  break;
+  case 512:  Characteristics |= COFF::IMAGE_SCN_ALIGN_512BYTES;  break;
+  case 1024: Characteristics |= COFF::IMAGE_SCN_ALIGN_1024BYTES; break;
+  case 2048: Characteristics |= COFF::IMAGE_SCN_ALIGN_2048BYTES; break;
+  case 4096: Characteristics |= COFF::IMAGE_SCN_ALIGN_4096BYTES; break;
+  case 8192: Characteristics |= COFF::IMAGE_SCN_ALIGN_8192BYTES; break;
+  default:
+    llvm_unreachable("unsupported section alignment");
+  }
+
+  // Bind internal COFF section to MC section.
+  coff_section->MCData = &SectionData;
+  SectionMap[&SectionData] = coff_section;
+}
+
+/// This function takes a symbol data object from the assembler
+/// and creates the associated COFF symbol staging object.
+void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
+                                       MCAssembler &Assembler) {
+  COFFSymbol *coff_symbol = createSymbol(SymbolData.getSymbol().getName());
+
+  coff_symbol->Data.Type         = (SymbolData.getFlags() & 0x0000FFFF) >>  0;
+  coff_symbol->Data.StorageClass = (SymbolData.getFlags() & 0x00FF0000) >> 16;
+
+  // If no storage class was specified in the streamer, define it here.
+  if (coff_symbol->Data.StorageClass == 0) {
+    bool external = SymbolData.isExternal() || (SymbolData.Fragment == NULL);
+
+    coff_symbol->Data.StorageClass =
+      external ? COFF::IMAGE_SYM_CLASS_EXTERNAL : COFF::IMAGE_SYM_CLASS_STATIC;
+  }
+
+  if (SymbolData.getFlags() & COFF::SF_WeakReference) {
+    coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL;
+
+    const MCExpr *Value = SymbolData.getSymbol().getVariableValue();
+
+    // FIXME: This assert message isn't very good.
+    assert(Value->getKind() == MCExpr::SymbolRef &&
+           "Value must be a SymbolRef!");
+
+    const MCSymbolRefExpr *SymbolRef =
+      static_cast<const MCSymbolRefExpr *>(Value);
+
+    const MCSymbolData &OtherSymbolData =
+      Assembler.getSymbolData(SymbolRef->getSymbol());
+
+    // FIXME: This assert message isn't very good.
+ assert(SymbolMap.find(&OtherSymbolData) != SymbolMap.end() && + "OtherSymbolData must be in the symbol map!"); + + coff_symbol->Other = SymbolMap[&OtherSymbolData]; + + // Setup the Weak External auxiliary symbol. + coff_symbol->Aux.resize(1); + memset(&coff_symbol->Aux[0], 0, sizeof(coff_symbol->Aux[0])); + coff_symbol->Aux[0].AuxType = ATWeakExternal; + coff_symbol->Aux[0].Aux.WeakExternal.TagIndex = 0; + coff_symbol->Aux[0].Aux.WeakExternal.Characteristics = + COFF::IMAGE_WEAK_EXTERN_SEARCH_LIBRARY; + } + + // Bind internal COFF symbol to MC symbol. + coff_symbol->MCData = &SymbolData; + SymbolMap[&SymbolData] = coff_symbol; +} + +bool WinCOFFObjectWriter::ExportSection(COFFSection *S) { + return (S->Header.Characteristics + & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) == 0; +} + +bool WinCOFFObjectWriter::ExportSymbol(MCSymbolData const &SymbolData, + MCAssembler &Asm) { + // This doesn't seem to be right. Strings referred to from the .data section + // need symbols so they can be linked to code in the .text section right? + + // return Asm.isSymbolLinkerVisible (&SymbolData); + + // For now, all symbols are exported, the linker will sort it out for us. + return true; +} + +//------------------------------------------------------------------------------ +// entity writing methods + +void WinCOFFObjectWriter::WriteFileHeader(const COFF::header &Header) { + WriteLE16(Header.Machine); + WriteLE16(Header.NumberOfSections); + WriteLE32(Header.TimeDateStamp); + WriteLE32(Header.PointerToSymbolTable); + WriteLE32(Header.NumberOfSymbols); + WriteLE16(Header.SizeOfOptionalHeader); + WriteLE16(Header.Characteristics); +} + +void WinCOFFObjectWriter::WriteSymbol(const COFFSymbol *S) { + WriteBytes(StringRef(S->Data.Name, COFF::NameSize)); + WriteLE32(S->Data.Value); + WriteLE16(S->Data.SectionNumber); + WriteLE16(S->Data.Type); + Write8(S->Data.StorageClass); + Write8(S->Data.NumberOfAuxSymbols); + WriteAuxiliarySymbols(S->Aux); +} + +void WinCOFFObjectWriter::WriteAuxiliarySymbols( + const COFFSymbol::AuxiliarySymbols &S) { + for(COFFSymbol::AuxiliarySymbols::const_iterator i = S.begin(), e = S.end(); + i != e; ++i) { + switch(i->AuxType) { + case ATFunctionDefinition: + WriteLE32(i->Aux.FunctionDefinition.TagIndex); + WriteLE32(i->Aux.FunctionDefinition.TotalSize); + WriteLE32(i->Aux.FunctionDefinition.PointerToLinenumber); + WriteLE32(i->Aux.FunctionDefinition.PointerToNextFunction); + WriteZeros(sizeof(i->Aux.FunctionDefinition.unused)); + break; + case ATbfAndefSymbol: + WriteZeros(sizeof(i->Aux.bfAndefSymbol.unused1)); + WriteLE16(i->Aux.bfAndefSymbol.Linenumber); + WriteZeros(sizeof(i->Aux.bfAndefSymbol.unused2)); + WriteLE32(i->Aux.bfAndefSymbol.PointerToNextFunction); + WriteZeros(sizeof(i->Aux.bfAndefSymbol.unused3)); + break; + case ATWeakExternal: + WriteLE32(i->Aux.WeakExternal.TagIndex); + WriteLE32(i->Aux.WeakExternal.Characteristics); + WriteZeros(sizeof(i->Aux.WeakExternal.unused)); + break; + case ATFile: + WriteBytes(StringRef(reinterpret_cast<const char *>(i->Aux.File.FileName), + sizeof(i->Aux.File.FileName))); + break; + case ATSectionDefinition: + WriteLE32(i->Aux.SectionDefinition.Length); + WriteLE16(i->Aux.SectionDefinition.NumberOfRelocations); + WriteLE16(i->Aux.SectionDefinition.NumberOfLinenumbers); + WriteLE32(i->Aux.SectionDefinition.CheckSum); + WriteLE16(i->Aux.SectionDefinition.Number); + Write8(i->Aux.SectionDefinition.Selection); + WriteZeros(sizeof(i->Aux.SectionDefinition.unused)); + break; + } + } +} + +void WinCOFFObjectWriter::WriteSectionHeader(const 
COFF::section &S) { + WriteBytes(StringRef(S.Name, COFF::NameSize)); + + WriteLE32(S.VirtualSize); + WriteLE32(S.VirtualAddress); + WriteLE32(S.SizeOfRawData); + WriteLE32(S.PointerToRawData); + WriteLE32(S.PointerToRelocations); + WriteLE32(S.PointerToLineNumbers); + WriteLE16(S.NumberOfRelocations); + WriteLE16(S.NumberOfLineNumbers); + WriteLE32(S.Characteristics); +} + +void WinCOFFObjectWriter::WriteRelocation(const COFF::relocation &R) { + WriteLE32(R.VirtualAddress); + WriteLE32(R.SymbolTableIndex); + WriteLE16(R.Type); } //////////////////////////////////////////////////////////////////////////////// // MCObjectWriter interface implementations void WinCOFFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm) { + // "Define" each section & symbol. This creates section & symbol + // entries in the staging area and gives them their final indexes. + + for (MCAssembler::const_iterator i = Asm.begin(), e = Asm.end(); i != e; i++) + DefineSection(*i); + + for (MCAssembler::const_symbol_iterator i = Asm.symbol_begin(), + e = Asm.symbol_end(); i != e; i++) { + if (ExportSymbol(*i, Asm)) + DefineSymbol(*i, Asm); + } } void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm, @@ -55,17 +566,209 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) { + assert(Target.getSymA() != NULL && "Relocation must reference a symbol!"); + + const MCSymbol *A = &Target.getSymA()->getSymbol(); + MCSymbolData &A_SD = Asm.getSymbolData(*A); + + MCSectionData const *SectionData = Fragment->getParent(); + + // Mark this symbol as requiring an entry in the symbol table. + assert(SectionMap.find(SectionData) != SectionMap.end() && + "Section must already have been defined in ExecutePostLayoutBinding!"); + assert(SymbolMap.find(&A_SD) != SymbolMap.end() && + "Symbol must already have been defined in ExecutePostLayoutBinding!"); + + COFFSection *coff_section = SectionMap[SectionData]; + COFFSymbol *coff_symbol = SymbolMap[&A_SD]; + + if (Target.getSymB()) { + const MCSymbol *B = &Target.getSymB()->getSymbol(); + MCSymbolData &B_SD = Asm.getSymbolData(*B); + + FixedValue = Layout.getSymbolAddress(&A_SD) - Layout.getSymbolAddress(&B_SD); + + // In the case where we have SymbA and SymB, we just need to store the delta + // between the two symbols. Update FixedValue to account for the delta, and + // skip recording the relocation. + return; + } else { + FixedValue = Target.getConstant(); + } + + COFFRelocation Reloc; + + Reloc.Data.SymbolTableIndex = 0; + Reloc.Data.VirtualAddress = Layout.getFragmentOffset(Fragment); + Reloc.Symb = coff_symbol; + + Reloc.Data.VirtualAddress += Fixup.getOffset(); + + switch (Fixup.getKind()) { + case X86::reloc_pcrel_4byte: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + Reloc.Data.Type = Is64Bit ? COFF::IMAGE_REL_AMD64_REL32 + : COFF::IMAGE_REL_I386_REL32; + // FIXME: Can anyone explain what this does other than adjust for the size + // of the offset? + FixedValue += 4; + break; + case FK_Data_4: + Reloc.Data.Type = Is64Bit ? 
COFF::IMAGE_REL_AMD64_ADDR32 + : COFF::IMAGE_REL_I386_DIR32; + break; + case FK_Data_8: + if (Is64Bit) + Reloc.Data.Type = COFF::IMAGE_REL_AMD64_ADDR64; + else + llvm_unreachable("unsupported relocation type"); + break; + default: + llvm_unreachable("unsupported relocation type"); + } + + coff_section->Relocations.push_back(Reloc); } void WinCOFFObjectWriter::WriteObject(const MCAssembler &Asm, const MCAsmLayout &Layout) { + // Assign symbol and section indexes and offsets. + + Header.NumberOfSymbols = 0; + + for (symbols::iterator i = Symbols.begin(), e = Symbols.end(); i != e; i++) { + COFFSymbol *coff_symbol = *i; + MCSymbolData const *SymbolData = coff_symbol->MCData; + + coff_symbol->Index = Header.NumberOfSymbols++; + + // Update section number & offset for symbols that have them. + if ((SymbolData != NULL) && (SymbolData->Fragment != NULL)) { + COFFSection *coff_section = SectionMap[SymbolData->Fragment->getParent()]; + + coff_symbol->Data.SectionNumber = coff_section->Number; + coff_symbol->Data.Value = Layout.getFragmentOffset(SymbolData->Fragment) + + SymbolData->Offset; + } + + // Update auxiliary symbol info. + coff_symbol->Data.NumberOfAuxSymbols = coff_symbol->Aux.size(); + Header.NumberOfSymbols += coff_symbol->Data.NumberOfAuxSymbols; + } + + // Fixup weak external references. + for (symbols::iterator i = Symbols.begin(), e = Symbols.end(); i != e; i++) { + COFFSymbol *symb = *i; + + if (symb->Other != NULL) { + assert(symb->Aux.size() == 1 && + "Symbol must contain one aux symbol!"); + assert(symb->Aux[0].AuxType == ATWeakExternal && + "Symbol's aux symbol must be a Weak External!"); + symb->Aux[0].Aux.WeakExternal.TagIndex = symb->Other->Index; + } + } + + // Assign file offsets to COFF object file structures. + + unsigned offset = 0; + + offset += COFF::HeaderSize; + offset += COFF::SectionSize * Asm.size(); + + Header.NumberOfSections = Sections.size(); + + for (MCAssembler::const_iterator i = Asm.begin(), + e = Asm.end(); + i != e; i++) { + COFFSection *Sec = SectionMap[i]; + + Sec->Header.SizeOfRawData = Layout.getSectionFileSize(i); + + if (ExportSection(Sec)) { + Sec->Header.PointerToRawData = offset; + + offset += Sec->Header.SizeOfRawData; + } + + if (Sec->Relocations.size() > 0) { + Sec->Header.NumberOfRelocations = Sec->Relocations.size(); + Sec->Header.PointerToRelocations = offset; + + offset += COFF::RelocationSize * Sec->Relocations.size(); + + for (relocations::iterator cr = Sec->Relocations.begin(), + er = Sec->Relocations.end(); + cr != er; cr++) { + (*cr).Data.SymbolTableIndex = (*cr).Symb->Index; + } + } + + assert(Sec->Symb->Aux.size() == 1 && "Section's symbol must have one aux!"); + AuxSymbol &Aux = Sec->Symb->Aux[0]; + assert(Aux.AuxType == ATSectionDefinition && + "Section's symbol's aux symbol must be a Section Definition!"); + Aux.Aux.SectionDefinition.Length = Sec->Header.SizeOfRawData; + Aux.Aux.SectionDefinition.NumberOfRelocations = + Sec->Header.NumberOfRelocations; + Aux.Aux.SectionDefinition.NumberOfLinenumbers = + Sec->Header.NumberOfLineNumbers; + } + + Header.PointerToSymbolTable = offset; + + Header.TimeDateStamp = sys::TimeValue::now().toEpochTime(); + + // Write it all to disk... 
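// The offset accounting above mirrors the on-disk layout of a COFF object:
// file header, section table, then each section's raw data and relocations,
// with the symbol table (and string table) at the end. A schematic version of
// the same bookkeeping; the struct and constants are illustrative (20-byte
// header, 40-byte section header, 10-byte relocation, per the COFF spec), and
// "has raw data" is simplified here to a nonzero size.
#include <cstdint>
#include <vector>

struct SectionLayout {
  uint32_t RawDataSize, NumRelocations;
  uint32_t PointerToRawData, PointerToRelocations;
};

static uint32_t layoutObjectFile(std::vector<SectionLayout> &Sections) {
  uint32_t Offset = 20 + 40 * (uint32_t)Sections.size();
  for (size_t i = 0, e = Sections.size(); i != e; ++i) {
    SectionLayout &S = Sections[i];
    S.PointerToRawData = S.RawDataSize ? Offset : 0;
    Offset += S.RawDataSize;
    S.PointerToRelocations = S.NumRelocations ? Offset : 0;
    Offset += 10 * S.NumRelocations;
  }
  return Offset; // This is what becomes Header.PointerToSymbolTable.
}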
+ WriteFileHeader(Header); + + { + sections::iterator i, ie; + MCAssembler::const_iterator j, je; + + for (i = Sections.begin(), ie = Sections.end(); i != ie; i++) + WriteSectionHeader((*i)->Header); + + for (i = Sections.begin(), ie = Sections.end(), + j = Asm.begin(), je = Asm.end(); + (i != ie) && (j != je); i++, j++) { + if ((*i)->Header.PointerToRawData != 0) { + assert(OS.tell() == (*i)->Header.PointerToRawData && + "Section::PointerToRawData is insane!"); + + Asm.WriteSectionData(j, Layout, this); + } + + if ((*i)->Relocations.size() > 0) { + assert(OS.tell() == (*i)->Header.PointerToRelocations && + "Section::PointerToRelocations is insane!"); + + for (relocations::const_iterator k = (*i)->Relocations.begin(), + ke = (*i)->Relocations.end(); + k != ke; k++) { + WriteRelocation(k->Data); + } + } else + assert((*i)->Header.PointerToRelocations == 0 && + "Section::PointerToRelocations is insane!"); + } + } + + assert(OS.tell() == Header.PointerToSymbolTable && + "Header::PointerToSymbolTable is insane!"); + + for (symbols::iterator i = Symbols.begin(), e = Symbols.end(); i != e; i++) + WriteSymbol(*i); + + OS.write((char const *)&Strings.Data.front(), Strings.Data.size()); } //------------------------------------------------------------------------------ // WinCOFFObjectWriter factory function namespace llvm { - MCObjectWriter *createWinCOFFObjectWriter(raw_ostream &OS) { - return new WinCOFFObjectWriter(OS); + MCObjectWriter *createWinCOFFObjectWriter(raw_ostream &OS, bool is64Bit) { + return new WinCOFFObjectWriter(OS, is64Bit); } } diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp index 1030cdb28d2cd..8a194bff21513 100644 --- a/lib/MC/WinCOFFStreamer.cpp +++ b/lib/MC/WinCOFFStreamer.cpp @@ -18,27 +18,34 @@ #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCSectionCOFF.h" +#include "llvm/Target/TargetRegistry.h" #include "llvm/Target/TargetAsmBackend.h" +#include "llvm/ADT/StringMap.h" + #include "llvm/Support/COFF.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; -#define dbg_notimpl(x) \ - do { dbgs() << "not implemented, " << __FUNCTION__ << " (" << x << ")"; \ - abort(); } while (false); - namespace { class WinCOFFStreamer : public MCObjectStreamer { public: + MCSymbol const *CurSymbol; + WinCOFFStreamer(MCContext &Context, TargetAsmBackend &TAB, MCCodeEmitter &CE, raw_ostream &OS); + void AddCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment, bool External); + // MCStreamer interface virtual void EmitLabel(MCSymbol *Symbol); @@ -52,18 +59,18 @@ public: virtual void EndCOFFSymbolDef(); virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value); virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment); + unsigned ByteAlignment); virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size); virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size,unsigned ByteAlignment); + unsigned Size,unsigned ByteAlignment); virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, - uint64_t Size, unsigned ByteAlignment); + uint64_t Size, unsigned ByteAlignment); virtual void EmitBytes(StringRef Data, unsigned AddrSpace); - virtual void EmitValue(const MCExpr *Value, unsigned Size, + virtual void 
EmitValue(const MCExpr *Value, unsigned Size, unsigned AddrSpace); virtual void EmitGPRel32Value(const MCExpr *Value); virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value, - unsigned ValueSize, unsigned MaxBytesToEmit); + unsigned ValueSize, unsigned MaxBytesToEmit); virtual void EmitCodeAlignment(unsigned ByteAlignment, unsigned MaxBytesToEmit); virtual void EmitValueToOffset(const MCExpr *Offset, unsigned char Value); @@ -78,96 +85,224 @@ WinCOFFStreamer::WinCOFFStreamer(MCContext &Context, TargetAsmBackend &TAB, MCCodeEmitter &CE, raw_ostream &OS) - : MCObjectStreamer(Context, TAB, OS, &CE) { + : MCObjectStreamer(Context, TAB, OS, &CE) + , CurSymbol(NULL) { +} + +void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment, bool External) { + assert(!Symbol->isInSection() && "Symbol must not already have a section!"); + + std::string SectionName(".bss$linkonce"); + SectionName.append(Symbol->getName().begin(), Symbol->getName().end()); + + MCSymbolData &SymbolData = getAssembler().getOrCreateSymbolData(*Symbol); + + unsigned Characteristics = + COFF::IMAGE_SCN_LNK_COMDAT | + COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE; + + int Selection = COFF::IMAGE_COMDAT_SELECT_LARGEST; + + const MCSection *Section = MCStreamer::getContext().getCOFFSection( + SectionName, Characteristics, Selection, SectionKind::getBSS()); + + MCSectionData &SectionData = getAssembler().getOrCreateSectionData(*Section); + + if (SectionData.getAlignment() < ByteAlignment) + SectionData.setAlignment(ByteAlignment); + + SymbolData.setExternal(External); + + Symbol->setSection(*Section); + + if (ByteAlignment != 1) + new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, &SectionData); + + SymbolData.setFragment(new MCFillFragment(0, 0, Size, &SectionData)); } // MCStreamer interface void WinCOFFStreamer::EmitLabel(MCSymbol *Symbol) { + // TODO: This is copied almost exactly from the MachOStreamer. Consider + // merging into MCObjectStreamer? + assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); + assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); + assert(CurSection && "Cannot emit before setting section!"); + + Symbol->setSection(*CurSection); + + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); + + // FIXME: This is wasteful, we don't necessarily need to create a data + // fragment. Instead, we should mark the symbol as pointing into the data + // fragment if it exists, otherwise we should just queue the label and set its + // fragment pointer when we emit the next fragment. + MCDataFragment *DF = getOrCreateDataFragment(); + + assert(!SD.getFragment() && "Unexpected fragment on symbol data!"); + SD.setFragment(DF); + SD.setOffset(DF->getContents().size()); } void WinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { - dbg_notimpl("Flag = " << Flag); + llvm_unreachable("not implemented"); } void WinCOFFStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { + // TODO: This is exactly the same as MachOStreamer. Consider merging into + // MCObjectStreamer. 
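// EmitLabel above pins a symbol to the current end of a data fragment; its
// eventual address is the fragment's layout offset plus that recorded offset
// (the same sum appears later in WinCOFFObjectWriter::WriteObject).
// Schematically, with plain structs standing in for MCDataFragment and
// MCSymbolData; these names are illustrative only.
#include <cstdint>
#include <vector>

struct Fragment {
  std::vector<uint8_t> Contents;
  uint64_t LayoutOffset; // Assigned later, during layout.
};

struct Label {
  Fragment *F;
  uint64_t Offset; // Offset of the label within its fragment.
};

static Label emitLabel(Fragment &F) {
  Label L = { &F, F.Contents.size() }; // Points at the next byte emitted.
  return L;
}

static uint64_t labelAddress(const Label &L) {
  return L.F->LayoutOffset + L.Offset;
}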
+ getAssembler().getOrCreateSymbolData(*Symbol); + AddValueSymbols(Value); + Symbol->setVariableValue(Value); } void WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) { + switch (Attribute) { + case MCSA_WeakReference: + getAssembler().getOrCreateSymbolData(*Symbol).modifyFlags( + COFF::SF_WeakReference, + COFF::SF_WeakReference); + break; + + case MCSA_Global: + getAssembler().getOrCreateSymbolData(*Symbol).setExternal(true); + break; + + default: + llvm_unreachable("unsupported attribute"); + break; + } } void WinCOFFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) { - dbg_notimpl("Symbol = " << Symbol->getName() << ", DescValue = "<< DescValue); + llvm_unreachable("not implemented"); } void WinCOFFStreamer::BeginCOFFSymbolDef(MCSymbol const *Symbol) { + assert(CurSymbol == NULL && "EndCOFFSymbolDef must be called between calls " + "to BeginCOFFSymbolDef!"); + CurSymbol = Symbol; } void WinCOFFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) { + assert(CurSymbol != NULL && "BeginCOFFSymbolDef must be called first!"); + assert((StorageClass & ~0xFF) == 0 && "StorageClass must only have data in " + "the first byte!"); + + getAssembler().getOrCreateSymbolData(*CurSymbol).modifyFlags( + StorageClass << COFF::SF_ClassShift, + COFF::SF_ClassMask); } void WinCOFFStreamer::EmitCOFFSymbolType(int Type) { + assert(CurSymbol != NULL && "BeginCOFFSymbolDef must be called first!"); + assert((Type & ~0xFFFF) == 0 && "Type must only have data in the first 2 " + "bytes"); + + getAssembler().getOrCreateSymbolData(*CurSymbol).modifyFlags( + Type << COFF::SF_TypeShift, + COFF::SF_TypeMask); } void WinCOFFStreamer::EndCOFFSymbolDef() { + assert(CurSymbol != NULL && "BeginCOFFSymbolDef must be called first!"); + CurSymbol = NULL; } void WinCOFFStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) { - dbg_notimpl("Symbol = " << Symbol->getName() << ", Value = " << *Value); + llvm_unreachable("not implemented"); } void WinCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) { + unsigned ByteAlignment) { + AddCommonSymbol(Symbol, Size, ByteAlignment, true); } void WinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size) { + AddCommonSymbol(Symbol, Size, 1, false); } void WinCOFFStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - unsigned Size,unsigned ByteAlignment) { - MCSectionCOFF const *SectionCOFF = - static_cast<MCSectionCOFF const *>(Section); - - dbg_notimpl("Section = " << SectionCOFF->getSectionName() << ", Symbol = " << - Symbol->getName() << ", Size = " << Size << ", ByteAlignment = " - << ByteAlignment); + unsigned Size,unsigned ByteAlignment) { + llvm_unreachable("not implemented"); } void WinCOFFStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) { - MCSectionCOFF const *SectionCOFF = - static_cast<MCSectionCOFF const *>(Section); - - dbg_notimpl("Section = " << SectionCOFF->getSectionName() << ", Symbol = " << - Symbol->getName() << ", Size = " << Size << ", ByteAlignment = " - << ByteAlignment); + llvm_unreachable("not implemented"); } void WinCOFFStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) { + // TODO: This is copied exactly from the MachOStreamer. Consider merging into + // MCObjectStreamer? 
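// EmitCOFFSymbolType and EmitCOFFSymbolStorageClass above stash their values
// in the MCSymbolData flags word, and DefineSymbol in the object writer
// unpacks them again: the COFF type in bits 0-15, the storage class in bits
// 16-23. A self-contained sketch of that scheme; the shift and mask values
// are read off the unpacking code above, not taken from llvm/Support/COFF.h.
#include <cstdint>

static const uint32_t TypeMask = 0x0000FFFF, ClassMask = 0x00FF0000;
static const unsigned TypeShift = 0, ClassShift = 16;

static uint32_t packSymbolFlags(uint16_t Type, uint8_t StorageClass) {
  return ((uint32_t)Type << TypeShift) | ((uint32_t)StorageClass << ClassShift);
}

static void unpackSymbolFlags(uint32_t Flags, uint16_t &Type,
                              uint8_t &StorageClass) {
  Type = (uint16_t)((Flags & TypeMask) >> TypeShift);
  StorageClass = (uint8_t)((Flags & ClassMask) >> ClassShift);
}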
+ getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end()); } void WinCOFFStreamer::EmitValue(const MCExpr *Value, unsigned Size, - unsigned AddrSpace) { + unsigned AddrSpace) { + assert(AddrSpace == 0 && "Address space must be 0!"); + + // TODO: This is copied exactly from the MachOStreamer. Consider merging into + // MCObjectStreamer? + MCDataFragment *DF = getOrCreateDataFragment(); + + // Avoid fixups when possible. + int64_t AbsValue; + if (AddValueSymbols(Value)->EvaluateAsAbsolute(AbsValue)) { + // FIXME: Endianness assumption. + for (unsigned i = 0; i != Size; ++i) + DF->getContents().push_back(uint8_t(AbsValue >> (i * 8))); + } else { + DF->addFixup(MCFixup::Create(DF->getContents().size(), + AddValueSymbols(Value), + MCFixup::getKindForSize(Size))); + DF->getContents().resize(DF->getContents().size() + Size, 0); + } } void WinCOFFStreamer::EmitGPRel32Value(const MCExpr *Value) { - dbg_notimpl("Value = '" << *Value); + llvm_unreachable("not implemented"); } void WinCOFFStreamer::EmitValueToAlignment(unsigned ByteAlignment, - int64_t Value, - unsigned ValueSize, - unsigned MaxBytesToEmit) { + int64_t Value, + unsigned ValueSize, + unsigned MaxBytesToEmit) { + // TODO: This is copied exactly from the MachOStreamer. Consider merging into + // MCObjectStreamer? + if (MaxBytesToEmit == 0) + MaxBytesToEmit = ByteAlignment; + new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit, + getCurrentSectionData()); + + // Update the maximum alignment on the current section if necessary. + if (ByteAlignment > getCurrentSectionData()->getAlignment()) + getCurrentSectionData()->setAlignment(ByteAlignment); } void WinCOFFStreamer::EmitCodeAlignment(unsigned ByteAlignment, - unsigned MaxBytesToEmit = 0) { + unsigned MaxBytesToEmit) { + // TODO: This is copied exactly from the MachOStreamer. Consider merging into + // MCObjectStreamer? + if (MaxBytesToEmit == 0) + MaxBytesToEmit = ByteAlignment; + MCAlignFragment *F = new MCAlignFragment(ByteAlignment, 0, 1, MaxBytesToEmit, + getCurrentSectionData()); + F->setEmitNops(true); + + // Update the maximum alignment on the current section if necessary. 
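// Both alignment emitters above only record an MCAlignFragment; the number of
// pad bytes is decided at layout time. The arithmetic is the usual round-up,
// with MaxBytesToEmit acting as a cap: if the required padding exceeds the
// cap, nothing is emitted. A sketch under those assumptions:
#include <cstdint>

static uint64_t paddingBytes(uint64_t Offset, uint64_t ByteAlignment,
                             uint64_t MaxBytesToEmit) {
  // Same convention as the streamer: 0 means "up to one full alignment unit".
  if (MaxBytesToEmit == 0)
    MaxBytesToEmit = ByteAlignment;
  uint64_t Pad = (ByteAlignment - Offset % ByteAlignment) % ByteAlignment;
  return Pad <= MaxBytesToEmit ? Pad : 0;
}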
+ if (ByteAlignment > getCurrentSectionData()->getAlignment()) + getCurrentSectionData()->setAlignment(ByteAlignment); } void WinCOFFStreamer::EmitValueToOffset(const MCExpr *Offset, - unsigned char Value = 0) { - dbg_notimpl("Offset = '" << *Offset << "', Value = " << Value); + unsigned char Value) { + llvm_unreachable("not implemented"); } void WinCOFFStreamer::EmitFileDirective(StringRef Filename) { @@ -176,11 +311,24 @@ void WinCOFFStreamer::EmitFileDirective(StringRef Filename) { } void WinCOFFStreamer::EmitDwarfFileDirective(unsigned FileNo, - StringRef Filename) { - dbg_notimpl("FileNo = " << FileNo << ", Filename = '" << Filename << "'"); + StringRef Filename) { + llvm_unreachable("not implemented"); } void WinCOFFStreamer::EmitInstruction(const MCInst &Instruction) { + for (unsigned i = 0, e = Instruction.getNumOperands(); i != e; ++i) + if (Instruction.getOperand(i).isExpr()) + AddValueSymbols(Instruction.getOperand(i).getExpr()); + + getCurrentSectionData()->setHasInstructions(true); + + MCInstFragment *Fragment = + new MCInstFragment(Instruction, getCurrentSectionData()); + + raw_svector_ostream VecOS(Fragment->getCode()); + + getAssembler().getEmitter().EncodeInstruction(Instruction, VecOS, + Fragment->getFixups()); } void WinCOFFStreamer::Finish() { @@ -192,7 +340,10 @@ namespace llvm MCStreamer *createWinCOFFStreamer(MCContext &Context, TargetAsmBackend &TAB, MCCodeEmitter &CE, - raw_ostream &OS) { - return new WinCOFFStreamer(Context, TAB, CE, OS); + raw_ostream &OS, + bool RelaxAll) { + WinCOFFStreamer *S = new WinCOFFStreamer(Context, TAB, CE, OS); + S->getAssembler().setRelaxAll(RelaxAll); + return S; } } diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp index 2e78557011331..b87ddf9c95b58 100644 --- a/lib/Support/APFloat.cpp +++ b/lib/Support/APFloat.cpp @@ -153,6 +153,7 @@ readExponent(StringRef::iterator begin, StringRef::iterator end) value += absExponent * 10; if (absExponent >= overlargeExponent) { absExponent = overlargeExponent; + p = end; /* outwit assert below */ break; } absExponent = value; diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 262fa42ab2ced..8a212a291f24d 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -2123,15 +2123,16 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix, char *BufPtr = Buffer+65; uint64_t N; - if (Signed) { + if (!Signed) { + N = getZExtValue(); + } else { int64_t I = getSExtValue(); - if (I < 0) { + if (I >= 0) { + N = I; + } else { Str.push_back('-'); - I = -I; + N = -(uint64_t)I; } - N = I; - } else { - N = getZExtValue(); } while (N) { diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index 366d2f799211d..0c70a402654e5 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_library(LLVMSupport circular_raw_ostream.cpp CommandLine.cpp ConstantRange.cpp + CrashRecoveryContext.cpp Debug.cpp DeltaAlgorithm.cpp DAGDeltaAlgorithm.cpp @@ -23,7 +24,6 @@ add_llvm_library(LLVMSupport PluginLoader.cpp PrettyStackTrace.cpp Regex.cpp - SlowOperationInformer.cpp SmallPtrSet.cpp SmallVector.cpp SourceMgr.cpp diff --git a/lib/Support/ConstantRange.cpp b/lib/Support/ConstantRange.cpp index 2746f7aaaa5e8..8ef3785f53318 100644 --- a/lib/Support/ConstantRange.cpp +++ b/lib/Support/ConstantRange.cpp @@ -21,6 +21,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Constants.h" #include "llvm/Support/ConstantRange.h" #include "llvm/Support/Debug.h" #include 
"llvm/Support/raw_ostream.h" @@ -38,7 +39,7 @@ ConstantRange::ConstantRange(uint32_t BitWidth, bool Full) { /// Initialize a range to hold the single specified value. /// -ConstantRange::ConstantRange(const APInt & V) : Lower(V), Upper(V + 1) {} +ConstantRange::ConstantRange(const APInt &V) : Lower(V), Upper(V + 1) {} ConstantRange::ConstantRange(const APInt &L, const APInt &U) : Lower(L), Upper(U) { @@ -202,14 +203,12 @@ bool ConstantRange::contains(const APInt &V) const { } /// contains - Return true if the argument is a subset of this range. -/// Two equal set contain each other. The empty set is considered to be -/// contained by all other sets. +/// Two equal sets contain each other. The empty set contained by all other +/// sets. /// bool ConstantRange::contains(const ConstantRange &Other) const { - if (isFullSet()) return true; - if (Other.isFullSet()) return false; - if (Other.isEmptySet()) return true; - if (isEmptySet()) return false; + if (isFullSet() || Other.isEmptySet()) return true; + if (isEmptySet() || Other.isFullSet()) return false; if (!isWrappedSet()) { if (Other.isWrappedSet()) @@ -235,46 +234,6 @@ ConstantRange ConstantRange::subtract(const APInt &Val) const { return ConstantRange(Lower - Val, Upper - Val); } - -// intersect1Wrapped - This helper function is used to intersect two ranges when -// it is known that LHS is wrapped and RHS isn't. -// -ConstantRange -ConstantRange::intersect1Wrapped(const ConstantRange &LHS, - const ConstantRange &RHS) { - assert(LHS.isWrappedSet() && !RHS.isWrappedSet()); - - // Check to see if we overlap on the Left side of RHS... - // - if (RHS.Lower.ult(LHS.Upper)) { - // We do overlap on the left side of RHS, see if we overlap on the right of - // RHS... - if (RHS.Upper.ugt(LHS.Lower)) { - // Ok, the result overlaps on both the left and right sides. See if the - // resultant interval will be smaller if we wrap or not... - // - if (LHS.getSetSize().ult(RHS.getSetSize())) - return LHS; - else - return RHS; - - } else { - // No overlap on the right, just on the left. - return ConstantRange(RHS.Lower, LHS.Upper); - } - } else { - // We don't overlap on the left side of RHS, see if we overlap on the right - // of RHS... - if (RHS.Upper.ugt(LHS.Lower)) { - // Simple overlap... - return ConstantRange(LHS.Lower, RHS.Upper); - } else { - // No overlap... - return ConstantRange(LHS.getBitWidth(), false); - } - } -} - /// intersectWith - Return the range that results from the intersection of this /// range with another range. 
The resultant range is guaranteed to include all /// elements contained in both input ranges, and to have the smallest possible @@ -486,7 +445,7 @@ ConstantRange ConstantRange::truncate(uint32_t DstTySize) const { assert(SrcTySize > DstTySize && "Not a value truncation"); APInt Size(APInt::getLowBitsSet(SrcTySize, DstTySize)); if (isFullSet() || getSetSize().ugt(Size)) - return ConstantRange(DstTySize); + return ConstantRange(DstTySize, /*isFullSet=*/true); APInt L = Lower; L.trunc(DstTySize); APInt U = Upper; U.trunc(DstTySize); @@ -539,6 +498,27 @@ ConstantRange::add(const ConstantRange &Other) const { } ConstantRange +ConstantRange::sub(const ConstantRange &Other) const { + if (isEmptySet() || Other.isEmptySet()) + return ConstantRange(getBitWidth(), /*isFullSet=*/false); + if (isFullSet() || Other.isFullSet()) + return ConstantRange(getBitWidth(), /*isFullSet=*/true); + + APInt Spread_X = getSetSize(), Spread_Y = Other.getSetSize(); + APInt NewLower = getLower() - Other.getLower(); + APInt NewUpper = getUpper() - Other.getUpper() + 1; + if (NewLower == NewUpper) + return ConstantRange(getBitWidth(), /*isFullSet=*/true); + + ConstantRange X = ConstantRange(NewLower, NewUpper); + if (X.getSetSize().ult(Spread_X) || X.getSetSize().ult(Spread_Y)) + // We've wrapped, therefore, full set. + return ConstantRange(getBitWidth(), /*isFullSet=*/true); + + return X; +} + +ConstantRange ConstantRange::multiply(const ConstantRange &Other) const { // TODO: If either operand is a single element and the multiply is known to // be non-wrapping, round the result min and max value to the appropriate @@ -616,40 +596,42 @@ ConstantRange::udiv(const ConstantRange &RHS) const { } ConstantRange -ConstantRange::shl(const ConstantRange &Amount) const { - if (isEmptySet()) - return *this; +ConstantRange::shl(const ConstantRange &Other) const { + if (isEmptySet() || Other.isEmptySet()) + return ConstantRange(getBitWidth(), /*isFullSet=*/false); - APInt min = getUnsignedMin() << Amount.getUnsignedMin(); - APInt max = getUnsignedMax() << Amount.getUnsignedMax(); + APInt min = getUnsignedMin().shl(Other.getUnsignedMin()); + APInt max = getUnsignedMax().shl(Other.getUnsignedMax()); // there's no overflow! 
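// A quick check of the interval arithmetic in sub above, with 8-bit values
// standing in for APInt: for X = {0} and Y = {0, ..., 9}, X - Y is
// {-9, ..., 0}, i.e. the half-open range [-9, 1).
#include <cassert>
#include <cstdint>

int main() {
  uint8_t Lx = 0, Ux = 1, Ly = 0, Uy = 10;
  uint8_t NewLower = (uint8_t)(Lx - Uy + 1); // -9 mod 256 == 247
  uint8_t NewUpper = (uint8_t)(Ux - Ly);     // 1
  assert(NewLower == (uint8_t)-9 && NewUpper == 1);
  return 0;
}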
APInt Zeros(getBitWidth(), getUnsignedMax().countLeadingZeros()); - if (Zeros.uge(Amount.getUnsignedMax())) - return ConstantRange(min, max); + if (Zeros.ugt(Other.getUnsignedMax())) + return ConstantRange(min, max + 1); // FIXME: implement the other tricky cases - return ConstantRange(getBitWidth()); + return ConstantRange(getBitWidth(), /*isFullSet=*/true); } ConstantRange -ConstantRange::ashr(const ConstantRange &Amount) const { - if (isEmptySet()) - return *this; +ConstantRange::lshr(const ConstantRange &Other) const { + if (isEmptySet() || Other.isEmptySet()) + return ConstantRange(getBitWidth(), /*isFullSet=*/false); + + APInt max = getUnsignedMax().lshr(Other.getUnsignedMin()); + APInt min = getUnsignedMin().lshr(Other.getUnsignedMax()); + if (min == max + 1) + return ConstantRange(getBitWidth(), /*isFullSet=*/true); - APInt min = getUnsignedMax().ashr(Amount.getUnsignedMin()); - APInt max = getUnsignedMin().ashr(Amount.getUnsignedMax()); - return ConstantRange(min, max); + return ConstantRange(min, max + 1); } -ConstantRange -ConstantRange::lshr(const ConstantRange &Amount) const { - if (isEmptySet()) - return *this; - - APInt min = getUnsignedMax().lshr(Amount.getUnsignedMin()); - APInt max = getUnsignedMin().lshr(Amount.getUnsignedMax()); - return ConstantRange(min, max); +ConstantRange ConstantRange::inverse() const { + if (isFullSet()) { + return ConstantRange(getBitWidth(), /*isFullSet=*/false); + } else if (isEmptySet()) { + return ConstantRange(getBitWidth(), /*isFullSet=*/true); + } + return ConstantRange(Upper, Lower); } /// print - Print out the bounds to a stream... @@ -668,5 +650,3 @@ void ConstantRange::print(raw_ostream &OS) const { void ConstantRange::dump() const { print(dbgs()); } - - diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp new file mode 100644 index 0000000000000..49258ede83c1d --- /dev/null +++ b/lib/Support/CrashRecoveryContext.cpp @@ -0,0 +1,204 @@ +//===--- CrashRecoveryContext.cpp - Crash Recovery ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/CrashRecoveryContext.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Config/config.h" +#include "llvm/System/Mutex.h" +#include "llvm/System/ThreadLocal.h" +#include <setjmp.h> +#include <cstdio> +using namespace llvm; + +namespace { + +struct CrashRecoveryContextImpl; + +static sys::ThreadLocal<const CrashRecoveryContextImpl> CurrentContext; + +struct CrashRecoveryContextImpl { + CrashRecoveryContext *CRC; + std::string Backtrace; + ::jmp_buf JumpBuffer; + volatile unsigned Failed : 1; + +public: + CrashRecoveryContextImpl(CrashRecoveryContext *CRC) : CRC(CRC), + Failed(false) { + CurrentContext.set(this); + } + ~CrashRecoveryContextImpl() { + CurrentContext.erase(); + } + + void HandleCrash() { + // Eliminate the current context entry, to avoid re-entering in case the + // cleanup code crashes. + CurrentContext.erase(); + + assert(!Failed && "Crash recovery context already failed!"); + Failed = true; + + // FIXME: Stash the backtrace. + + // Jump back to the RunSafely we were called under. 
+    longjmp(JumpBuffer, 1);
+  }
+};
+
+}
+
+static sys::Mutex gCrashRecoveryContexMutex;
+static bool gCrashRecoveryEnabled = false;
+
+CrashRecoveryContext::~CrashRecoveryContext() {
+  CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl;
+  delete CRCI;
+}
+
+CrashRecoveryContext *CrashRecoveryContext::GetCurrent() {
+  const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+  if (!CRCI)
+    return 0;
+
+  return CRCI->CRC;
+}
+
+#ifdef LLVM_ON_WIN32
+
+// FIXME: No real Win32 implementation currently.
+
+void CrashRecoveryContext::Enable() {
+  sys::ScopedLock L(gCrashRecoveryContexMutex);
+
+  if (gCrashRecoveryEnabled)
+    return;
+
+  gCrashRecoveryEnabled = true;
+}
+
+void CrashRecoveryContext::Disable() {
+  sys::ScopedLock L(gCrashRecoveryContexMutex);
+
+  if (!gCrashRecoveryEnabled)
+    return;
+
+  gCrashRecoveryEnabled = false;
+}
+
+#else
+
+// Generic POSIX implementation.
+//
+// This implementation relies on synchronous signals being delivered to the
+// current thread. We use a thread local object to keep track of the active
+// crash recovery context, and install signal handlers to invoke HandleCrash on
+// the active object.
+//
+// This implementation does not attempt to chain signal handlers in any
+// reliable fashion -- if we get a signal outside of a crash recovery context we
+// simply disable crash recovery and raise the signal again.
+
+#include <signal.h>
+
+static int Signals[] = { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP };
+static const unsigned NumSignals = sizeof(Signals) / sizeof(Signals[0]);
+static struct sigaction PrevActions[NumSignals];
+
+static void CrashRecoverySignalHandler(int Signal) {
+  // Look up the current thread local recovery object.
+  const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+
+  if (!CRCI) {
+    // We didn't find a crash recovery context -- this means either we got a
+    // signal on a thread we didn't expect it on, the application got a signal
+    // outside of a crash recovery context, or something else went horribly
+    // wrong.
+    //
+    // Disable crash recovery and raise the signal again. The assumption here is
+    // that the enclosing application will terminate soon, and we won't want to
+    // attempt crash recovery again.
+    //
+    // This call of Disable isn't thread safe, but it doesn't actually matter.
+    CrashRecoveryContext::Disable();
+    raise(Signal);
+  }
+
+  // Unblock the signal we received.
+  sigset_t SigMask;
+  sigemptyset(&SigMask);
+  sigaddset(&SigMask, Signal);
+  sigprocmask(SIG_UNBLOCK, &SigMask, 0);
+
+  if (CRCI)
+    const_cast<CrashRecoveryContextImpl*>(CRCI)->HandleCrash();
+}
+
+void CrashRecoveryContext::Enable() {
+  sys::ScopedLock L(gCrashRecoveryContexMutex);
+
+  if (gCrashRecoveryEnabled)
+    return;
+
+  gCrashRecoveryEnabled = true;
+
+  // Set up the signal handler.
+  struct sigaction Handler;
+  Handler.sa_handler = CrashRecoverySignalHandler;
+  Handler.sa_flags = 0;
+  sigemptyset(&Handler.sa_mask);
+
+  for (unsigned i = 0; i != NumSignals; ++i) {
+    sigaction(Signals[i], &Handler, &PrevActions[i]);
+  }
+}
+
+void CrashRecoveryContext::Disable() {
+  sys::ScopedLock L(gCrashRecoveryContexMutex);
+
+  if (!gCrashRecoveryEnabled)
+    return;
+
+  gCrashRecoveryEnabled = false;
+
+  // Restore the previous signal handlers.
+  for (unsigned i = 0; i != NumSignals; ++i)
+    sigaction(Signals[i], &PrevActions[i], 0);
+}
+
+#endif
+
+bool CrashRecoveryContext::RunSafely(void (*Fn)(void*), void *UserData) {
+  // If crash recovery is disabled, do nothing.
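// A hypothetical caller of RunSafely; DoRiskyWork and RunGuarded are made-up
// names, and this is a usage sketch rather than part of the patch.
#include "llvm/Support/CrashRecoveryContext.h"

static void DoRiskyWork(void *Data) {
  // ... code that may crash ...
  (void)Data;
}

static bool RunGuarded(void *Data) {
  llvm::CrashRecoveryContext::Enable();
  llvm::CrashRecoveryContext CRC;
  // Returns false if a signal fired and the handler longjmp'd back here.
  return CRC.RunSafely(DoRiskyWork, Data);
}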
+ if (gCrashRecoveryEnabled) { + assert(!Impl && "Crash recovery context already initialized!"); + CrashRecoveryContextImpl *CRCI = new CrashRecoveryContextImpl(this); + Impl = CRCI; + + if (setjmp(CRCI->JumpBuffer) != 0) { + return false; + } + } + + Fn(UserData); + return true; +} + +void CrashRecoveryContext::HandleCrash() { + CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl; + assert(CRCI && "Crash recovery context never initialized!"); + CRCI->HandleCrash(); +} + +const std::string &CrashRecoveryContext::getBacktrace() const { + CrashRecoveryContextImpl *CRC = (CrashRecoveryContextImpl *) Impl; + assert(CRC && "Crash recovery context never initialized!"); + assert(CRC->Failed && "No crash was detected!"); + return CRC->Backtrace; +} diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp index 7e7ca9debe9a7..0b7af3e5905be 100644 --- a/lib/Support/ErrorHandling.cpp +++ b/lib/Support/ErrorHandling.cpp @@ -18,8 +18,19 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/System/Signals.h" #include "llvm/System/Threading.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Config/config.h" #include <cassert> #include <cstdlib> + +#if defined(HAVE_UNISTD_H) +# include <unistd.h> +#endif +#if defined(_MSC_VER) +# include <io.h> +# include <fcntl.h> +#endif + using namespace llvm; using namespace std; @@ -39,19 +50,26 @@ void llvm::remove_fatal_error_handler() { ErrorHandler = 0; } -void llvm::report_fatal_error(const char *reason) { - report_fatal_error(Twine(reason)); +void llvm::report_fatal_error(const char *Reason) { + report_fatal_error(Twine(Reason)); } -void llvm::report_fatal_error(const std::string &reason) { - report_fatal_error(Twine(reason)); +void llvm::report_fatal_error(const std::string &Reason) { + report_fatal_error(Twine(Reason)); } -void llvm::report_fatal_error(const Twine &reason) { - if (!ErrorHandler) { - errs() << "LLVM ERROR: " << reason << "\n"; +void llvm::report_fatal_error(const Twine &Reason) { + if (ErrorHandler) { + ErrorHandler(ErrorHandlerUserData, Reason.str()); } else { - ErrorHandler(ErrorHandlerUserData, reason.str()); + // Blast the result out to stderr. We don't try hard to make sure this + // succeeds (e.g. handling EINTR) and we can't use errs() here because + // raw ostreams can call report_fatal_error. + SmallVector<char, 64> Buffer; + raw_svector_ostream OS(Buffer); + OS << "LLVM ERROR: " << Reason << "\n"; + StringRef MessageStr = OS.str(); + (void)::write(2, MessageStr.data(), MessageStr.size()); } // If we reached here, we are failing ungracefully. Run the interrupt handlers diff --git a/lib/Support/FoldingSet.cpp b/lib/Support/FoldingSet.cpp index b8dca334da494..29b5952208874 100644 --- a/lib/Support/FoldingSet.cpp +++ b/lib/Support/FoldingSet.cpp @@ -23,6 +23,37 @@ using namespace llvm; //===----------------------------------------------------------------------===// +// FoldingSetNodeIDRef Implementation + +/// ComputeHash - Compute a strong hash value for this FoldingSetNodeIDRef, +/// used to lookup the node in the FoldingSetImpl. +unsigned FoldingSetNodeIDRef::ComputeHash() const { + // This is adapted from SuperFastHash by Paul Hsieh. + unsigned Hash = static_cast<unsigned>(Size); + for (const unsigned *BP = Data, *E = BP+Size; BP != E; ++BP) { + unsigned Data = *BP; + Hash += Data & 0xFFFF; + unsigned Tmp = ((Data >> 16) << 11) ^ Hash; + Hash = (Hash << 16) ^ Tmp; + Hash += Hash >> 11; + } + + // Force "avalanching" of final 127 bits. 
+ Hash ^= Hash << 3; + Hash += Hash >> 5; + Hash ^= Hash << 4; + Hash += Hash >> 17; + Hash ^= Hash << 25; + Hash += Hash >> 6; + return Hash; +} + +bool FoldingSetNodeIDRef::operator==(FoldingSetNodeIDRef RHS) const { + if (Size != RHS.Size) return false; + return memcmp(Data, RHS.Data, Size*sizeof(*Data)) == 0; +} + +//===----------------------------------------------------------------------===// // FoldingSetNodeID Implementation /// Add* - Add various data types to Bit data. @@ -104,31 +135,19 @@ void FoldingSetNodeID::AddString(StringRef String) { /// ComputeHash - Compute a strong hash value for this FoldingSetNodeID, used to /// lookup the node in the FoldingSetImpl. unsigned FoldingSetNodeID::ComputeHash() const { - // This is adapted from SuperFastHash by Paul Hsieh. - unsigned Hash = static_cast<unsigned>(Bits.size()); - for (const unsigned *BP = &Bits[0], *E = BP+Bits.size(); BP != E; ++BP) { - unsigned Data = *BP; - Hash += Data & 0xFFFF; - unsigned Tmp = ((Data >> 16) << 11) ^ Hash; - Hash = (Hash << 16) ^ Tmp; - Hash += Hash >> 11; - } - - // Force "avalanching" of final 127 bits. - Hash ^= Hash << 3; - Hash += Hash >> 5; - Hash ^= Hash << 4; - Hash += Hash >> 17; - Hash ^= Hash << 25; - Hash += Hash >> 6; - return Hash; + return FoldingSetNodeIDRef(Bits.data(), Bits.size()).ComputeHash(); } /// operator== - Used to compare two nodes to each other. /// bool FoldingSetNodeID::operator==(const FoldingSetNodeID &RHS)const{ - if (Bits.size() != RHS.Bits.size()) return false; - return memcmp(&Bits[0], &RHS.Bits[0], Bits.size()*sizeof(Bits[0])) == 0; + return *this == FoldingSetNodeIDRef(RHS.Bits.data(), RHS.Bits.size()); +} + +/// operator== - Used to compare two nodes to each other. +/// +bool FoldingSetNodeID::operator==(FoldingSetNodeIDRef RHS) const { + return FoldingSetNodeIDRef(Bits.data(), Bits.size()) == RHS; } /// Intern - Copy this node's data to a memory region allocated from the @@ -168,10 +187,9 @@ static void **GetBucketPtr(void *NextInBucketPtr) { /// GetBucketFor - Hash the specified node ID and return the hash bucket for /// the specified ID. -static void **GetBucketFor(const FoldingSetNodeID &ID, - void **Buckets, unsigned NumBuckets) { +static void **GetBucketFor(unsigned Hash, void **Buckets, unsigned NumBuckets) { // NumBuckets is always a power of 2. - unsigned BucketNum = ID.ComputeHash() & (NumBuckets-1); + unsigned BucketNum = Hash & (NumBuckets-1); return Buckets + BucketNum; } @@ -219,7 +237,7 @@ void FoldingSetImpl::GrowHashTable() { NumNodes = 0; // Walk the old buckets, rehashing nodes into their new place. - FoldingSetNodeID ID; + FoldingSetNodeID TempID; for (unsigned i = 0; i != OldNumBuckets; ++i) { void *Probe = OldBuckets[i]; if (!Probe) continue; @@ -229,9 +247,10 @@ void FoldingSetImpl::GrowHashTable() { NodeInBucket->SetNextInBucket(0); // Insert the node into the new bucket, after recomputing the hash. 
- GetNodeProfile(ID, NodeInBucket); - InsertNode(NodeInBucket, GetBucketFor(ID, Buckets, NumBuckets)); - ID.clear(); + InsertNode(NodeInBucket, + GetBucketFor(ComputeNodeHash(NodeInBucket, TempID), + Buckets, NumBuckets)); + TempID.clear(); } } @@ -245,19 +264,18 @@ FoldingSetImpl::Node *FoldingSetImpl::FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos) { - void **Bucket = GetBucketFor(ID, Buckets, NumBuckets); + void **Bucket = GetBucketFor(ID.ComputeHash(), Buckets, NumBuckets); void *Probe = *Bucket; InsertPos = 0; - FoldingSetNodeID OtherID; + FoldingSetNodeID TempID; while (Node *NodeInBucket = GetNextPtr(Probe)) { - GetNodeProfile(OtherID, NodeInBucket); - if (OtherID == ID) + if (NodeEquals(NodeInBucket, ID, TempID)) return NodeInBucket; + TempID.clear(); Probe = NodeInBucket->getNextInBucket(); - OtherID.clear(); } // Didn't find the node, return null with the bucket as the InsertPos. @@ -273,9 +291,8 @@ void FoldingSetImpl::InsertNode(Node *N, void *InsertPos) { // Do we need to grow the hashtable? if (NumNodes+1 > NumBuckets*2) { GrowHashTable(); - FoldingSetNodeID ID; - GetNodeProfile(ID, N); - InsertPos = GetBucketFor(ID, Buckets, NumBuckets); + FoldingSetNodeID TempID; + InsertPos = GetBucketFor(ComputeNodeHash(N, TempID), Buckets, NumBuckets); } ++NumNodes; @@ -341,7 +358,7 @@ bool FoldingSetImpl::RemoveNode(Node *N) { /// instead. FoldingSetImpl::Node *FoldingSetImpl::GetOrInsertNode(FoldingSetImpl::Node *N) { FoldingSetNodeID ID; - GetNodeProfile(ID, N); + GetNodeProfile(N, ID); void *IP; if (Node *E = FindNodeOrInsertPos(ID, IP)) return E; diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp index a99ab2f30df0e..3c8a10849d149 100644 --- a/lib/Support/PrettyStackTrace.cpp +++ b/lib/Support/PrettyStackTrace.cpp @@ -72,7 +72,7 @@ asm(".desc ___crashreporter_info__, 0x10"); /// CrashHandler - This callback is run if a fatal signal is delivered to the /// process, it prints the pretty stack trace. -static void CrashHandler(void *Cookie) { +static void CrashHandler(void *) { #ifndef __APPLE__ // On non-apple systems, just emit the crash stack trace to stderr. PrintCurStackTrace(errs()); @@ -89,7 +89,8 @@ static void CrashHandler(void *Cookie) { #ifndef HAVE_CRASHREPORTERCLIENT_H __crashreporter_info__ = strdup(std::string(TmpStr.str()).c_str()); #else - CRSetCrashLogMessage(std::string(TmpStr.str()).c_str()); + // Cast to void to avoid warning. + (void)CRSetCrashLogMessage(std::string(TmpStr.str()).c_str()); #endif errs() << TmpStr.str(); } diff --git a/lib/Support/SlowOperationInformer.cpp b/lib/Support/SlowOperationInformer.cpp deleted file mode 100644 index b4e9430e5fdfe..0000000000000 --- a/lib/Support/SlowOperationInformer.cpp +++ /dev/null @@ -1,67 +0,0 @@ -//===-- SlowOperationInformer.cpp - Keep the user informed ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the SlowOperationInformer class for the LLVM debugger. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/SlowOperationInformer.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/System/Alarm.h" -#include <sstream> -#include <cassert> -using namespace llvm; - -SlowOperationInformer::SlowOperationInformer(const std::string &Name) - : OperationName(Name), LastPrintAmount(0) { - sys::SetupAlarm(1); -} - -SlowOperationInformer::~SlowOperationInformer() { - sys::TerminateAlarm(); - if (LastPrintAmount) { - // If we have printed something, make _sure_ we print the 100% amount, and - // also print a newline. - outs() << std::string(LastPrintAmount, '\b') << "Progress " - << OperationName << ": 100% \n"; - } -} - -/// progress - Clients should periodically call this method when they are in -/// an exception-safe state. The Amount variable should indicate how far -/// along the operation is, given in 1/10ths of a percent (in other words, -/// Amount should range from 0 to 1000). -bool SlowOperationInformer::progress(unsigned Amount) { - int status = sys::AlarmStatus(); - if (status == -1) { - outs() << "\n"; - LastPrintAmount = 0; - return true; - } - - // If we haven't spent enough time in this operation to warrant displaying the - // progress bar, don't do so yet. - if (status == 0) - return false; - - // Delete whatever we printed last time. - std::string ToPrint = std::string(LastPrintAmount, '\b'); - - std::ostringstream OS; - OS << "Progress " << OperationName << ": " << Amount/10; - if (unsigned Rem = Amount % 10) - OS << "." << Rem << "%"; - else - OS << "% "; - - LastPrintAmount = OS.str().size(); - outs() << ToPrint+OS.str(); - outs().flush(); - return false; -} diff --git a/lib/Support/SmallVector.cpp b/lib/Support/SmallVector.cpp index 2e17af864155a..a89f14957635e 100644 --- a/lib/Support/SmallVector.cpp +++ b/lib/Support/SmallVector.cpp @@ -18,7 +18,7 @@ using namespace llvm; /// on POD-like datatypes and is out of line to reduce code duplication. void SmallVectorBase::grow_pod(size_t MinSizeInBytes, size_t TSize) { size_t CurSizeBytes = size_in_bytes(); - size_t NewCapacityInBytes = 2 * capacity_in_bytes(); + size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow. if (NewCapacityInBytes < MinSizeInBytes) NewCapacityInBytes = MinSizeInBytes; diff --git a/lib/Support/Statistic.cpp b/lib/Support/Statistic.cpp index 7d5f65af28422..e32ab74a2d4c4 100644 --- a/lib/Support/Statistic.cpp +++ b/lib/Support/Statistic.cpp @@ -44,7 +44,7 @@ Enabled("stats", cl::desc("Enable statistics output from program")); namespace { /// StatisticInfo - This class is used in a ManagedStatic so that it is created -/// on demand (when the first statistic is bumped) and destroyed only when +/// on demand (when the first statistic is bumped) and destroyed only when /// llvm_shutdown is called. We print statistics from the destructor. class StatisticInfo { std::vector<const Statistic*> Stats; @@ -52,7 +52,7 @@ class StatisticInfo { friend void llvm::PrintStatistics(raw_ostream &OS); public: ~StatisticInfo(); - + void addStatistic(const Statistic *S) { Stats.push_back(S); } @@ -71,7 +71,7 @@ void Statistic::RegisterStatistic() { if (!Initialized) { if (Enabled) StatInfo->addStatistic(this); - + sys::MemoryFence(); // Remember we have been registered. 
Initialized = true; @@ -84,7 +84,7 @@ struct NameCompare { bool operator()(const Statistic *LHS, const Statistic *RHS) const { int Cmp = std::strcmp(LHS->getName(), RHS->getName()); if (Cmp != 0) return Cmp < 0; - + // Secondary key is the description. return std::strcmp(LHS->getDesc(), RHS->getDesc()) < 0; } @@ -112,7 +112,7 @@ void llvm::PrintStatistics(raw_ostream &OS) { MaxNameLen = std::max(MaxNameLen, (unsigned)std::strlen(Stats.Stats[i]->getName())); } - + // Sort the fields by name. std::stable_sort(Stats.Stats.begin(), Stats.Stats.end(), NameCompare()); @@ -120,7 +120,7 @@ void llvm::PrintStatistics(raw_ostream &OS) { OS << "===" << std::string(73, '-') << "===\n" << " ... Statistics Collected ...\n" << "===" << std::string(73, '-') << "===\n\n"; - + // Print all of the statistics. for (size_t i = 0, e = Stats.Stats.size(); i != e; ++i) { std::string CountStr = utostr(Stats.Stats[i]->getValue()); @@ -129,7 +129,7 @@ void llvm::PrintStatistics(raw_ostream &OS) { << std::string(MaxNameLen-std::strlen(Stats.Stats[i]->getName()), ' ') << " - " << Stats.Stats[i]->getDesc() << "\n"; } - + OS << '\n'; // Flush the output stream. OS.flush(); diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp index ca0f518a88b62..46f26b242aac3 100644 --- a/lib/Support/StringRef.cpp +++ b/lib/Support/StringRef.cpp @@ -9,6 +9,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/APInt.h" +#include <bitset> using namespace llvm; @@ -30,14 +31,14 @@ static bool ascii_isdigit(char x) { /// compare_lower - Compare strings, ignoring case. int StringRef::compare_lower(StringRef RHS) const { for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { - char LHC = ascii_tolower(Data[I]); - char RHC = ascii_tolower(RHS.Data[I]); + unsigned char LHC = ascii_tolower(Data[I]); + unsigned char RHC = ascii_tolower(RHS.Data[I]); if (LHC != RHC) return LHC < RHC ? -1 : 1; } if (Length == RHS.Length) - return 0; + return 0; return Length < RHS.Length ? -1 : 1; } @@ -58,10 +59,10 @@ int StringRef::compare_numeric(StringRef RHS) const { break; } } - return Data[I] < RHS.Data[I] ? -1 : 1; + return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; } if (Length == RHS.Length) - return 0; + return 0; return Length < RHS.Length ? -1 : 1; } @@ -153,11 +154,15 @@ size_t StringRef::rfind(StringRef Str) const { /// find_first_of - Find the first character in the string that is in \arg /// Chars, or npos if not found. /// -/// Note: O(size() * Chars.size()) +/// Note: O(size() + Chars.size()) StringRef::size_type StringRef::find_first_of(StringRef Chars, size_t From) const { + std::bitset<1 << CHAR_BIT> CharBits; + for (size_type i = 0; i != Chars.size(); ++i) + CharBits.set((unsigned char)Chars[i]); + for (size_type i = min(From, Length), e = Length; i != e; ++i) - if (Chars.find(Data[i]) != npos) + if (CharBits.test((unsigned char)Data[i])) return i; return npos; } @@ -174,11 +179,15 @@ StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { /// find_first_not_of - Find the first character in the string that is not /// in the string \arg Chars, or npos if not found. 
/// -/// Note: O(size() * Chars.size()) +/// Note: O(size() + Chars.size()) StringRef::size_type StringRef::find_first_not_of(StringRef Chars, size_t From) const { + std::bitset<1 << CHAR_BIT> CharBits; + for (size_type i = 0; i != Chars.size(); ++i) + CharBits.set((unsigned char)Chars[i]); + for (size_type i = min(From, Length), e = Length; i != e; ++i) - if (Chars.find(Data[i]) == npos) + if (!CharBits.test((unsigned char)Data[i])) return i; return npos; } diff --git a/lib/Support/SystemUtils.cpp b/lib/Support/SystemUtils.cpp index 299032f187156..c8b260c2e3dd9 100644 --- a/lib/Support/SystemUtils.cpp +++ b/lib/Support/SystemUtils.cpp @@ -49,6 +49,16 @@ sys::Path llvm::FindExecutable(const std::string &ExeName, Result.appendComponent(ExeName); if (Result.canExecute()) return Result; + // If the path is absolute (and it usually is), call FindProgramByName to + // allow it to try platform-specific logic, such as appending a .exe suffix + // on Windows. Don't do this if we somehow have a relative path, because + // we don't want to go searching the PATH and accidentally find an unrelated + // version of the program. + if (Result.isAbsolute()) { + Result = sys::Program::FindProgramByName(Result.str()); + if (!Result.empty()) + return Result; + } } return sys::Path(); diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index 6a70449b56dc7..3a95b65e69000 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -221,121 +221,238 @@ const char *Triple::getArchNameForAssembler() { // -void Triple::Parse() const { - assert(!isInitialized() && "Invalid parse call."); - - StringRef ArchName = getArchName(); - StringRef VendorName = getVendorName(); - StringRef OSName = getOSName(); - +Triple::ArchType Triple::ParseArch(StringRef ArchName) { if (ArchName.size() == 4 && ArchName[0] == 'i' && ArchName[2] == '8' && ArchName[3] == '6' && ArchName[1] - '3' < 6) // i[3-9]86 - Arch = x86; + return x86; else if (ArchName == "amd64" || ArchName == "x86_64") - Arch = x86_64; + return x86_64; else if (ArchName == "bfin") - Arch = bfin; + return bfin; else if (ArchName == "pic16") - Arch = pic16; + return pic16; else if (ArchName == "powerpc") - Arch = ppc; + return ppc; else if ((ArchName == "powerpc64") || (ArchName == "ppu")) - Arch = ppc64; + return ppc64; else if (ArchName == "mblaze") - Arch = mblaze; + return mblaze; else if (ArchName == "arm" || ArchName.startswith("armv") || ArchName == "xscale") - Arch = arm; + return arm; else if (ArchName == "thumb" || ArchName.startswith("thumbv")) - Arch = thumb; + return thumb; else if (ArchName.startswith("alpha")) - Arch = alpha; + return alpha; else if (ArchName == "spu" || ArchName == "cellspu") - Arch = cellspu; + return cellspu; else if (ArchName == "msp430") - Arch = msp430; + return msp430; else if (ArchName == "mips" || ArchName == "mipsallegrex") - Arch = mips; + return mips; else if (ArchName == "mipsel" || ArchName == "mipsallegrexel" || ArchName == "psp") - Arch = mipsel; + return mipsel; else if (ArchName == "sparc") - Arch = sparc; + return sparc; else if (ArchName == "sparcv9") - Arch = sparcv9; + return sparcv9; else if (ArchName == "s390x") - Arch = systemz; + return systemz; else if (ArchName == "tce") - Arch = tce; + return tce; else if (ArchName == "xcore") - Arch = xcore; + return xcore; else - Arch = UnknownArch; - - - // Handle some exceptional cases where the OS / environment components are - // stuck into the vendor field. 
- if (StringRef(getTriple()).count('-') == 1) { - StringRef VendorName = getVendorName(); - - if (VendorName.startswith("mingw32")) { // 'i386-mingw32', etc. - Vendor = PC; - OS = MinGW32; - return; - } - - // arm-elf is another example, but we don't currently parse anything about - // the environment. - } + return UnknownArch; +} +Triple::VendorType Triple::ParseVendor(StringRef VendorName) { if (VendorName == "apple") - Vendor = Apple; + return Apple; else if (VendorName == "pc") - Vendor = PC; + return PC; else - Vendor = UnknownVendor; + return UnknownVendor; +} +Triple::OSType Triple::ParseOS(StringRef OSName) { if (OSName.startswith("auroraux")) - OS = AuroraUX; + return AuroraUX; else if (OSName.startswith("cygwin")) - OS = Cygwin; + return Cygwin; else if (OSName.startswith("darwin")) - OS = Darwin; + return Darwin; else if (OSName.startswith("dragonfly")) - OS = DragonFly; + return DragonFly; else if (OSName.startswith("freebsd")) - OS = FreeBSD; + return FreeBSD; else if (OSName.startswith("linux")) - OS = Linux; + return Linux; else if (OSName.startswith("lv2")) - OS = Lv2; + return Lv2; else if (OSName.startswith("mingw32")) - OS = MinGW32; + return MinGW32; else if (OSName.startswith("mingw64")) - OS = MinGW64; + return MinGW64; else if (OSName.startswith("netbsd")) - OS = NetBSD; + return NetBSD; else if (OSName.startswith("openbsd")) - OS = OpenBSD; + return OpenBSD; else if (OSName.startswith("psp")) - OS = Psp; + return Psp; else if (OSName.startswith("solaris")) - OS = Solaris; + return Solaris; else if (OSName.startswith("win32")) - OS = Win32; + return Win32; else if (OSName.startswith("haiku")) - OS = Haiku; + return Haiku; else if (OSName.startswith("minix")) - OS = Minix; + return Minix; else - OS = UnknownOS; + return UnknownOS; +} + +void Triple::Parse() const { + assert(!isInitialized() && "Invalid parse call."); + + Arch = ParseArch(getArchName()); + Vendor = ParseVendor(getVendorName()); + OS = ParseOS(getOSName()); assert(isInitialized() && "Failed to initialize!"); } +std::string Triple::normalize(StringRef Str) { + // Parse into components. + SmallVector<StringRef, 4> Components; + for (size_t First = 0, Last = 0; Last != StringRef::npos; First = Last + 1) { + Last = Str.find('-', First); + Components.push_back(Str.slice(First, Last)); + } + + // If the first component corresponds to a known architecture, preferentially + // use it for the architecture. If the second component corresponds to a + // known vendor, preferentially use it for the vendor, etc. This avoids silly + // component movement when a component parses as (eg) both a valid arch and a + // valid os. + ArchType Arch = UnknownArch; + if (Components.size() > 0) + Arch = ParseArch(Components[0]); + VendorType Vendor = UnknownVendor; + if (Components.size() > 1) + Vendor = ParseVendor(Components[1]); + OSType OS = UnknownOS; + if (Components.size() > 2) + OS = ParseOS(Components[2]); + + // Note which components are already in their final position. These will not + // be moved. + bool Found[3]; + Found[0] = Arch != UnknownArch; + Found[1] = Vendor != UnknownVendor; + Found[2] = OS != UnknownOS; + + // If they are not there already, permute the components into their canonical + // positions by seeing if they parse as a valid architecture, and if so moving + // the component to the architecture position etc. + for (unsigned Pos = 0; Pos != 3; ++Pos) { + if (Found[Pos]) + continue; // Already in the canonical position. 
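// The expected effect of this permutation loop, using the cases cited in the
// comments below (a hypothetical check, assuming this patch is applied):
#include "llvm/ADT/Triple.h"
#include <cassert>

static void checkNormalize() {
  assert(llvm::Triple::normalize("a-b-i386") == "i386-a-b"); // arch to front
  assert(llvm::Triple::normalize("pc-a") == "-pc-a"); // vendor to second slot
  assert(llvm::Triple::normalize("i386-pc-linux") == "i386-pc-linux");
}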
+ + for (unsigned Idx = 0; Idx != Components.size(); ++Idx) { + // Do not reparse any components that already matched. + if (Idx < 3 && Found[Idx]) + continue; + + // Does this component parse as valid for the target position? + bool Valid = false; + StringRef Comp = Components[Idx]; + switch (Pos) { + default: + assert(false && "unexpected component type!"); + case 0: + Arch = ParseArch(Comp); + Valid = Arch != UnknownArch; + break; + case 1: + Vendor = ParseVendor(Comp); + Valid = Vendor != UnknownVendor; + break; + case 2: + OS = ParseOS(Comp); + Valid = OS != UnknownOS; + break; + } + if (!Valid) + continue; // Nope, try the next component. + + // Move the component to the target position, pushing any non-fixed + // components that are in the way to the right. This tends to give + // good results in the common cases of a forgotten vendor component + // or a wrongly positioned environment. + if (Pos < Idx) { + // Insert left, pushing the existing components to the right. For + // example, a-b-i386 -> i386-a-b when moving i386 to the front. + StringRef CurrentComponent(""); // The empty component. + // Replace the component we are moving with an empty component. + std::swap(CurrentComponent, Components[Idx]); + // Insert the component being moved at Pos, displacing any existing + // components to the right. + for (unsigned i = Pos; !CurrentComponent.empty(); ++i) { + // Skip over any fixed components. + while (i < 3 && Found[i]) ++i; + // Place the component at the new position, getting the component + // that was at this position - it will be moved right. + std::swap(CurrentComponent, Components[i]); + } + } else if (Pos > Idx) { + // Push right by inserting empty components until the component at Idx + // reaches the target position Pos. For example, pc-a -> -pc-a when + // moving pc to the second position. + do { + // Insert one empty component at Idx. + StringRef CurrentComponent(""); // The empty component. + for (unsigned i = Idx; i < Components.size(); ++i) { + // Skip over any fixed components. + while (i < 3 && Found[i]) ++i; + // Place the component at the new position, getting the component + // that was at this position - it will be moved right. + std::swap(CurrentComponent, Components[i]); + // If it was placed on top of an empty component then we are done. + if (CurrentComponent.empty()) + break; + } + // The last component was pushed off the end - append it. + if (!CurrentComponent.empty()) + Components.push_back(CurrentComponent); + + // Advance Idx to the component's new position. + while (++Idx < 3 && Found[Idx]) {} + } while (Idx < Pos); // Add more until the final position is reached. + } + assert(Pos < Components.size() && Components[Pos] == Comp && + "Component moved wrong!"); + Found[Pos] = true; + break; + } + } + + // Special case logic goes here. At this point Arch, Vendor and OS have the + // correct values for the computed components. + + // Stick the corrected components back together to form the normalized string. 
+ std::string Normalized; + for (unsigned i = 0, e = Components.size(); i != e; ++i) { + if (i) Normalized += '-'; + Normalized += Components[i]; + } + return Normalized; +} + StringRef Triple::getArchName() const { return StringRef(Data).split('-').first; // Isolate first component } diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 8054ae63688c9..dba46df362566 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -19,6 +19,7 @@ #include "llvm/Config/config.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/System/Signals.h" #include "llvm/ADT/STLExtras.h" #include <cctype> #include <cerrno> @@ -56,13 +57,6 @@ raw_ostream::~raw_ostream() { if (BufferMode == InternalBuffer) delete [] OutBufStart; - - // If there are any pending errors, report them now. Clients wishing - // to avoid report_fatal_error calls should check for errors with - // has_error() and clear the error flag with clear_error() before - // destructing raw_ostream objects which may have errors. - if (Error) - report_fatal_error("IO failure on output stream."); } // An out of line virtual method to provide a home for the class vtable. @@ -143,9 +137,10 @@ raw_ostream &raw_ostream::operator<<(unsigned long long N) { } raw_ostream &raw_ostream::operator<<(long long N) { - if (N < 0) { + if (N < 0) { *this << '-'; - N = -N; + // Avoid undefined behavior on INT64_MIN with a cast. + N = -(unsigned long long)N; } return this->operator<<(static_cast<unsigned long long>(N)); @@ -368,7 +363,7 @@ void format_object_base::home() { /// stream should be immediately destroyed; the string will be empty /// if no error occurred. raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo, - unsigned Flags) : pos(0) { + unsigned Flags) : Error(false), pos(0) { assert(Filename != 0 && "Filename is null"); // Verify that we don't have both "append" and "excl". assert((!(Flags & F_Excl) || !(Flags & F_Append)) && @@ -376,14 +371,17 @@ raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo, ErrorInfo.clear(); - // Handle "-" as stdout. + // Handle "-" as stdout. Note that when we do this, we consider ourself + // the owner of stdout. This means that we can do things like close the + // file descriptor when we're done and set the "binary" flag globally. if (Filename[0] == '-' && Filename[1] == 0) { FD = STDOUT_FILENO; // If user requested binary then put stdout into binary mode if // possible. if (Flags & F_Binary) sys::Program::ChangeStdoutToBinary(); - ShouldClose = false; + // Close stdout when we're done, to detect any output errors. + ShouldClose = true; return; } @@ -413,14 +411,22 @@ raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo, } raw_fd_ostream::~raw_fd_ostream() { - if (FD < 0) return; - flush(); - if (ShouldClose) - while (::close(FD) != 0) - if (errno != EINTR) { - error_detected(); - break; - } + if (FD >= 0) { + flush(); + if (ShouldClose) + while (::close(FD) != 0) + if (errno != EINTR) { + error_detected(); + break; + } + } + + // If there are any pending errors, report them now. Clients wishing + // to avoid report_fatal_error calls should check for errors with + // has_error() and clear the error flag with clear_error() before + // destructing raw_ostream objects which may have errors. 
+  if (has_error())
+    report_fatal_error("IO failure on output stream.");
 }
 
@@ -534,30 +540,24 @@ bool raw_fd_ostream::is_displayed() const {
 }
 
 //===----------------------------------------------------------------------===//
-//  raw_stdout/err_ostream
+//  outs(), errs(), nulls()
 //===----------------------------------------------------------------------===//
 
-// Set buffer settings to model stdout and stderr behavior.
-// Set standard error to be unbuffered by default.
-raw_stdout_ostream::raw_stdout_ostream():raw_fd_ostream(STDOUT_FILENO, false) {}
-raw_stderr_ostream::raw_stderr_ostream():raw_fd_ostream(STDERR_FILENO, false,
-                                                        true) {}
-
-// An out of line virtual method to provide a home for the class vtable.
-void raw_stdout_ostream::handle() {}
-void raw_stderr_ostream::handle() {}
-
 /// outs() - This returns a reference to a raw_ostream for standard output.
 /// Use it like: outs() << "foo" << "bar";
 raw_ostream &llvm::outs() {
-  static raw_stdout_ostream S;
+  // Set buffer settings to model stdout behavior.
+  // Delete the file descriptor when the program exits, forcing error
+  // detection. If you don't want this behavior, don't use outs().
+  static raw_fd_ostream S(STDOUT_FILENO, true);
   return S;
 }
 
 /// errs() - This returns a reference to a raw_ostream for standard error.
 /// Use it like: errs() << "foo" << "bar";
 raw_ostream &llvm::errs() {
-  static raw_stderr_ostream S;
+  // Set standard error to be unbuffered by default.
+  static raw_fd_ostream S(STDERR_FILENO, false, true);
   return S;
 }
 
@@ -665,3 +665,34 @@ void raw_null_ostream::write_impl(const char *Ptr, size_t Size) {
 uint64_t raw_null_ostream::current_pos() const {
   return 0;
 }
+
+//===----------------------------------------------------------------------===//
+//  tool_output_file
+//===----------------------------------------------------------------------===//
+
+tool_output_file::CleanupInstaller::CleanupInstaller(const char *filename)
+  : Filename(filename), Keep(false) {
+  // Arrange for the file to be deleted if the process is killed.
+  if (Filename != "-")
+    sys::RemoveFileOnSignal(sys::Path(Filename));
+}
+
+tool_output_file::CleanupInstaller::~CleanupInstaller() {
+  // Delete the file if the client hasn't told us not to.
+  if (!Keep && Filename != "-")
+    sys::Path(Filename).eraseFromDisk();
+
+  // Ok, the file is successfully written and closed, or deleted. There's no
+  // further need to clean it up on signals.
+  if (Filename != "-")
+    sys::DontRemoveFileOnSignal(sys::Path(Filename));
+}
+
+tool_output_file::tool_output_file(const char *filename, std::string &ErrorInfo,
+                                   unsigned Flags)
+  : Installer(filename),
+    OS(filename, ErrorInfo, Flags) {
+  // If open fails, no cleanup is needed.
+  if (!ErrorInfo.empty())
+    Installer.Keep = true;
+}
diff --git a/lib/System/DynamicLibrary.cpp b/lib/System/DynamicLibrary.cpp
index 6f6890c06c499..660db492d6b9a 100644
--- a/lib/System/DynamicLibrary.cpp
+++ b/lib/System/DynamicLibrary.cpp
@@ -70,6 +70,12 @@ bool DynamicLibrary::LoadLibraryPermanently(const char *Filename,
     if (ErrMsg) *ErrMsg = dlerror();
     return true;
   }
+#ifdef __CYGWIN__
+  // On Cygwin, the handle returned by dlopen(NULL, RTLD_GLOBAL) searches
+  // symbols only in the main executable, so use RTLD_DEFAULT instead.
+ if (Filename == NULL) + H = RTLD_DEFAULT; +#endif if (OpenedHandles == 0) OpenedHandles = new std::vector<void *>(); OpenedHandles->push_back(H); diff --git a/lib/System/Path.cpp b/lib/System/Path.cpp index 1235257b27e20..4445c667d86e9 100644 --- a/lib/System/Path.cpp +++ b/lib/System/Path.cpp @@ -61,7 +61,7 @@ sys::IdentifyFileType(const char *magic, unsigned length) { if (memcmp(magic,"!<arch>\n",8) == 0) return Archive_FileType; break; - + case '\177': if (magic[1] == 'E' && magic[2] == 'L' && magic[3] == 'F') { if (length >= 18 && magic[17] == 0) @@ -76,11 +76,11 @@ sys::IdentifyFileType(const char *magic, unsigned length) { break; case 0xCA: - if (magic[1] == char(0xFE) && magic[2] == char(0xBA) && + if (magic[1] == char(0xFE) && magic[2] == char(0xBA) && magic[3] == char(0xBE)) { - // This is complicated by an overlap with Java class files. + // This is complicated by an overlap with Java class files. // See the Mach-O section in /usr/share/file/magic for details. - if (length >= 8 && magic[7] < 43) + if (length >= 8 && magic[7] < 43) // FIXME: Universal Binary of any type. return Mach_O_DynamicallyLinkedSharedLib_FileType; } @@ -89,18 +89,18 @@ sys::IdentifyFileType(const char *magic, unsigned length) { case 0xFE: case 0xCE: { uint16_t type = 0; - if (magic[0] == char(0xFE) && magic[1] == char(0xED) && + if (magic[0] == char(0xFE) && magic[1] == char(0xED) && magic[2] == char(0xFA) && magic[3] == char(0xCE)) { /* Native endian */ if (length >= 16) type = magic[14] << 8 | magic[15]; - } else if (magic[0] == char(0xCE) && magic[1] == char(0xFA) && + } else if (magic[0] == char(0xCE) && magic[1] == char(0xFA) && magic[2] == char(0xED) && magic[3] == char(0xFE)) { /* Reverse endian */ if (length >= 14) type = magic[13] << 8 | magic[12]; } switch (type) { - default: break; - case 1: return Mach_O_Object_FileType; + default: break; + case 1: return Mach_O_Object_FileType; case 2: return Mach_O_Executable_FileType; case 3: return Mach_O_FixedVirtualMemorySharedLib_FileType; case 4: return Mach_O_Core_FileType; @@ -219,38 +219,38 @@ static StringRef getDirnameCharSep(StringRef path, const char *Sep) { "Sep must be a 1-character string literal."); if (path.empty()) return "."; - + // If the path is all slashes, return a single slash. // Otherwise, remove all trailing slashes. - + signed pos = static_cast<signed>(path.size()) - 1; - + while (pos >= 0 && path[pos] == Sep[0]) --pos; - + if (pos < 0) return path[0] == Sep[0] ? Sep : "."; - + // Any slashes left? signed i = 0; - + while (i < pos && path[i] != Sep[0]) ++i; - + if (i == pos) // No slashes? Return "." return "."; - - // There is at least one slash left. Remove all trailing non-slashes. + + // There is at least one slash left. Remove all trailing non-slashes. while (pos >= 0 && path[pos] != Sep[0]) --pos; - + // Remove any trailing slashes. while (pos >= 0 && path[pos] == Sep[0]) --pos; - + if (pos < 0) return path[0] == Sep[0] ? 
Sep : "."; - + return path.substr(0, pos+1); } diff --git a/lib/System/RWMutex.cpp b/lib/System/RWMutex.cpp index 5faf220eb9168..deb04709d829c 100644 --- a/lib/System/RWMutex.cpp +++ b/lib/System/RWMutex.cpp @@ -71,23 +71,9 @@ RWMutexImpl::RWMutexImpl() bzero(rwlock, sizeof(pthread_rwlock_t)); #endif - pthread_rwlockattr_t attr; - - // Initialize the rwlock attributes - int errorcode = pthread_rwlockattr_init(&attr); - assert(errorcode == 0); - -#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__) && !defined(__DragonFly__) - // Make it a process local rwlock - errorcode = pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_PRIVATE); -#endif - // Initialize the rwlock - errorcode = pthread_rwlock_init(rwlock, &attr); - assert(errorcode == 0); - - // Destroy the attributes - errorcode = pthread_rwlockattr_destroy(&attr); + int errorcode = pthread_rwlock_init(rwlock, NULL); + (void)errorcode; assert(errorcode == 0); // Assign the data member diff --git a/lib/System/ThreadLocal.cpp b/lib/System/ThreadLocal.cpp index e7054b5281471..f6a55a1c0b9b1 100644 --- a/lib/System/ThreadLocal.cpp +++ b/lib/System/ThreadLocal.cpp @@ -27,6 +27,7 @@ ThreadLocalImpl::ThreadLocalImpl() { } ThreadLocalImpl::~ThreadLocalImpl() { } void ThreadLocalImpl::setInstance(const void* d) { data = const_cast<void*>(d);} const void* ThreadLocalImpl::getInstance() { return data; } +void ThreadLocalImpl::removeInstance() { data = 0; } } #else @@ -67,6 +68,10 @@ const void* ThreadLocalImpl::getInstance() { return pthread_getspecific(*key); } +void ThreadLocalImpl::removeInstance() { + setInstance(0); +} + } #elif defined(LLVM_ON_UNIX) diff --git a/lib/System/Unix/Path.inc b/lib/System/Unix/Path.inc index bc104a32a3aea..47e4d1ac3c6b5 100644 --- a/lib/System/Unix/Path.inc +++ b/lib/System/Unix/Path.inc @@ -276,20 +276,20 @@ Path::GetCurrentDirectory() { char pathname[MAXPATHLEN]; if (!getcwd(pathname,MAXPATHLEN)) { assert (false && "Could not query current working directory."); - return Path(""); + return Path(); } return Path(pathname); } -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__minix) static int test_dir(char buf[PATH_MAX], char ret[PATH_MAX], const char *dir, const char *bin) { struct stat sb; - snprintf(buf, PATH_MAX, "%s//%s", dir, bin); + snprintf(buf, PATH_MAX, "%s/%s", dir, bin); if (realpath(buf, ret) == NULL) return (1); if (stat(buf, &sb) != 0) @@ -334,7 +334,7 @@ getprogpath(char ret[PATH_MAX], const char *bin) free(pv); return (NULL); } -#endif // __FreeBSD__ +#endif // __FreeBSD__ || __NetBSD__ /// GetMainExecutable - Return the path to the main executable, given the /// value of argv[0] from program startup. 
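The hunk below extends GetMainExecutable's fallback to NetBSD and Minix. Where the OS offers no /proc/self/exe-style query, the getprogpath scheme from the hunk above resolves argv[0] by hand: canonicalize absolute paths, resolve relative ones against the working directory, and otherwise scan $PATH. Here is a self-contained sketch of the same approach, with hypothetical names (testDir, findProgPath) rather than the patch's actual code, assuming only POSIX realpath/stat/access:

    #include <climits>    // PATH_MAX
    #include <cstdio>     // snprintf, printf
    #include <cstdlib>    // getenv, realpath
    #include <cstring>    // strchr
    #include <string>
    #include <sys/stat.h> // stat, S_ISREG
    #include <unistd.h>   // access, X_OK

    // Is `path` an executable regular file?
    static bool isExecutable(const char *path) {
      struct stat sb;
      return stat(path, &sb) == 0 && S_ISREG(sb.st_mode) &&
             access(path, X_OK) == 0;
    }

    // Canonicalize `dir/bin` into `ret` and check that it is executable; this
    // mirrors test_dir above, including the single-slash separator the patch
    // fixes.
    static bool testDir(char ret[PATH_MAX], const char *dir, const char *bin) {
      char buf[PATH_MAX];
      snprintf(buf, PATH_MAX, "%s/%s", dir, bin);
      return realpath(buf, ret) != NULL && isExecutable(ret);
    }

    // Resolve argv[0] the way getprogpath does: absolute paths directly,
    // relative paths against the working directory, bare names via $PATH.
    static bool findProgPath(char ret[PATH_MAX], const char *argv0) {
      if (argv0[0] == '/')
        return realpath(argv0, ret) != NULL && isExecutable(ret);
      if (strchr(argv0, '/'))
        return testDir(ret, ".", argv0);
      const char *p = getenv("PATH");
      if (!p)
        return false;
      std::string path(p);
      for (size_t start = 0; start <= path.size();) {
        size_t end = path.find(':', start);
        if (end == std::string::npos) end = path.size();
        std::string dir = path.substr(start, end - start);
        if (!dir.empty() && testDir(ret, dir.c_str(), argv0))
          return true;
        start = end + 1;
      }
      return false;
    }

    int main(int argc, char **argv) {
      char resolved[PATH_MAX];
      if (argc > 0 && findProgPath(resolved, argv[0]))
        printf("%s\n", resolved);
      return 0;
    }

The $PATH scan is only trustworthy because bare names are searched last; as the SystemUtils.cpp comment earlier in this patch notes, searching PATH for a name you already resolved risks finding an unrelated program.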
@@ -350,7 +350,7 @@ Path Path::GetMainExecutable(const char *argv0, void *MainAddr) { if (realpath(exe_path, link_path)) return Path(std::string(link_path)); } -#elif defined(__FreeBSD__) +#elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__minix) char exe_path[PATH_MAX]; if (getprogpath(exe_path, argv0) != NULL) @@ -408,7 +408,7 @@ Path::getSuffix() const { std::string::size_type dot = path.rfind('.'); if (dot == std::string::npos || dot < slash) - return StringRef(""); + return StringRef(); else return StringRef(path).substr(dot + 1); } diff --git a/lib/System/Unix/Signals.inc b/lib/System/Unix/Signals.inc index 1e74647e5fdcb..7b7c43efc7864 100644 --- a/lib/System/Unix/Signals.inc +++ b/lib/System/Unix/Signals.inc @@ -182,6 +182,16 @@ bool llvm::sys::RemoveFileOnSignal(const sys::Path &Filename, return false; } +// DontRemoveFileOnSignal - The public API +void llvm::sys::DontRemoveFileOnSignal(const sys::Path &Filename) { + SignalsMutex.acquire(); + std::vector<sys::Path>::reverse_iterator I = + std::find(FilesToRemove.rbegin(), FilesToRemove.rend(), Filename); + if (I != FilesToRemove.rend()) + FilesToRemove.erase(I.base()-1); + SignalsMutex.release(); +} + /// AddSignalHandler - Add a function to be called when a signal is delivered /// to the process. The handler can have a cookie passed to it to identify /// what instance of the handler it is. @@ -253,3 +263,37 @@ void llvm::sys::PrintStackTraceOnErrorSignal() { AddSignalHandler(PrintStackTrace, 0); } + +/***/ + +// On Darwin, raise sends a signal to the main thread instead of the current +// thread. This has the unfortunate effect that assert() and abort() will end up +// bypassing our crash recovery attempts. We work around this for anything in +// the same linkage unit by just defining our own versions of the assert handler +// and abort. 
+ +#ifdef __APPLE__ + +void __assert_rtn(const char *func, + const char *file, + int line, + const char *expr) { + if (func) + fprintf(stderr, "Assertion failed: (%s), function %s, file %s, line %d.\n", + expr, func, file, line); + else + fprintf(stderr, "Assertion failed: (%s), file %s, line %d.\n", + expr, file, line); + abort(); +} + +#include <signal.h> +#include <pthread.h> + +void abort() { + pthread_kill(pthread_self(), SIGABRT); + usleep(1000); + __builtin_trap(); +} + +#endif diff --git a/lib/System/Unix/ThreadLocal.inc b/lib/System/Unix/ThreadLocal.inc index 83d554d3077c7..6769520a6fb66 100644 --- a/lib/System/Unix/ThreadLocal.inc +++ b/lib/System/Unix/ThreadLocal.inc @@ -22,4 +22,5 @@ ThreadLocalImpl::ThreadLocalImpl() { } ThreadLocalImpl::~ThreadLocalImpl() { } void ThreadLocalImpl::setInstance(const void* d) { data = const_cast<void*>(d);} const void* ThreadLocalImpl::getInstance() { return data; } +void ThreadLocalImpl::removeInstance() { setInstance(0); } } diff --git a/lib/System/Win32/Path.inc b/lib/System/Win32/Path.inc index 379527d4ebf24..4a6dbd3ddf299 100644 --- a/lib/System/Win32/Path.inc +++ b/lib/System/Win32/Path.inc @@ -400,8 +400,10 @@ PathWithStatus::getFileStatus(bool update, std::string *ErrStr) const { for (unsigned i = 0; i < path.length(); ++i) status.uniqueID += path[i]; - __int64 ft = *reinterpret_cast<__int64*>(&fi.ftLastWriteTime); - status.modTime.fromWin32Time(ft); + ULARGE_INTEGER ui; + ui.LowPart = fi.ftLastWriteTime.dwLowDateTime; + ui.HighPart = fi.ftLastWriteTime.dwHighDateTime; + status.modTime.fromWin32Time(ui.QuadPart); status.isDir = fi.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY; fsIsValid = true; @@ -720,7 +722,7 @@ Path::eraseFromDisk(bool remove_contents, std::string *ErrStr) const { bool Path::getMagicNumber(std::string& Magic, unsigned len) const { assert(len < 1024 && "Request for magic string too long"); - char* buf = (char*) alloca(1 + len); + char* buf = reinterpret_cast<char*>(alloca(len)); HANDLE h = CreateFile(path.c_str(), GENERIC_READ, @@ -739,8 +741,7 @@ bool Path::getMagicNumber(std::string& Magic, unsigned len) const { if (!ret || nRead != len) return false; - buf[len] = '\0'; - Magic = buf; + Magic = std::string(buf, len); return true; } @@ -777,8 +778,11 @@ Path::setStatusInfoOnDisk(const FileStatus &si, std::string *ErrMsg) const { return MakeErrMsg(ErrMsg, path + ": GetFileInformationByHandle: "); } + ULARGE_INTEGER ui; + ui.QuadPart = si.modTime.toWin32Time(); FILETIME ft; - (uint64_t&)ft = si.modTime.toWin32Time(); + ft.dwLowDateTime = ui.LowPart; + ft.dwHighDateTime = ui.HighPart; BOOL ret = SetFileTime(h, NULL, &ft, &ft); DWORD err = GetLastError(); CloseHandle(h); diff --git a/lib/System/Win32/Signals.inc b/lib/System/Win32/Signals.inc index d6db71ba4f359..2498a26ea99c5 100644 --- a/lib/System/Win32/Signals.inc +++ b/lib/System/Win32/Signals.inc @@ -140,6 +140,20 @@ bool sys::RemoveFileOnSignal(const sys::Path &Filename, std::string* ErrMsg) { return false; } +// DontRemoveFileOnSignal - The public API +void sys::DontRemoveFileOnSignal(const sys::Path &Filename) { + if (FilesToRemove == NULL) + return; + + FilesToRemove->push_back(Filename); + std::vector<sys::Path>::reverse_iterator I = + std::find(FilesToRemove->rbegin(), FilesToRemove->rend(), Filename); + if (I != FilesToRemove->rend()) + FilesToRemove->erase(I.base()-1); + + LeaveCriticalSection(&CriticalSection); +} + /// PrintStackTraceOnErrorSignal - When an error signal (such as SIBABRT or /// SIGSEGV) is delivered to the process, print a stack trace and 
then exit. void sys::PrintStackTraceOnErrorSignal() { diff --git a/lib/System/Win32/ThreadLocal.inc b/lib/System/Win32/ThreadLocal.inc index c8f7840b00387..b8b933c4d29d9 100644 --- a/lib/System/Win32/ThreadLocal.inc +++ b/lib/System/Win32/ThreadLocal.inc @@ -46,4 +46,8 @@ void ThreadLocalImpl::setInstance(const void* d){ assert(errorcode != 0); } +void ThreadLocalImpl::removeInstance() { + setInstance(0); +} + } diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index 14825a785649d..271ca44c2b69d 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -30,22 +30,22 @@ class formatted_raw_ostream; namespace ARMCC { // The CondCodes constants map directly to the 4-bit encoding of the // condition field for predicated instructions. - enum CondCodes { - EQ, - NE, - HS, - LO, - MI, - PL, - VS, - VC, - HI, - LS, - GE, - LT, - GT, - LE, - AL + enum CondCodes { // Meaning (integer) Meaning (floating-point) + EQ, // Equal Equal + NE, // Not equal Not equal, or unordered + HS, // Carry set >, ==, or unordered + LO, // Carry clear Less than + MI, // Minus, negative Less than + PL, // Plus, positive or zero >, ==, or unordered + VS, // Overflow Unordered + VC, // No overflow Not unordered + HI, // Unsigned higher Greater than, or unordered + LS, // Unsigned lower or same Less than or equal + GE, // Greater than or equal Greater than or equal + LT, // Less than Less than, or unordered + GT, // Greater than Greater than + LE, // Less than or equal <, ==, or unordered + AL // Always (unconditional) Always (unconditional) }; inline static CondCodes getOppositeCondition(CondCodes CC) { @@ -90,6 +90,33 @@ inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { } } +namespace ARM_MB { + // The Memory Barrier Option constants map directly to the 4-bit encoding of + // the option field for memory barrier operations. + enum MemBOpt { + ST = 14, + ISH = 11, + ISHST = 10, + NSH = 7, + NSHST = 6, + OSH = 3, + OSHST = 2 + }; + + inline static const char *MemBOptToString(unsigned val) { + switch (val) { + default: llvm_unreachable("Unknown memory opetion"); + case ST: return "st"; + case ISH: return "ish"; + case ISHST: return "ishst"; + case NSH: return "nsh"; + case NSHST: return "nshst"; + case OSH: return "osh"; + case OSHST: return "oshst"; + } + } +} // namespace ARM_MB + FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel); @@ -98,6 +125,7 @@ FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM, FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMExpandPseudoPass(); +FunctionPass *createARMGlobalMergePass(const TargetLowering* tli); FunctionPass *createARMConstantIslandPass(); FunctionPass *createNEONPreAllocPass(); FunctionPass *createNEONMoveFixPass(); diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index fa64d6c2a4b4d..d6a8f19724dc9 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -1,4 +1,4 @@ -//===- ARM.td - Describe the ARM Target Machine -----------------*- C++ -*-===// +//===- ARM.td - Describe the ARM Target Machine ------------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -20,20 +20,6 @@ include "llvm/Target/Target.td" // ARM Subtarget features. 
// -def ArchV4T : SubtargetFeature<"v4t", "ARMArchVersion", "V4T", - "ARM v4T">; -def ArchV5T : SubtargetFeature<"v5t", "ARMArchVersion", "V5T", - "ARM v5T">; -def ArchV5TE : SubtargetFeature<"v5te", "ARMArchVersion", "V5TE", - "ARM v5TE, v5TEj, v5TExp">; -def ArchV6 : SubtargetFeature<"v6", "ARMArchVersion", "V6", - "ARM v6">; -def ArchV6T2 : SubtargetFeature<"v6t2", "ARMArchVersion", "V6T2", - "ARM v6t2">; -def ArchV7A : SubtargetFeature<"v7a", "ARMArchVersion", "V7A", - "ARM v7A">; -def ArchV7M : SubtargetFeature<"v7m", "ARMArchVersion", "V7M", - "ARM v7M">; def FeatureVFP2 : SubtargetFeature<"vfp2", "ARMFPUType", "VFPv2", "Enable VFP2 instructions">; def FeatureVFP3 : SubtargetFeature<"vfp3", "ARMFPUType", "VFPv3", @@ -42,14 +28,20 @@ def FeatureNEON : SubtargetFeature<"neon", "ARMFPUType", "NEON", "Enable NEON instructions">; def FeatureThumb2 : SubtargetFeature<"thumb2", "ThumbMode", "Thumb2", "Enable Thumb2 instructions">; +def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", + "Does not support ARM mode execution">; def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", "Enable half-precision floating point">; def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", "Enable divide instructions">; -def FeatureT2ExtractPack: SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true", +def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true", "Enable Thumb2 extract and pack instructions">; +def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", + "Has data barrier (dmb / dsb) instructions">; def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", "FP compare + branch is slow">; +def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", + "Floating point unit supports single precision only">; // Some processors have multiply-accumulate instructions that don't // play nicely with other VFP instructions, and it's generally better @@ -57,14 +49,41 @@ def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", // FIXME: Currently, this is only flagged for Cortex-A8. It may be true for // others as well. We should do more benchmarking and confirm one way or // the other. -def FeatureHasSlowVMLx : SubtargetFeature<"vmlx", "SlowVMLx", "true", - "Disable VFP MAC instructions">; +def FeatureHasSlowVMLx : SubtargetFeature<"vmlx", "SlowVMLx", "true", + "Disable VFP MAC instructions">; // Some processors benefit from using NEON instructions for scalar // single-precision FP operations. def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", "true", "Use NEON for single precision FP">; +// Disable 32-bit to 16-bit narrowing for experimentation. +def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", + "Prefer 32-bit Thumb instrs">; + + +// ARM architectures. 
+def ArchV4T : SubtargetFeature<"v4t", "ARMArchVersion", "V4T", + "ARM v4T">; +def ArchV5T : SubtargetFeature<"v5t", "ARMArchVersion", "V5T", + "ARM v5T">; +def ArchV5TE : SubtargetFeature<"v5te", "ARMArchVersion", "V5TE", + "ARM v5TE, v5TEj, v5TExp">; +def ArchV6 : SubtargetFeature<"v6", "ARMArchVersion", "V6", + "ARM v6">; +def ArchV6M : SubtargetFeature<"v6m", "ARMArchVersion", "V6M", + "ARM v6m", + [FeatureNoARM, FeatureDB]>; +def ArchV6T2 : SubtargetFeature<"v6t2", "ARMArchVersion", "V6T2", + "ARM v6t2", + [FeatureThumb2]>; +def ArchV7A : SubtargetFeature<"v7a", "ARMArchVersion", "V7A", + "ARM v7A", + [FeatureThumb2, FeatureNEON, FeatureDB]>; +def ArchV7M : SubtargetFeature<"v7m", "ARMArchVersion", "V7M", + "ARM v7M", + [FeatureThumb2, FeatureNoARM, FeatureDB, + FeatureHWDiv]>; //===----------------------------------------------------------------------===// // ARM Processors supported. @@ -122,20 +141,23 @@ def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; def : Processor<"mpcorenovfp", ARMV6Itineraries, [ArchV6]>; def : Processor<"mpcore", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; +// V6M Processors. +def : Processor<"cortex-m0", ARMV6Itineraries, [ArchV6M]>; + // V6T2 Processors. -def : Processor<"arm1156t2-s", ARMV6Itineraries, - [ArchV6T2, FeatureThumb2]>; -def : Processor<"arm1156t2f-s", ARMV6Itineraries, - [ArchV6T2, FeatureThumb2, FeatureVFP2]>; +def : Processor<"arm1156t2-s", ARMV6Itineraries, [ArchV6T2]>; +def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ArchV6T2, FeatureVFP2]>; // V7 Processors. def : Processor<"cortex-a8", CortexA8Itineraries, - [ArchV7A, FeatureThumb2, FeatureNEON, FeatureHasSlowVMLx, - FeatureSlowFPBrcc, FeatureNEONForFP, FeatureT2ExtractPack]>; + [ArchV7A, FeatureHasSlowVMLx, + FeatureSlowFPBrcc, FeatureNEONForFP, FeatureT2XtPk]>; def : Processor<"cortex-a9", CortexA9Itineraries, - [ArchV7A, FeatureThumb2, FeatureNEON, FeatureT2ExtractPack]>; -def : ProcNoItin<"cortex-m3", [ArchV7M, FeatureThumb2, FeatureHWDiv]>; -def : ProcNoItin<"cortex-m4", [ArchV7M, FeatureThumb2, FeatureHWDiv]>; + [ArchV7A, FeatureT2XtPk]>; + +// V7M Processors. +def : ProcNoItin<"cortex-m3", [ArchV7M]>; +def : ProcNoItin<"cortex-m4", [ArchV7M, FeatureVFP2, FeatureVFPOnlySP]>; //===----------------------------------------------------------------------===// // Register File Description diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h index 92a13f1d751ca..db481005b3a4c 100644 --- a/lib/Target/ARM/ARMAddressingModes.h +++ b/lib/Target/ARM/ARMAddressingModes.h @@ -458,6 +458,7 @@ namespace ARM_AM { // IB - Increment before // DA - Decrement after // DB - Decrement before + // For VFP instructions, only the IA and DB modes are valid. static inline AMSubMode getAM4SubMode(unsigned Mode) { return (AMSubMode)(Mode & 0x7); @@ -477,14 +478,6 @@ namespace ARM_AM { // // The first operand is always a Reg. The second operand encodes the // operation in bit 8 and the immediate in bits 0-7. - // - // This is also used for FP load/store multiple ops. The second operand - // encodes the number of registers (or 2 times the number of registers - // for DPR ops) in bits 0-7. In addition, bits 8-10 encode one of the - // following two sub-modes: - // - // IA - Increment after - // DB - Decrement before /// getAM5Opc - This function encodes the addrmode5 opc field. static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) { @@ -498,17 +491,6 @@ namespace ARM_AM { return ((AM5Opc >> 8) & 1) ? 
sub : add; } - /// getAM5Opc - This function encodes the addrmode5 opc field for VLDM and - /// VSTM instructions. - static inline unsigned getAM5Opc(AMSubMode SubMode, unsigned char Offset) { - assert((SubMode == ia || SubMode == db) && - "Illegal addressing mode 5 sub-mode!"); - return ((int)SubMode << 8) | Offset; - } - static inline AMSubMode getAM5SubMode(unsigned AM5Opc) { - return (AMSubMode)((AM5Opc >> 8) & 0x7); - } - //===--------------------------------------------------------------------===// // Addressing Mode #6 //===--------------------------------------------------------------------===// diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 946f4744f5bbc..6cfd5961149fd 100644 --- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -17,7 +17,7 @@ #include "ARMBuildAttrs.h" #include "ARMAddressingModes.h" #include "ARMConstantPoolValue.h" -#include "ARMInstPrinter.h" +#include "AsmPrinter/ARMInstPrinter.h" #include "ARMMachineFunctionInfo.h" #include "ARMMCInstLower.h" #include "ARMTargetMachine.h" @@ -47,6 +47,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <cctype> @@ -56,6 +57,15 @@ static cl::opt<bool> EnableMCInst("enable-arm-mcinst-printer", cl::Hidden, cl::desc("enable experimental asmprinter gunk in the arm backend")); +namespace llvm { + namespace ARM { + enum DW_ISA { + DW_ISA_ARM_thumb = 1, + DW_ISA_ARM_arm = 2 + }; + } +} + namespace { class ARMAsmPrinter : public AsmPrinter { @@ -80,9 +90,9 @@ namespace { virtual const char *getPassName() const { return "ARM Assembly Printer"; } - + void printInstructionThroughMCStreamer(const MachineInstr *MI); - + void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O, const char *Modifier = 0); @@ -110,8 +120,12 @@ namespace { void printAddrModePCOperand(const MachineInstr *MI, int OpNum, raw_ostream &O, const char *Modifier = 0); - void printBitfieldInvMaskImmOperand (const MachineInstr *MI, int OpNum, - raw_ostream &O); + void printBitfieldInvMaskImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printMemBOption(const MachineInstr *MI, int OpNum, + raw_ostream &O); + void printShiftImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); void printThumbS4ImmOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); @@ -190,12 +204,32 @@ namespace { virtual void EmitInstruction(const MachineInstr *MI); bool runOnMachineFunction(MachineFunction &F); - + virtual void EmitConstantPool() {} // we emit constant pools customly! virtual void EmitFunctionEntryLabel(); void EmitStartOfAsmFile(Module &M); void EmitEndOfAsmFile(Module &M); + MachineLocation getDebugValueLocation(const MachineInstr *MI) const { + MachineLocation Location; + assert (MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); + // Frame address. Currently handles register +- offset only. + if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) + Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); + else { + DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + } + return Location; + } + + virtual unsigned getISAEncoding() { + // ARM/Darwin adds ISA to the DWARF info for each function. + if (!Subtarget->isTargetDarwin()) + return 0; + return Subtarget->isThumb() ? 
+ llvm::ARM::DW_ISA_ARM_thumb : llvm::ARM::DW_ISA_ARM_arm; + } + MCSymbol *GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2, const MachineBasicBlock *MBB) const; MCSymbol *GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const; @@ -208,7 +242,7 @@ namespace { EmitMachineConstantPoolValue(MCPV, OS); OutStreamer.EmitRawText(OS.str()); } - + void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV, raw_ostream &O) { switch (TM.getTargetData()->getTypeAllocSize(MCPV->getType())) { @@ -234,7 +268,7 @@ namespace { // FIXME: Remove this when Darwin transition to @GOT like syntax. MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); O << *Sym; - + MachineModuleInfoMachO &MMIMachO = MMI->getObjFileInfo<MachineModuleInfoMachO>(); MachineModuleInfoImpl::StubValueTy &StubSym = @@ -278,7 +312,7 @@ void ARMAsmPrinter::EmitFunctionEntryLabel() { OutStreamer.EmitRawText(OS.str()); } } - + OutStreamer.EmitLabel(CurrentFnSym); } @@ -358,7 +392,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, case MachineOperand::MO_ExternalSymbol: { bool isCallOp = Modifier && !strcmp(Modifier, "call"); O << *GetExternalSymbolSymbol(MO.getSymbolName()); - + if (isCallOp && Subtarget->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_) O << "(PLT)"; @@ -438,15 +472,13 @@ void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op, O << getRegisterName(MO1.getReg()); // Print the shift opc. - O << ", " - << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm())) - << " "; - + ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm()); + O << ", " << ARM_AM::getShiftOpcStr(ShOpc); if (MO2.getReg()) { - O << getRegisterName(MO2.getReg()); + O << ' ' << getRegisterName(MO2.getReg()); assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0); - } else { - O << "#" << ARM_AM::getSORegOffset(MO3.getImm()); + } else if (ShOpc != ARM_AM::rrx) { + O << " #" << ARM_AM::getSORegOffset(MO3.getImm()); } } @@ -575,16 +607,6 @@ void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op, assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); - if (Modifier && strcmp(Modifier, "submode") == 0) { - ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MO2.getImm()); - O << ARM_AM::getAMSubModeStr(Mode); - return; - } else if (Modifier && strcmp(Modifier, "base") == 0) { - // Used for FSTM{D|S} and LSTM{D|S} operations. 
- O << getRegisterName(MO1.getReg()); - return; - } - O << "[" << getRegisterName(MO1.getReg()); if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) { @@ -641,6 +663,32 @@ ARMAsmPrinter::printBitfieldInvMaskImmOperand(const MachineInstr *MI, int Op, O << "#" << lsb << ", #" << width; } +void +ARMAsmPrinter::printMemBOption(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + unsigned val = MI->getOperand(OpNum).getImm(); + O << ARM_MB::MemBOptToString(val); +} + +void ARMAsmPrinter::printShiftImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + unsigned ShiftOp = MI->getOperand(OpNum).getImm(); + ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp); + switch (Opc) { + case ARM_AM::no_shift: + return; + case ARM_AM::lsl: + O << ", lsl #"; + break; + case ARM_AM::asr: + O << ", asr #"; + break; + default: + assert(0 && "unexpected shift opcode for shift immediate operand"); + } + O << ARM_AM::getSORegOffset(ShiftOp); +} + //===--------------------------------------------------------------------===// void ARMAsmPrinter::printThumbS4ImmOperand(const MachineInstr *MI, int Op, @@ -737,12 +785,11 @@ void ARMAsmPrinter::printT2SOOperand(const MachineInstr *MI, int OpNum, O << getRegisterName(Reg); // Print the shift opc. - O << ", " - << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm())) - << " "; - assert(MO2.isImm() && "Not a valid t2_so_reg value!"); - O << "#" << ARM_AM::getSORegOffset(MO2.getImm()); + ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm()); + O << ", " << ARM_AM::getShiftOpcStr(ShOpc); + if (ShOpc != ARM_AM::rrx) + O << " #" << ARM_AM::getSORegOffset(MO2.getImm()); } void ARMAsmPrinter::printT2AddrModeImm12Operand(const MachineInstr *MI, @@ -916,12 +963,12 @@ void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNum, const MachineOperand &MO1 = MI->getOperand(OpNum); const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id - + unsigned JTI = MO1.getIndex(); MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); // Can't use EmitLabel until instprinter happens, label comes out in the wrong // order. - O << *JTISymbol << ":\n"; + O << "\n" << *JTISymbol << ":\n"; const char *JTEntryDirective = MAI->getData32bitsDirective(); @@ -958,12 +1005,12 @@ void ARMAsmPrinter::printJT2BlockOperand(const MachineInstr *MI, int OpNum, const MachineOperand &MO1 = MI->getOperand(OpNum); const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id unsigned JTI = MO1.getIndex(); - + MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); - + // Can't use EmitLabel until instprinter happens, label comes out in the wrong // order. 
- O << *JTISymbol << ":\n"; + O << "\n" << *JTISymbol << ":\n"; const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); @@ -980,7 +1027,7 @@ void ARMAsmPrinter::printJT2BlockOperand(const MachineInstr *MI, int OpNum, O << MAI->getData8bitsDirective(); else if (HalfWordOffset) O << MAI->getData16bitsDirective(); - + if (ByteOffset || HalfWordOffset) O << '(' << *MBB->getSymbol() << "-" << *JTISymbol << ")/2"; else @@ -1086,10 +1133,10 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { printInstructionThroughMCStreamer(MI); return; } - + if (MI->getOpcode() == ARM::CONSTPOOL_ENTRY) EmitAlignment(2); - + SmallString<128> Str; raw_svector_ostream OS(Str); if (MI->getOpcode() == ARM::DBG_VALUE) { @@ -1112,7 +1159,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { printInstruction(MI, OS); OutStreamer.EmitRawText(OS.str()); - + // Make sure the instruction that follows TBB is 2-byte aligned. // FIXME: Constant island pass should insert an "ALIGN" instruction instead. if (MI->getOpcode() == ARM::t2TBB) @@ -1129,7 +1176,7 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { // avoid out-of-range branches that are due a fundamental limitation of // the way symbol offsets are encoded with the current Darwin ARM // relocations. - const TargetLoweringObjectFileMachO &TLOFMacho = + const TargetLoweringObjectFileMachO &TLOFMacho = static_cast<const TargetLoweringObjectFileMachO &>( getObjFileLowering()); OutStreamer.SwitchSection(TLOFMacho.getTextSection()); @@ -1148,6 +1195,12 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { 16, SectionKind::getText()); OutStreamer.SwitchSection(sect); } + const MCSection *StaticInitSect = + OutContext.getMachOSection("__TEXT", "__StaticInit", + MCSectionMachO::S_REGULAR | + MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, + SectionKind::getText()); + OutStreamer.SwitchSection(StaticInitSect); } } @@ -1173,8 +1226,8 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { OutStreamer.EmitRawText("\t.eabi_attribute " + Twine(ARMBuildAttrs::ABI_FP_exceptions) + ", 1"); } - - if (FiniteOnlyFPMath()) + + if (NoInfsFPMath && NoNaNsFPMath) OutStreamer.EmitRawText("\t.eabi_attribute " + Twine(ARMBuildAttrs::ABI_FP_number_model)+ ", 1"); else @@ -1280,7 +1333,7 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { // LPC0: // add r0, pc, r0 // This adds the address of LPC0 to r0. - + // Emit the label. // FIXME: MOVE TO SHARED PLACE. unsigned Id = (unsigned)MI->getOperand(2).getImm(); @@ -1288,8 +1341,8 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { MCSymbol *Label =OutContext.GetOrCreateSymbol(Twine(Prefix) + "PC" + Twine(getFunctionNumber()) + "_" + Twine(Id)); OutStreamer.EmitLabel(Label); - - + + // Form and emit tha dd. MCInst AddInst; AddInst.setOpcode(ARM::ADDrr); @@ -1315,7 +1368,7 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); else EmitGlobalConstant(MCPE.Val.ConstVal); - + return; } case ARM::MOVi2pieces: { // FIXME: Remove asmstring from td file. 
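The MOVi2pieces case that opens here (and continues in the next hunk) lowers a 32-bit constant that cannot be encoded as a single ARM so_imm, that is, an 8-bit value rotated right by an even amount, into two encodable pieces fed to a MOVi and an ORRri. The following is a toy, brute-force illustration of such a split; it is not LLVM's getSOImmTwoPartFirst/Second implementation, which picks the rotation directly rather than searching:

    #include <cstdint>
    #include <cstdio>

    static uint32_t rotr32(uint32_t v, unsigned amt) {
      amt &= 31;
      return amt ? (v >> amt) | (v << (32 - amt)) : v;
    }

    // An ARM "so_imm" is an 8-bit value rotated right by an even amount.
    // Brute-force test: does v fit inside some even-rotated 8-bit window?
    static bool isSOImm(uint32_t v) {
      for (unsigned rot = 0; rot < 32; rot += 2)
        if ((v & ~rotr32(0xFF, rot)) == 0)
          return true;
      return false;
    }

    // Split v into two so_imm pieces if possible: carve out one rotated
    // 8-bit window and require the remainder to be encodable too.
    static bool splitSOImm(uint32_t v, uint32_t &first, uint32_t &second) {
      for (unsigned rot = 0; rot < 32; rot += 2) {
        uint32_t mask = rotr32(0xFF, rot);
        uint32_t part = v & mask, rest = v & ~mask;
        if (part != 0 && rest != 0 && isSOImm(rest)) {
          first = part;   // materialized with MOVi
          second = rest;  // merged in with ORRri
          return true;
        }
      }
      return false;
    }

    int main() {
      uint32_t lo, hi;
      if (splitSOImm(0x00AB00CDu, lo, hi))
        printf("mov r0, #0x%X\norr r0, r0, #0x%X\n", lo, hi);
      return 0;
    }

Running this prints "mov r0, #0xCD" then "orr r0, r0, #0xAB0000": the two pieces the lowering in this hunk would emit as its MOVi/ORRri pair.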
@@ -1325,13 +1378,13 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); - + { MCInst TmpInst; TmpInst.setOpcode(ARM::MOVi); TmpInst.addOperand(MCOperand::CreateReg(DstReg)); TmpInst.addOperand(MCOperand::CreateImm(SOImmValV1)); - + // Predicate. TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg())); @@ -1349,11 +1402,11 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { // Predicate. TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg())); - + TmpInst.addOperand(MCOperand::CreateReg(0)); // cc_out OutStreamer.EmitInstruction(TmpInst); } - return; + return; } case ARM::MOVi32imm: { // FIXME: Remove asmstring from td file. // This is a hack that lowers as a two instruction sequence. @@ -1384,32 +1437,32 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { TmpInst.setOpcode(ARM::MOVi16); TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // dstreg TmpInst.addOperand(V1); // lower16(imm) - + // Predicate. TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg())); - + OutStreamer.EmitInstruction(TmpInst); } - + { MCInst TmpInst; TmpInst.setOpcode(ARM::MOVTi16); TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // dstreg TmpInst.addOperand(MCOperand::CreateReg(DstReg)); // srcreg TmpInst.addOperand(V2); // upper16(imm) - + // Predicate. TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg())); - + OutStreamer.EmitInstruction(TmpInst); } - + return; } } - + MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); OutStreamer.EmitInstruction(TmpInst); diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 49c16f3e07205..3a8bebe0dd247 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -15,9 +15,9 @@ #include "ARM.h" #include "ARMAddressingModes.h" #include "ARMConstantPoolValue.h" -#include "ARMGenInstrInfo.inc" #include "ARMMachineFunctionInfo.h" #include "ARMRegisterInfo.h" +#include "ARMGenInstrInfo.inc" #include "llvm/Constants.h" #include "llvm/Function.h" #include "llvm/GlobalValue.h" @@ -501,7 +501,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { llvm_unreachable("Unknown or unset size field for instr!"); case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: - case TargetOpcode::DBG_LABEL: + case TargetOpcode::PROLOG_LABEL: case TargetOpcode::EH_LABEL: case TargetOpcode::DBG_VALUE: return 0; @@ -573,48 +573,6 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { return 0; // Not reached } -/// Return true if the instruction is a register to register move and -/// leave the source and dest operands in the passed parameters. 
-/// -bool -ARMBaseInstrInfo::isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned& SrcSubIdx, unsigned& DstSubIdx) const { - switch (MI.getOpcode()) { - default: break; - case ARM::VMOVS: - case ARM::VMOVD: - case ARM::VMOVDneon: - case ARM::VMOVQ: - case ARM::VMOVQQ : { - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - SrcSubIdx = MI.getOperand(1).getSubReg(); - DstSubIdx = MI.getOperand(0).getSubReg(); - return true; - } - case ARM::MOVr: - case ARM::MOVr_TC: - case ARM::tMOVr: - case ARM::tMOVgpr2tgpr: - case ARM::tMOVtgpr2gpr: - case ARM::tMOVgpr2gpr: - case ARM::t2MOVr: { - assert(MI.getDesc().getNumOperands() >= 2 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - "Invalid ARM MOV instruction"); - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - SrcSubIdx = MI.getOperand(1).getSubReg(); - DstSubIdx = MI.getOperand(0).getSubReg(); - return true; - } - } - - return false; -} - unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { @@ -763,8 +721,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Align); // tGPR is used sometimes in ARM instructions that need to avoid using - // certain registers. Just treat it as GPR here. - if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass) + // certain registers. Just treat it as GPR here. Likewise, rGPR. + if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass + || RC == ARM::rGPRRegisterClass) RC = ARM::GPRRegisterClass; switch (RC->getID()) { @@ -798,7 +757,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQ)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI) - .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)) .addMemOperand(MMO)); } break; @@ -818,7 +777,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMD)) .addFrameIndex(FI) - .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) + .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))) .addMemOperand(MMO); MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); @@ -830,7 +789,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMD)) .addFrameIndex(FI) - .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) + .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))) .addMemOperand(MMO); MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); @@ -865,7 +824,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // tGPR is used sometimes in ARM instructions that need to avoid using // certain registers. Just treat it as GPR here. 
- if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass) + if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass + || RC == ARM::rGPRRegisterClass) RC = ARM::GPRRegisterClass; switch (RC->getID()) { @@ -893,7 +853,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, } else { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQ), DestReg) .addFrameIndex(FI) - .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)) .addMemOperand(MMO)); } break; @@ -910,7 +870,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) .addFrameIndex(FI) - .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) + .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))) .addMemOperand(MMO); MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); @@ -922,7 +882,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) .addFrameIndex(FI) - .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) + .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))) .addMemOperand(MMO); MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); @@ -963,6 +923,11 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { unsigned PCLabelId = AFI->createConstPoolEntryUId(); ARMConstantPoolValue *NewCPV = 0; + // FIXME: The below assumes PIC relocation model and that the function + // is Thumb mode (t1 or t2). PCAdjustment would be 8 for ARM mode PIC, and + // zero for non-PIC in ARM or Thumb. The callers are all of thumb LDR + // instructions, so that's probably OK, but is PIC always correct when + // we get here? if (ACPV->isGlobalValue()) NewCPV = new ARMConstantPoolValue(ACPV->getGV(), PCLabelId, ARMCP::CPValue, 4); @@ -972,6 +937,9 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { else if (ACPV->isBlockAddress()) NewCPV = new ARMConstantPoolValue(ACPV->getBlockAddress(), PCLabelId, ARMCP::CPBlockAddress, 4); + else if (ACPV->isLSDA()) + NewCPV = new ARMConstantPoolValue(MF.getFunction(), PCLabelId, + ARMCP::CPLSDA, 4); else llvm_unreachable("Unexpected ARM constantpool value type!!"); CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment()); @@ -1393,3 +1361,63 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, Offset = (isSub) ? -Offset : Offset; return Offset == 0; } + +bool ARMBaseInstrInfo:: +AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, int &CmpValue) const { + switch (MI->getOpcode()) { + default: break; + case ARM::CMPri: + case ARM::CMPzri: + case ARM::t2CMPri: + case ARM::t2CMPzri: + SrcReg = MI->getOperand(0).getReg(); + CmpValue = MI->getOperand(1).getImm(); + return true; + } + + return false; +} + +/// ConvertToSetZeroFlag - Convert the instruction to set the "zero" flag so +/// that we can remove a "comparison with zero". +bool ARMBaseInstrInfo:: +ConvertToSetZeroFlag(MachineInstr *MI, MachineInstr *CmpInstr) const { + // Conservatively refuse to convert an instruction which isn't in the same BB + // as the comparison. + if (MI->getParent() != CmpInstr->getParent()) + return false; + + // Check that CPSR isn't set between the comparison instruction and the one we + // want to change. 
+ MachineBasicBlock::const_iterator I = CmpInstr, E = MI; + --I; + for (; I != E; --I) { + const MachineInstr &Instr = *I; + + for (unsigned IO = 0, EO = Instr.getNumOperands(); IO != EO; ++IO) { + const MachineOperand &MO = Instr.getOperand(IO); + if (!MO.isReg() || !MO.isDef()) continue; + + // This instruction modifies CPSR before the one we want to change. We + // can't do this transformation. + if (MO.getReg() == ARM::CPSR) + return false; + } + } + + // Set the "zero" bit in CPSR. + switch (MI->getOpcode()) { + default: break; + case ARM::ADDri: + case ARM::SUBri: + case ARM::t2ADDri: + case ARM::t2SUBri: + MI->RemoveOperand(5); + MachineInstrBuilder(MI) + .addReg(ARM::CPSR, RegState::Define | RegState::Implicit); + CmpInstr->eraseFromParent(); + return true; + } + + return false; +} diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 89a2db74a75ef..b4f4a33a70adf 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -15,11 +15,12 @@ #define ARMBASEINSTRUCTIONINFO_H #include "ARM.h" -#include "ARMRegisterInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Target/TargetInstrInfo.h" namespace llvm { + class ARMSubtarget; + class ARMBaseRegisterInfo; /// ARMII - This namespace holds all of the target specific flags that /// instruction info tracks. @@ -97,44 +98,45 @@ namespace ARMII { // Miscellaneous arithmetic instructions ArithMiscFrm = 12 << FormShift, + SatFrm = 13 << FormShift, // Extend instructions - ExtFrm = 13 << FormShift, + ExtFrm = 14 << FormShift, // VFP formats - VFPUnaryFrm = 14 << FormShift, - VFPBinaryFrm = 15 << FormShift, - VFPConv1Frm = 16 << FormShift, - VFPConv2Frm = 17 << FormShift, - VFPConv3Frm = 18 << FormShift, - VFPConv4Frm = 19 << FormShift, - VFPConv5Frm = 20 << FormShift, - VFPLdStFrm = 21 << FormShift, - VFPLdStMulFrm = 22 << FormShift, - VFPMiscFrm = 23 << FormShift, + VFPUnaryFrm = 15 << FormShift, + VFPBinaryFrm = 16 << FormShift, + VFPConv1Frm = 17 << FormShift, + VFPConv2Frm = 18 << FormShift, + VFPConv3Frm = 19 << FormShift, + VFPConv4Frm = 20 << FormShift, + VFPConv5Frm = 21 << FormShift, + VFPLdStFrm = 22 << FormShift, + VFPLdStMulFrm = 23 << FormShift, + VFPMiscFrm = 24 << FormShift, // Thumb format - ThumbFrm = 24 << FormShift, + ThumbFrm = 25 << FormShift, // Miscelleaneous format - MiscFrm = 25 << FormShift, + MiscFrm = 26 << FormShift, // NEON formats - NGetLnFrm = 26 << FormShift, - NSetLnFrm = 27 << FormShift, - NDupFrm = 28 << FormShift, - NLdStFrm = 29 << FormShift, - N1RegModImmFrm= 30 << FormShift, - N2RegFrm = 31 << FormShift, - NVCVTFrm = 32 << FormShift, - NVDupLnFrm = 33 << FormShift, - N2RegVShLFrm = 34 << FormShift, - N2RegVShRFrm = 35 << FormShift, - N3RegFrm = 36 << FormShift, - N3RegVShFrm = 37 << FormShift, - NVExtFrm = 38 << FormShift, - NVMulSLFrm = 39 << FormShift, - NVTBLFrm = 40 << FormShift, + NGetLnFrm = 27 << FormShift, + NSetLnFrm = 28 << FormShift, + NDupFrm = 29 << FormShift, + NLdStFrm = 30 << FormShift, + N1RegModImmFrm= 31 << FormShift, + N2RegFrm = 32 << FormShift, + NVCVTFrm = 33 << FormShift, + NVDupLnFrm = 34 << FormShift, + N2RegVShLFrm = 35 << FormShift, + N2RegVShRFrm = 36 << FormShift, + N3RegFrm = 37 << FormShift, + N3RegVShFrm = 38 << FormShift, + NVExtFrm = 39 << FormShift, + NVMulSLFrm = 40 << FormShift, + NVTBLFrm = 41 << FormShift, //===------------------------------------------------------------------===// // Misc flags. 
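To make the new AnalyzeCompare/ConvertToSetZeroFlag hooks above concrete: the peephole finds an add/sub that already computes the value a later "cmp reg, #0" re-tests, verifies that nothing in between writes CPSR, then switches the arithmetic op to its flag-setting form and deletes the compare. A toy model under simplified assumptions follows; the Instr struct and all names are invented for illustration, and the real code walks MachineInstr operands backwards from the compare rather than scanning a vector:

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Each instruction is reduced to the two properties the peephole cares
    // about: does it write CPSR, and does it have an S-suffixed twin?
    struct Instr {
      const char *name;
      bool writesCPSR;   // e.g. another CMP, or an already S-suffixed op
      bool flagSettable; // ADDri/SUBri-like: has an ADDSri/SUBSri form
    };

    // Fold `cmp <reg>, #0` at index cmp into the instruction at index def
    // that produced <reg>. Legal only if no instruction in between
    // clobbers CPSR.
    static bool convertToSetZeroFlag(std::vector<Instr> &bb, size_t def,
                                     size_t cmp) {
      assert(def < cmp && cmp < bb.size());
      if (!bb[def].flagSettable)
        return false;
      for (size_t i = def + 1; i < cmp; ++i)
        if (bb[i].writesCPSR)
          return false;             // CPSR redefined in between; bail out.
      bb[def].writesCPSR = true;    // ADDri -> ADDSri, SUBri -> SUBSri.
      bb.erase(bb.begin() + cmp);   // The explicit compare is now dead.
      return true;
    }

    int main() {
      std::vector<Instr> bb = {{"subri", false, true},
                               {"ldr", false, false},
                               {"cmpzri", true, false}};
      if (convertToSetZeroFlag(bb, 0, 2))
        printf("folded: %zu instrs remain\n", bb.size()); // prints 2
      return 0;
    }

The conservatism is the point: refusing the fold whenever the scan sees any CPSR write keeps the transformation trivially safe at the cost of missing cases a dataflow-aware pass could handle.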
@@ -198,7 +200,7 @@ namespace ARMII { } class ARMBaseInstrInfo : public TargetInstrInfoImpl { - const ARMSubtarget& Subtarget; + const ARMSubtarget &Subtarget; protected: // Can be only subclassed. explicit ARMBaseInstrInfo(const ARMSubtarget &STI); @@ -223,7 +225,7 @@ public: virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const; + bool AllowModify = false) const; virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, @@ -262,12 +264,6 @@ public: /// virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const; - /// Return true if the instruction is a register to register move and return - /// the source and dest operands and their sub-register indices by reference. - virtual bool isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; virtual unsigned isStoreToStackSlot(const MachineInstr *MI, @@ -341,6 +337,17 @@ public: unsigned NumInstrs) const { return NumInstrs && NumInstrs == 1; } + + /// AnalyzeCompare - For a comparison instruction, return the source register + /// in SrcReg and the value it compares against in CmpValue. Return true if + /// the comparison instruction can be analyzed. + virtual bool AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + int &CmpValue) const; + + /// ConvertToSetZeroFlag - Convert the instruction to set the zero flag so + /// that we can remove a "comparison with zero". + virtual bool ConvertToSetZeroFlag(MachineInstr *Instr, + MachineInstr *CmpInstr) const; }; static inline diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 182bd99371457..eceafad63f17a 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -40,13 +40,20 @@ #include "llvm/Support/CommandLine.h" namespace llvm { -cl::opt<bool> -ReuseFrameIndexVals("arm-reuse-frame-index-vals", cl::Hidden, cl::init(true), - cl::desc("Reuse repeated frame index values")); +static cl::opt<bool> +ForceAllBaseRegAlloc("arm-force-base-reg-alloc", cl::Hidden, cl::init(false), + cl::desc("Force use of virtual base registers for stack load/store")); +static cl::opt<bool> +EnableLocalStackAlloc("enable-local-stack-alloc", cl::init(true), cl::Hidden, + cl::desc("Enable pre-regalloc stack frame index allocation")); } using namespace llvm; +static cl::opt<bool> +EnableBasePointer("arm-use-base-pointer", cl::Hidden, cl::init(true), + cl::desc("Enable use of a base pointer for complex stack frames")); + unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum, bool *isSPVFP) { if (isSPVFP) @@ -143,7 +150,8 @@ ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &sti) : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), TII(tii), STI(sti), - FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11) { + FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? 
ARM::R7 : ARM::R11), + BasePtr(ARM::R6) { } const unsigned* @@ -176,8 +184,11 @@ getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(ARM::SP); Reserved.set(ARM::PC); - if (STI.isTargetDarwin() || hasFP(MF)) + Reserved.set(ARM::FPSCR); + if (hasFP(MF)) Reserved.set(FramePtr); + if (hasBasePointer(MF)) + Reserved.set(BasePtr); // Some targets reserve R9. if (STI.isR9Reserved()) Reserved.set(ARM::R9); @@ -191,9 +202,13 @@ bool ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF, case ARM::SP: case ARM::PC: return true; + case ARM::R6: + if (hasBasePointer(MF)) + return true; + break; case ARM::R7: case ARM::R11: - if (FramePtr == Reg && (STI.isTargetDarwin() || hasFP(MF))) + if (FramePtr == Reg && hasFP(MF)) return true; break; case ARM::R9: @@ -510,7 +525,7 @@ ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC, return std::make_pair(RC->allocation_order_begin(MF), RC->allocation_order_end(MF)); - if (!STI.isTargetDarwin() && !hasFP(MF)) { + if (!hasFP(MF)) { if (!STI.isR9Reserved()) return std::make_pair(GPREven1, GPREven1 + (sizeof(GPREven1)/sizeof(unsigned))); @@ -539,7 +554,7 @@ ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC, return std::make_pair(RC->allocation_order_begin(MF), RC->allocation_order_end(MF)); - if (!STI.isTargetDarwin() && !hasFP(MF)) { + if (!hasFP(MF)) { if (!STI.isR9Reserved()) return std::make_pair(GPROdd1, GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned))); @@ -609,30 +624,68 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg, /// or if frame pointer elimination is disabled. /// bool ARMBaseRegisterInfo::hasFP(const MachineFunction &MF) const { + // Mac OS X requires FP not to be clobbered for backtracing purpose. + if (STI.isTargetDarwin()) + return true; + const MachineFrameInfo *MFI = MF.getFrameInfo(); - return ((DisableFramePointerElim(MF) && MFI->adjustsStack())|| + // Always eliminate non-leaf frame pointers. + return ((DisableFramePointerElim(MF) && MFI->hasCalls()) || needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken()); } +bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + if (!EnableBasePointer) + return false; + + if (needsStackRealignment(MF) && MFI->hasVarSizedObjects()) + return true; + + // Thumb has trouble with negative offsets from the FP. Thumb2 has a limited + // negative range for ldr/str (255), and thumb1 is positive offsets only. + // It's going to be better to use the SP or Base Pointer instead. When there + // are variable sized objects, we can't reference off of the SP, so we + // reserve a Base Pointer. + if (AFI->isThumbFunction() && MFI->hasVarSizedObjects()) { + // Conservatively estimate whether the negative offset from the frame + // pointer will be sufficient to reach. If a function has a smallish + // frame, it's less likely to have lots of spills and callee saved + // space, so it's all more likely to be within range of the frame pointer. + // If it's wrong, the scavenger will still enable access to work, it just + // won't be optimal. 
+ if (AFI->isThumb2Function() && MFI->getLocalFrameSize() < 128) + return false; + return true; + } + + return false; +} + bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - return (RealignStack && - !AFI->isThumb1OnlyFunction() && - !MFI->hasVarSizedObjects()); + // We can't realign the stack if: + // 1. Dynamic stack realignment is explicitly disabled, + // 2. This is a Thumb1 function (it's not useful, so we don't bother), or + // 3. There are VLAs in the function and the base pointer is disabled. + return (RealignStack && !AFI->isThumb1OnlyFunction() && + (!MFI->hasVarSizedObjects() || EnableBasePointer)); } bool ARMBaseRegisterInfo:: needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const Function *F = MF.getFunction(); unsigned StackAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); - return (RealignStack && - !AFI->isThumb1OnlyFunction() && - (MFI->getMaxAlignment() > StackAlign) && - !MFI->hasVarSizedObjects()); + bool requiresRealignment = ((MFI->getLocalFrameMaxAlign() > StackAlign) || + F->hasFnAttr(Attribute::StackAlignment)); + + return requiresRealignment && canRealignStack(MF); } bool ARMBaseRegisterInfo:: @@ -668,6 +721,7 @@ static unsigned estimateStackSize(MachineFunction &MF) { /// instructions will require a scratch register during their expansion later. unsigned ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const { + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); unsigned Limit = (1 << 12) - 1; for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) { for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); @@ -693,7 +747,10 @@ ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const { Limit = std::min(Limit, ((1U << 8) - 1) * 4); break; case ARMII::AddrModeT2_i12: - if (hasFP(MF)) Limit = std::min(Limit, (1U << 8) - 1); + // i12 supports only positive offset so these will be converted to + // i8 opcodes. See llvm::rewriteT2FrameIndex. + if (hasFP(MF) && AFI->hasStackFrame()) + Limit = std::min(Limit, (1U << 8) - 1); break; case ARMII::AddrMode6: // Addressing mode 6 (load/store) instructions can't encode an @@ -710,6 +767,19 @@ ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const { return Limit; } +static unsigned GetFunctionSizeInBytes(const MachineFunction &MF, + const ARMBaseInstrInfo &TII) { + unsigned FnSize = 0; + for (MachineFunction::const_iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + const MachineBasicBlock &MBB = *MBBI; + for (MachineBasicBlock::const_iterator I = MBB.begin(),E = MBB.end(); + I != E; ++I) + FnSize += TII.GetInstSizeInBytes(I); + } + return FnSize; +} + void ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { @@ -737,6 +807,10 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, if (AFI->isThumb1OnlyFunction() && AFI->getVarArgsRegSaveSize() > 0) MF.getRegInfo().setPhysRegUsed(ARM::LR); + // Spill the BasePtr if it's used. + if (hasBasePointer(MF)) + MF.getRegInfo().setPhysRegUsed(BasePtr); + // Don't spill FP if the frame can be eliminated. This is determined // by scanning the callee-save registers to see if any is used. 
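
With this hunk the realignment query is split in two: canRealignStack says whether realignment is possible at all, and needsStackRealignment is simply "requested && possible". A standalone sketch of the two predicates over assumed boolean inputs:

  struct FrameState {
    bool realignOption;      // stack realignment enabled
    bool thumb1Only;         // realignment unsupported on Thumb1
    bool hasVLAs;            // variable sized objects present
    bool basePtrEnabled;     // -arm-use-base-pointer
    unsigned localMaxAlign;  // largest alignment demanded by a local
    unsigned stackAlign;     // ABI stack alignment
    bool alignAttr;          // function carries a stackalignment attribute
  };

  bool canRealign(const FrameState &f) {
    // VLAs are tolerable only if a base pointer can carry local accesses.
    return f.realignOption && !f.thumb1Only &&
           (!f.hasVLAs || f.basePtrEnabled);
  }

  bool needsRealign(const FrameState &f) {
    bool requested = f.localMaxAlign > f.stackAlign || f.alignAttr;
    return requested && canRealign(f);
  }
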
const unsigned *CSRegs = getCalleeSavedRegs(); @@ -807,7 +881,7 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, bool ForceLRSpill = false; if (!LRSpilled && AFI->isThumb1OnlyFunction()) { - unsigned FnSize = TII.GetFunctionSizeInBytes(MF); + unsigned FnSize = GetFunctionSizeInBytes(MF, TII); // Force LR to be spilled if the Thumb function size is > 2048. This enables // use of BL to implement far jump. If it turns out that it's not needed // then the branch fix up path will undo it. @@ -824,13 +898,19 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // slot of the previous FP. Also, if we have variable sized objects in the // function, stack slot references will often be negative, and some of // our instructions are positive-offset only, so conservatively consider - // that case to want a spill slot (or register) as well. + // that case to want a spill slot (or register) as well. Similarly, if + // the function adjusts the stack pointer during execution and the + // adjustments aren't already part of our stack size estimate, our offset + // calculations may be off, so be conservative. // FIXME: We could add logic to be more precise about negative offsets // and which instructions will need a scratch register for them. Is it // worth the effort and added fragility? bool BigStack = - (RS && (estimateStackSize(MF) + (hasFP(MF) ? 4:0) >= - estimateRSStackSizeLimit(MF))) || MFI->hasVarSizedObjects(); + (RS && + (estimateStackSize(MF) + ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >= + estimateRSStackSizeLimit(MF))) + || MFI->hasVarSizedObjects() + || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF)); bool ExtraCSSpill = false; if (BigStack || !CanEliminateFrame || cannotEliminateFrame(MF)) { @@ -848,9 +928,7 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, ExtraCSSpill = true; } - // Darwin ABI requires FP to point to the stack slot that contains the - // previous FP. - if (STI.isTargetDarwin() || hasFP(MF)) { + if (hasFP(MF)) { MF.getRegInfo().setPhysRegUsed(FramePtr); NumGPRSpills++; } @@ -941,55 +1019,88 @@ unsigned ARMBaseRegisterInfo::getRARegister() const { return ARM::LR; } -unsigned +unsigned ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - if (STI.isTargetDarwin() || hasFP(MF)) + if (hasFP(MF)) return FramePtr; return ARM::SP; } +// Provide a base+offset reference to an FI slot for debug info. It's the +// same as what we use for resolving the code-gen references for now. +// FIXME: This can go wrong when references are SP-relative and simple call +// frames aren't used. 
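
ResolveFrameIndexReference, which follows, picks the register used to address a frame index in a fixed priority order. A simplified standalone sketch of that order (the Thumb2 negative-range check and the pick-the-closer-of-SP/FP heuristic are omitted):

  enum Base { SP, FP, BP };

  Base pickBase(bool isFixed,    // fixed (incoming argument) object?
                bool realigning, // dynamic stack realignment in effect
                bool hasVLAs,    // variable sized objects present
                bool haveFP,     // frame pointer available
                bool haveBP) {   // base pointer reserved
    if (realigning)              // SP moves when realigned
      return isFixed ? FP : (hasVLAs ? BP : SP);
    if (haveFP) {
      if (isFixed || (hasVLAs && !haveBP))
        return FP;               // fixed objects, or VLAs with no BP
      if (hasVLAs)
        return BP;               // SP is unreliable below the VLAs
    }
    return haveBP ? BP : SP;
  }
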
int ARMBaseRegisterInfo::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { + return ResolveFrameIndexReference(MF, FI, FrameReg, 0); +} + +int +ARMBaseRegisterInfo::ResolveFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg, + int SPAdj) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); + int FPOffset = Offset - AFI->getFramePtrSpillOffset(); bool isFixed = MFI->isFixedObjectIndex(FI); FrameReg = ARM::SP; + Offset += SPAdj; if (AFI->isGPRCalleeSavedArea1Frame(FI)) - Offset -= AFI->getGPRCalleeSavedArea1Offset(); + return Offset - AFI->getGPRCalleeSavedArea1Offset(); else if (AFI->isGPRCalleeSavedArea2Frame(FI)) - Offset -= AFI->getGPRCalleeSavedArea2Offset(); + return Offset - AFI->getGPRCalleeSavedArea2Offset(); else if (AFI->isDPRCalleeSavedAreaFrame(FI)) - Offset -= AFI->getDPRCalleeSavedAreaOffset(); - else if (needsStackRealignment(MF)) { - // When dynamically realigning the stack, use the frame pointer for - // parameters, and the stack pointer for locals. + return Offset - AFI->getDPRCalleeSavedAreaOffset(); + + // When dynamically realigning the stack, use the frame pointer for + // parameters, and the stack/base pointer for locals. + if (needsStackRealignment(MF)) { assert (hasFP(MF) && "dynamic stack realignment without a FP!"); if (isFixed) { FrameReg = getFrameRegister(MF); - Offset -= AFI->getFramePtrSpillOffset(); + Offset = FPOffset; + } else if (MFI->hasVarSizedObjects()) { + assert(hasBasePointer(MF) && + "VLAs and dynamic stack alignment, but missing base pointer!"); + FrameReg = BasePtr; } - } else if (hasFP(MF) && AFI->hasStackFrame()) { - if (isFixed || MFI->hasVarSizedObjects()) { - // Use frame pointer to reference fixed objects unless this is a - // frameless function. + return Offset; + } + + // If there is a frame pointer, use it when we can. + if (hasFP(MF) && AFI->hasStackFrame()) { + // Use frame pointer to reference fixed objects. Use it for locals if + // there are VLAs (and thus the SP isn't reliable as a base). + if (isFixed || (MFI->hasVarSizedObjects() && !hasBasePointer(MF))) { FrameReg = getFrameRegister(MF); - Offset -= AFI->getFramePtrSpillOffset(); + return FPOffset; + } else if (MFI->hasVarSizedObjects()) { + assert(hasBasePointer(MF) && "missing base pointer!"); + // Use the base register since we have it. + FrameReg = BasePtr; } else if (AFI->isThumb2Function()) { - // In Thumb2 mode, the negative offset is very limited. - int FPOffset = Offset - AFI->getFramePtrSpillOffset(); + // In Thumb2 mode, the negative offset is very limited. Try to avoid + // out of range references. if (FPOffset >= -255 && FPOffset < 0) { FrameReg = getFrameRegister(MF); - Offset = FPOffset; + return FPOffset; } + } else if (Offset > (FPOffset < 0 ? -FPOffset : FPOffset)) { + // Otherwise, use SP or FP, whichever is closer to the stack slot. + FrameReg = getFrameRegister(MF); + return FPOffset; } } + // Use the base pointer if we have one. + if (hasBasePointer(MF)) + FrameReg = BasePtr; return Offset; } - int ARMBaseRegisterInfo::getFrameIndexOffset(const MachineFunction &MF, int FI) const { @@ -1024,7 +1135,8 @@ unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg, case ARM::R5: return ARM::R4; case ARM::R7: - return isReservedReg(MF, ARM::R7) ? 0 : ARM::R6; + return (isReservedReg(MF, ARM::R7) || isReservedReg(MF, ARM::R6)) + ? 
0 : ARM::R6; case ARM::R9: return isReservedReg(MF, ARM::R9) ? 0 :ARM::R8; case ARM::R11: @@ -1113,7 +1225,8 @@ unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg, case ARM::R4: return ARM::R5; case ARM::R6: - return isReservedReg(MF, ARM::R7) ? 0 : ARM::R7; + return (isReservedReg(MF, ARM::R7) || isReservedReg(MF, ARM::R6)) + ? 0 : ARM::R7; case ARM::R8: return isReservedReg(MF, ARM::R9) ? 0 :ARM::R9; case ARM::R10: @@ -1220,13 +1333,18 @@ requiresFrameIndexScavenging(const MachineFunction &MF) const { return true; } +bool ARMBaseRegisterInfo:: +requiresVirtualBaseRegisters(const MachineFunction &MF) const { + return EnableLocalStackAlloc; +} + // hasReservedCallFrame - Under normal circumstances, when a frame pointer is // not required, we reserve argument space for call sites in the function // immediately on entry to the current function. This eliminates the need for // add/sub sp brackets around call sites. Returns true if the call frame is // included as part of the stack frame. bool ARMBaseRegisterInfo:: -hasReservedCallFrame(MachineFunction &MF) const { +hasReservedCallFrame(const MachineFunction &MF) const { const MachineFrameInfo *FFI = MF.getFrameInfo(); unsigned CFSize = FFI->getMaxCallFrameSize(); // It's not always a good idea to include the call frame as part of the @@ -1244,7 +1362,7 @@ hasReservedCallFrame(MachineFunction &MF) const { // is not sufficient here since we still may reference some objects via SP // even when FP is available in Thumb2 mode. bool ARMBaseRegisterInfo:: -canSimplifyCallFramePseudos(MachineFunction &MF) const { +canSimplifyCallFramePseudos(const MachineFunction &MF) const { return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects(); } @@ -1305,10 +1423,258 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MBB.erase(I); } -unsigned +int64_t ARMBaseRegisterInfo:: +getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const { + const TargetInstrDesc &Desc = MI->getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + int64_t InstrOffs = 0; + int Scale = 1; + unsigned ImmIdx = 0; + switch (AddrMode) { + case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i12: + // i8 supports only negative, and i12 supports only positive, so + // based on Offset sign, consider the appropriate instruction + InstrOffs = MI->getOperand(Idx+1).getImm(); + Scale = 1; + break; + case ARMII::AddrMode5: { + // VFP address mode. + const MachineOperand &OffOp = MI->getOperand(Idx+1); + InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm()); + if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub) + InstrOffs = -InstrOffs; + Scale = 4; + break; + } + case ARMII::AddrMode2: { + ImmIdx = Idx+2; + InstrOffs = ARM_AM::getAM2Offset(MI->getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM2Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs = -InstrOffs; + break; + } + case ARMII::AddrMode3: { + ImmIdx = Idx+2; + InstrOffs = ARM_AM::getAM3Offset(MI->getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM3Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs = -InstrOffs; + break; + } + case ARMII::AddrModeT1_s: { + ImmIdx = Idx+1; + InstrOffs = MI->getOperand(ImmIdx).getImm(); + Scale = 4; + break; + } + default: + llvm_unreachable("Unsupported addressing mode!"); + break; + } + + return InstrOffs * Scale; +} + +/// needsFrameBaseReg - Returns true if the instruction's frame index +/// reference would be better served by a base register other than FP +/// or SP.
Used by LocalStackFrameAllocation to determine which frame index +/// references it should create new base registers for. +bool ARMBaseRegisterInfo:: +needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { + for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) { + assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!"); + } + + // It's the load/store FI references that cause issues, as it can be difficult + // to materialize the offset if it won't fit in the literal field. Estimate + // based on the size of the local frame and some conservative assumptions + // about the rest of the stack frame (note, this is pre-regalloc, so + // we don't know everything for certain yet) whether this offset is likely + // to be out of range of the immediate. Return true if so. + + // We only generate virtual base registers for loads and stores, so + // return false for everything else. + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case ARM::LDR: case ARM::LDRH: case ARM::LDRB: + case ARM::STR: case ARM::STRH: case ARM::STRB: + case ARM::t2LDRi12: case ARM::t2LDRi8: + case ARM::t2STRi12: case ARM::t2STRi8: + case ARM::VLDRS: case ARM::VLDRD: + case ARM::VSTRS: case ARM::VSTRD: + case ARM::tSTRspi: case ARM::tLDRspi: + if (ForceAllBaseRegAlloc) + return true; + break; + default: + return false; + } + + // Without a virtual base register, if the function has variable sized + // objects, all fixed-size local references will be via the frame pointer, + // Approximate the offset and see if it's legal for the instruction. + // Note that the incoming offset is based on the SP value at function entry, + // so it'll be negative. + MachineFunction &MF = *MI->getParent()->getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + // Estimate an offset from the frame pointer. + // Conservatively assume all callee-saved registers get pushed. R4-R6 + // will be earlier than the FP, so we ignore those. + // R7, LR + int64_t FPOffset = Offset - 8; + // ARM and Thumb2 functions also need to consider R8-R11 and D8-D15 + if (!AFI->isThumbFunction() || !AFI->isThumb1OnlyFunction()) + FPOffset -= 80; + // Estimate an offset from the stack pointer. + // The incoming offset is relating to the SP at the start of the function, + // but when we access the local it'll be relative to the SP after local + // allocation, so adjust our SP-relative offset by that allocation size. + Offset = -Offset; + Offset += MFI->getLocalFrameSize(); + // Assume that we'll have at least some spill slots allocated. + // FIXME: This is a total SWAG number. We should run some statistics + // and pick a real one. + Offset += 128; // 128 bytes of spill slots + + // If there is a frame pointer, try using it. + // The FP is only available if there is no dynamic realignment. We + // don't know for sure yet whether we'll need that, so we guess based + // on whether there are any local variables that would trigger it. + unsigned StackAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); + if (hasFP(MF) && + !((MFI->getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) { + if (isFrameOffsetLegal(MI, FPOffset)) + return false; + } + // If we can reference via the stack pointer, try that. + // FIXME: This (and the code that resolves the references) can be improved + // to only disallow SP relative references in the live range of + // the VLA(s). In practice, it's unclear how much difference that + // would make, but it may be worth doing. 
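
The estimates above reduce to two formulas: an FP-relative guess assuming all callee-saved registers get pushed (R7 and LR, plus R8-R11 and D8-D15 outside Thumb1), and an SP-relative guess that flips the entry-relative offset and adds the local frame plus an assumed 128 bytes of spill slots. A standalone sketch of both:

  #include <cstdint>

  // FP-relative estimate: R7/LR (8 bytes) precede the locals; ARM and
  // Thumb2 may additionally save R8-R11 and D8-D15 (80 bytes).
  int64_t estFPOffset(int64_t spEntryOffset, bool thumb1Only) {
    int64_t off = spEntryOffset - 8;
    if (!thumb1Only)
      off -= 80;
    return off;
  }

  // SP-relative estimate: the incoming offset is relative to SP at entry
  // (hence negative); flip it, then add the local allocation and the
  // guessed 128 bytes of spill slots.
  int64_t estSPOffset(int64_t spEntryOffset, int64_t localFrameSize) {
    return -spEntryOffset + localFrameSize + 128;
  }
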
+ if (!MFI->hasVarSizedObjects() && isFrameOffsetLegal(MI, Offset)) + return false; + + // The offset likely isn't legal, we want to allocate a virtual base register. + return true; +} + +/// materializeFrameBaseRegister - Insert defining instruction(s) for +/// BaseReg to be a pointer to FrameIdx before insertion point I. +void ARMBaseRegisterInfo:: +materializeFrameBaseRegister(MachineBasicBlock::iterator I, unsigned BaseReg, + int FrameIdx, int64_t Offset) const { + ARMFunctionInfo *AFI = + I->getParent()->getParent()->getInfo<ARMFunctionInfo>(); + unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : + (AFI->isThumb1OnlyFunction() ? ARM::tADDrSPi : ARM::t2ADDri); + + MachineInstrBuilder MIB = + BuildMI(*I->getParent(), I, I->getDebugLoc(), TII.get(ADDriOpc), BaseReg) + .addFrameIndex(FrameIdx).addImm(Offset); + if (!AFI->isThumb1OnlyFunction()) + AddDefaultCC(AddDefaultPred(MIB)); +} + +void +ARMBaseRegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I, + unsigned BaseReg, int64_t Offset) const { + MachineInstr &MI = *I; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + int Off = Offset; // ARM doesn't need the general 64-bit offsets + unsigned i = 0; + + assert(!AFI->isThumb1OnlyFunction() && + "This resolveFrameIndex does not support Thumb1!"); + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + bool Done = false; + if (!AFI->isThumbFunction()) + Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII); + else { + assert(AFI->isThumb2Function()); + Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII); + } + assert (Done && "Unable to resolve frame index!"); +} + +bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, + int64_t Offset) const { + const TargetInstrDesc &Desc = MI->getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + unsigned i = 0; + + while (!MI->getOperand(i).isFI()) { + ++i; + assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!"); + } + + // AddrMode4 and AddrMode6 cannot handle any offset. + if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6) + return Offset == 0; + + unsigned NumBits = 0; + unsigned Scale = 1; + bool isSigned = true; + switch (AddrMode) { + case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i12: + // i8 supports only negative, and i12 supports only positive, so + // based on Offset sign, consider the appropriate instruction + Scale = 1; + if (Offset < 0) { + NumBits = 8; + Offset = -Offset; + } else { + NumBits = 12; + } + break; + case ARMII::AddrMode5: + // VFP address mode. + NumBits = 8; + Scale = 4; + break; + case ARMII::AddrMode2: + NumBits = 12; + break; + case ARMII::AddrMode3: + NumBits = 8; + break; + case ARMII::AddrModeT1_s: + NumBits = 5; + Scale = 4; + isSigned = false; + break; + default: + llvm_unreachable("Unsupported addressing mode!"); + break; + } + + Offset += getFrameIndexInstrOffset(MI, i); + // Make sure the offset is encodable for instructions that scale the + // immediate. 
+ if ((Offset & (Scale-1)) != 0) + return false; + + if (isSigned && Offset < 0) + Offset = -Offset; + + unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) + return true; + + return false; +} + +void ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value, - RegScavenger *RS) const { + int SPAdj, RegScavenger *RS) const { unsigned i = 0; MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); @@ -1325,16 +1691,13 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int FrameIndex = MI.getOperand(i).getIndex(); unsigned FrameReg; - int Offset = getFrameIndexReference(MF, FrameIndex, FrameReg); - if (FrameReg != ARM::SP) - SPAdj = 0; - Offset += SPAdj; + int Offset = ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj); // Special handling of dbg_value instructions. if (MI.isDebugValue()) { MI.getOperand(i). ChangeToRegister(FrameReg, false /*isDef*/); MI.getOperand(i+1).ChangeToImmediate(Offset); - return 0; + return; } // Modify MI as necessary to handle as much of 'Offset' as possible @@ -1346,7 +1709,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Done = rewriteT2FrameIndex(MI, i, FrameReg, Offset, TII); } if (Done) - return 0; + return; // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above, handle the rest, providing a register that is @@ -1366,10 +1729,6 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false); else { ScratchReg = MF.getRegInfo().createVirtualRegister(ARM::GPRRegisterClass); - if (Value) { - Value->first = FrameReg; // use the frame register as a kind indicator - Value->second = Offset; - } if (!AFI->isThumbFunction()) emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, Pred, PredReg, TII); @@ -1379,10 +1738,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Offset, Pred, PredReg, TII); } MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true); - if (!ReuseFrameIndexVals) - ScratchReg = 0; } - return ScratchReg; } /// Move iterator past the next bunch of callee save load / store ops for @@ -1494,7 +1850,8 @@ emitPrologue(MachineFunction &MF) const { // Otherwise, if this is not Darwin, all the callee-saved registers go // into spill area 1, including the FP in R11. In either case, it is // now safe to emit this assignment. - if (STI.isTargetDarwin() || hasFP(MF)) { + bool HasFP = hasFP(MF); + if (HasFP) { unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr) @@ -1513,7 +1870,7 @@ emitPrologue(MachineFunction &MF) const { unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; - if (STI.isTargetDarwin() || hasFP(MF)) + if (HasFP) AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes); AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); @@ -1525,18 +1882,22 @@ emitPrologue(MachineFunction &MF) const { if (NumBytes) { // Adjust SP after all the callee-save spills. 
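
The encodability test that isFrameOffsetLegal finishes just above is plain arithmetic over the NumBits/Scale/isSigned triple selected per addressing mode. A standalone sketch:

  #include <cstdint>

  bool encodable(int64_t off, unsigned numBits, unsigned scale, bool sign) {
    if (off % scale != 0)
      return false;              // immediates are stored pre-scaled
    if (sign && off < 0)
      off = -off;                // direction lives in the opcode / U bit
    uint64_t limit = ((1ull << numBits) - 1) * scale;
    return off >= 0 && static_cast<uint64_t>(off) <= limit;
  }

For AddrMode5 (numBits 8, scale 4, signed) this accepts -1020..1020 in multiples of 4, the usual VFP load/store range.
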
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes); + if (HasFP) + AFI->setShouldRestoreSPFromFP(true); } if (STI.isTargetELF() && hasFP(MF)) { MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - AFI->getFramePtrSpillOffset()); + AFI->setShouldRestoreSPFromFP(true); } AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); AFI->setDPRCalleeSavedAreaSize(DPRCSSize); - // If we need dynamic stack realignment, do it here. + // If we need dynamic stack realignment, do it here. Be paranoid and make + // sure if we also have VLAs, we have a base pointer for frame access. if (needsStackRealignment(MF)) { unsigned MaxAlign = MFI->getMaxAlignment(); assert (!AFI->isThumb1OnlyFunction()); @@ -1562,7 +1923,28 @@ emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP) .addReg(ARM::R4, RegState::Kill); } + + AFI->setShouldRestoreSPFromFP(true); + } + + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + if (hasBasePointer(MF)) { + if (isARM) + BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), BasePtr) + .addReg(ARM::SP) + .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + else + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), BasePtr) + .addReg(ARM::SP); } + + // If the frame has variable sized objects then the epilogue must restore + // the sp from fp. + if (!AFI->shouldRestoreSPFromFP() && MFI->hasVarSizedObjects()) + AFI->setShouldRestoreSPFromFP(true); } static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { @@ -1617,34 +1999,25 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { AFI->getGPRCalleeSavedArea2Size() + AFI->getDPRCalleeSavedAreaSize()); - // Darwin ABI requires FP to point to the stack slot that contains the - // previous FP. - bool HasFP = hasFP(MF); - if ((STI.isTargetDarwin() && NumBytes) || HasFP) { + // Reset SP based on frame pointer only if the stack frame extends beyond + // frame pointer stack slot or target is ELF and the function has FP. + if (AFI->shouldRestoreSPFromFP()) { NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; - // Reset SP based on frame pointer only if the stack frame extends beyond - // frame pointer stack slot or target is ELF and the function has FP. - if (HasFP || - AFI->getGPRCalleeSavedArea2Size() || - AFI->getDPRCalleeSavedAreaSize() || - AFI->getDPRCalleeSavedAreaOffset()) { - if (NumBytes) { - if (isARM) - emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, - ARMCC::AL, 0, TII); - else - emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, - ARMCC::AL, 0, TII); - } else { - // Thumb2 or ARM. - if (isARM) - BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP) - .addReg(FramePtr) - .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); - else - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP) - .addReg(FramePtr); - } + if (NumBytes) { + if (isARM) + emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, + ARMCC::AL, 0, TII); + else + emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, + ARMCC::AL, 0, TII); + } else { + // Thumb2 or ARM. 
+ if (isARM) + BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP) + .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + else + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP) + .addReg(FramePtr); } } else if (NumBytes) emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); @@ -1670,7 +2043,7 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { // Jump to label or value in register. if (RetOpcode == ARM::TCRETURNdi) { - BuildMI(MBB, MBBI, dl, + BuildMI(MBB, MBBI, dl, TII.get(STI.isThumb() ? ARM::TAILJMPdt : ARM::TAILJMPd)). addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), JumpTarget.getTargetFlags()); @@ -1685,7 +2058,7 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { } else if (RetOpcode == ARM::TCRETURNriND) { BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPrND)). addReg(JumpTarget.getReg(), RegState::Kill); - } + } MachineInstr *NewMI = prior(MBBI); for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index f7ee0d5cc66d0..fa2eb6c10498e 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -44,7 +44,7 @@ static inline bool isARMLowRegister(unsigned Reg) { } } -struct ARMBaseRegisterInfo : public ARMGenRegisterInfo { +class ARMBaseRegisterInfo : public ARMGenRegisterInfo { protected: const ARMBaseInstrInfo &TII; const ARMSubtarget &STI; @@ -52,6 +52,11 @@ protected: /// FramePtr - ARM physical register used as frame ptr. unsigned FramePtr; + /// BasePtr - ARM physical register used as a base ptr in complex stack + /// frames. I.e., when we need a 3rd base, not just SP and FP, due to + /// variable size stack objects. + unsigned BasePtr; + // Can be only subclassed. explicit ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI); @@ -102,9 +107,18 @@ public: MachineFunction &MF) const; bool hasFP(const MachineFunction &MF) const; + bool hasBasePointer(const MachineFunction &MF) const; bool canRealignStack(const MachineFunction &MF) const; bool needsStackRealignment(const MachineFunction &MF) const; + int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const; + bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const; + void materializeFrameBaseRegister(MachineBasicBlock::iterator I, + unsigned BaseReg, int FrameIdx, + int64_t Offset) const; + void resolveFrameIndex(MachineBasicBlock::iterator I, + unsigned BaseReg, int64_t Offset) const; + bool isFrameOffsetLegal(const MachineInstr *MI, int64_t Offset) const; bool cannotEliminateFrame(const MachineFunction &MF) const; @@ -116,6 +130,8 @@ public: unsigned getFrameRegister(const MachineFunction &MF) const; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const; + int ResolveFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg, int SPAdj) const; int getFrameIndexOffset(const MachineFunction &MF, int FI) const; // Exception handling queries. 
@@ -144,16 +160,17 @@ public: virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const; - virtual bool hasReservedCallFrame(MachineFunction &MF) const; - virtual bool canSimplifyCallFramePseudos(MachineFunction &MF) const; + virtual bool requiresVirtualBaseRegisters(const MachineFunction &MF) const; + + virtual bool hasReservedCallFrame(const MachineFunction &MF) const; + virtual bool canSimplifyCallFramePseudos(const MachineFunction &MF) const; virtual void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; - virtual unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + virtual void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; virtual void emitPrologue(MachineFunction &MF) const; virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index 8fdb07f81626a..293e32aa5376e 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -1,4 +1,4 @@ -//===- ARMCallingConv.td - Calling Conventions for ARM ----------*- C++ -*-===// +//===- ARMCallingConv.td - Calling Conventions for ARM -----*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -68,7 +68,7 @@ def CC_ARM_AAPCS_Common : CallingConv<[ "ArgFlags.getOrigAlign() != 8", CCAssignToReg<[R0, R1, R2, R3]>>>, - CCIfType<[i32], CCIfAlign<"8", CCAssignToStack<4, 8>>>, + CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, R3>>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[f64], CCAssignToStack<8, 8>>, CCIfType<[v2f64], CCAssignToStack<16, 8>> diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index 7895cb0719229..b1a702f90cfc3 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -65,7 +65,7 @@ namespace { static char ID; public: ARMCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) - : MachineFunctionPass(&ID), JTI(0), + : MachineFunctionPass(ID), JTI(0), II((const ARMInstrInfo *)tm.getInstrInfo()), TD(tm.getTargetData()), TM(tm), MCE(mce), MCPEs(0), MJTEs(0), @@ -124,6 +124,8 @@ namespace { void emitMiscArithInstruction(const MachineInstr &MI); + void emitSaturateInstruction(const MachineInstr &MI); + void emitBranchInstruction(const MachineInstr &MI); void emitInlineJumpTable(unsigned JTIndex); @@ -389,6 +391,9 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) { case ARMII::ArithMiscFrm: emitMiscArithInstruction(MI); break; + case ARMII::SatFrm: + emitSaturateInstruction(MI); + break; case ARMII::BrFrm: emitBranchInstruction(MI); break; @@ -654,6 +659,19 @@ void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) { switch (Opcode) { default: llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction"); + case ARM::BX: + case ARM::BMOVPCRX: + case ARM::BXr9: + case ARM::BMOVPCRXr9: { + // First emit mov lr, pc + unsigned Binary = 0x01a0e00f; + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + emitWordLE(Binary); + + // and then emit the branch. + emitMiscBranchInstruction(MI); + break; + } case TargetOpcode::INLINEASM: { // We allow inline assembler nodes with empty bodies - they can // implicitly define registers, which is ok for JIT. 
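
The hard-coded 0x01a0e00f emitted above for the BX/BMOVPCRX pseudos is "mov lr, pc" with an empty condition field; the predicate (AL = 0xE) is OR'd in afterwards, yielding 0xE1A0E00F. A standalone sketch of the composition, assuming the usual ARM condition field at bits 31:28:

  #include <cassert>
  #include <cstdint>

  // cond | opcode=MOV | Rd=14 (LR) | Rm=15 (PC), no shift
  uint32_t movLrPc(uint32_t cond) {
    return 0x01a0e00fu | (cond << 28);
  }

  int main() {
    assert(movLrPc(0xEu) == 0xE1A0E00Fu);  // mov lr, pc (always-execute)
  }
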
@@ -662,7 +680,7 @@ void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) { } break; } - case TargetOpcode::DBG_LABEL: + case TargetOpcode::PROLOG_LABEL: case TargetOpcode::EH_LABEL: MCE.emitLabel(MI.getOperand(0).getMCSymbol()); break; @@ -1209,12 +1227,58 @@ void ARMCodeEmitter::emitMiscArithInstruction(const MachineInstr &MI) { // Encode shift_imm. unsigned ShiftAmt = MI.getOperand(OpIdx).getImm(); + if (TID.Opcode == ARM::PKHTB) { + assert(ShiftAmt != 0 && "PKHTB shift_imm is 0!"); + if (ShiftAmt == 32) + ShiftAmt = 0; + } assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!"); Binary |= ShiftAmt << ARMII::ShiftShift; emitWordLE(Binary); } +void ARMCodeEmitter::emitSaturateInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + + // Part of binary is determined by TableGen. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode Rd + Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift; + + // Encode saturate bit position. + unsigned Pos = MI.getOperand(1).getImm(); + if (TID.Opcode == ARM::SSAT || TID.Opcode == ARM::SSAT16) + Pos -= 1; + assert((Pos < 16 || (Pos < 32 && + TID.Opcode != ARM::SSAT16 && + TID.Opcode != ARM::USAT16)) && + "saturate bit position out of range"); + Binary |= Pos << 16; + + // Encode Rm + Binary |= getMachineOpValue(MI, 2); + + // Encode shift_imm. + if (TID.getNumOperands() == 4) { + unsigned ShiftOp = MI.getOperand(3).getImm(); + ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp); + if (Opc == ARM_AM::asr) + Binary |= (1 << 6); + unsigned ShiftAmt = MI.getOperand(3).getImm(); + if (ShiftAmt == 32 && Opc == ARM_AM::asr) + ShiftAmt = 0; + assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!"); + Binary |= ShiftAmt << ARMII::ShiftShift; + } + + emitWordLE(Binary); +} + void ARMCodeEmitter::emitBranchInstruction(const MachineInstr &MI) { const TargetInstrDesc &TID = MI.getDesc(); @@ -1485,7 +1549,7 @@ ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) { // Set addressing mode by modifying bits U(23) and P(24) const MachineOperand &MO = MI.getOperand(OpIdx++); - Binary |= getAddrModeUPBits(ARM_AM::getAM5SubMode(MO.getImm())); + Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(MO.getImm())); // Set bit W(21) if (IsUpdating) @@ -1494,7 +1558,7 @@ ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) { // First register is encoded in Dd. Binary |= encodeVFPRd(MI, OpIdx+2); - // Number of registers are encoded in offset field. + // Count the number of registers. unsigned NumRegs = 1; for (unsigned i = OpIdx+3, e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 65a3da6f1617c..60e923bd2c385 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -18,9 +18,9 @@ #include "ARMAddressingModes.h" #include "ARMMachineFunctionInfo.h" #include "ARMInstrInfo.h" +#include "Thumb2InstrInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" @@ -165,7 +165,7 @@ namespace { /// HasInlineAsm - True if the function contains inline assembly. 
bool HasInlineAsm; - const TargetInstrInfo *TII; + const ARMInstrInfo *TII; const ARMSubtarget *STI; ARMFunctionInfo *AFI; bool isThumb; @@ -173,7 +173,7 @@ namespace { bool isThumb2; public: static char ID; - ARMConstantIslands() : MachineFunctionPass(&ID) {} + ARMConstantIslands() : MachineFunctionPass(ID) {} virtual bool runOnMachineFunction(MachineFunction &MF); @@ -272,7 +272,7 @@ FunctionPass *llvm::createARMConstantIslandPass() { bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { MachineConstantPool &MCP = *MF.getConstantPool(); - TII = MF.getTarget().getInstrInfo(); + TII = (const ARMInstrInfo*)MF.getTarget().getInstrInfo(); AFI = MF.getInfo<ARMFunctionInfo>(); STI = &MF.getTarget().getSubtarget<ARMSubtarget>(); @@ -323,6 +323,8 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { // constant pool users. InitialFunctionScan(MF, CPEMIs); CPEMIs.clear(); + DEBUG(dumpBBs()); + /// Remove dead constant pool entries. RemoveUnusedCPEntries(); @@ -355,7 +357,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { } // Shrink 32-bit Thumb2 branch, load, and store instructions. - if (isThumb2) + if (isThumb2 && !STI->prefers32BitThumb()) MadeChange |= OptimizeThumb2Instructions(MF); // After a while, this might be made debug-only, but it is not expensive. @@ -366,6 +368,8 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump()) MadeChange |= UndoLRSpillRestore(); + DEBUG(errs() << '\n'; dumpBBs()); + BBSizes.clear(); BBOffsets.clear(); WaterList.clear(); @@ -509,6 +513,10 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, case ARM::tBR_JTr: // A Thumb1 table jump may involve padding; for the offsets to // be right, functions containing these must be 4-byte aligned. + // tBR_JTr expands to a mov pc followed by .align 2 and then the jump + // table entries. So this code checks whether offset of tBR_JTr + 2 + // is aligned. That is held in Offset+MBBSize, which already has + // 2 added in for the size of the mov pc instruction. MF.EnsureAlignment(2U); if ((Offset+MBBSize)%4 != 0 || HasInlineAsm) // FIXME: Add a pseudo ALIGN instruction instead. @@ -768,28 +776,54 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { WaterList.insert(IP, OrigBB); NewWaterList.insert(OrigBB); - // Figure out how large the first NewMBB is. (It cannot - // contain a constpool_entry or tablejump.) - unsigned NewBBSize = 0; - for (MachineBasicBlock::iterator I = NewBB->begin(), E = NewBB->end(); - I != E; ++I) - NewBBSize += TII->GetInstSizeInBytes(I); - unsigned OrigBBI = OrigBB->getNumber(); unsigned NewBBI = NewBB->getNumber(); - // Set the size of NewBB in BBSizes. - BBSizes[NewBBI] = NewBBSize; - // We removed instructions from UserMBB, subtract that off from its size. - // Add 2 or 4 to the block to count the unconditional branch we added to it. int delta = isThumb1 ? 2 : 4; - BBSizes[OrigBBI] -= NewBBSize - delta; + + // Figure out how large the OrigBB is. As the first half of the original + // block, it cannot contain a tablejump. The size includes + // the new jump we added. (It should be possible to do this without + // recounting everything, but it's very confusing, and this is rarely + // executed.) 
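
The alignment bookkeeping described above, and applied again in the block-split logic that follows, comes down to one rule: tBR_JTr is a 2-byte "mov pc" and the jump table after it must be 4-byte aligned, so padding is needed exactly when the instruction itself starts on a 4-byte boundary. That is also why inserting one 2-byte instruction before it flips the need. A standalone sketch:

  #include <cassert>

  // Bytes of padding between a tBR_JTr at 'off' and its jump table.
  unsigned jtPad(unsigned off) {
    return (off + 2) % 4 == 0 ? 0 : 2;
  }

  int main() {
    assert(jtPad(100) == 2);  // table would start at 102: pad to 104
    assert(jtPad(102) == 0);  // 102 + 2 = 104, already aligned
  }
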
+ unsigned OrigBBSize = 0; + for (MachineBasicBlock::iterator I = OrigBB->begin(), E = OrigBB->end(); + I != E; ++I) + OrigBBSize += TII->GetInstSizeInBytes(I); + BBSizes[OrigBBI] = OrigBBSize; // ...and adjust BBOffsets for NewBB accordingly. BBOffsets[NewBBI] = BBOffsets[OrigBBI] + BBSizes[OrigBBI]; + // Figure out how large the NewMBB is. As the second half of the original + // block, it may contain a tablejump. + unsigned NewBBSize = 0; + for (MachineBasicBlock::iterator I = NewBB->begin(), E = NewBB->end(); + I != E; ++I) + NewBBSize += TII->GetInstSizeInBytes(I); + // Set the size of NewBB in BBSizes. It does not include any padding now. + BBSizes[NewBBI] = NewBBSize; + + MachineInstr* ThumbJTMI = prior(NewBB->end()); + if (ThumbJTMI->getOpcode() == ARM::tBR_JTr) { + // We've added another 2-byte instruction before this tablejump, which + // means we will always need padding if we didn't before, and vice versa. + + // The original offset of the jump instruction was: + unsigned OrigOffset = BBOffsets[OrigBBI] + BBSizes[OrigBBI] - delta; + if (OrigOffset%4 == 0) { + // We had padding before and now we don't. No net change in code size. + delta = 0; + } else { + // We didn't have padding before and now we do. + BBSizes[NewBBI] += 2; + delta = 4; + } + } + // All BBOffsets following these blocks must be modified. - AdjustBBOffsetsAfter(NewBB, delta); + if (delta) + AdjustBBOffsetsAfter(NewBB, delta); return NewBB; } @@ -915,6 +949,10 @@ void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB, } // Thumb1 jump tables require padding. They should be at the end; // following unconditional branches are removed by AnalyzeBranch. + // tBR_JTr expands to a mov pc followed by .align 2 and then the jump + // table entries. So this code checks whether offset of tBR_JTr + // is aligned; if it is, the offset of the jump table following the + // instruction will not be aligned, and we need padding. MachineInstr *ThumbJTMI = prior(MBB->end()); if (ThumbJTMI->getOpcode() == ARM::tBR_JTr) { unsigned NewMIOffset = GetOffsetOf(ThumbJTMI); @@ -1143,11 +1181,13 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, MachineBasicBlock::iterator MI = UserMI; ++MI; unsigned CPUIndex = CPUserIndex+1; + unsigned NumCPUsers = CPUsers.size(); + MachineInstr *LastIT = 0; for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI); Offset < BaseInsertOffset; Offset += TII->GetInstSizeInBytes(MI), - MI = llvm::next(MI)) { - if (CPUIndex < CPUsers.size() && CPUsers[CPUIndex].MI == MI) { + MI = llvm::next(MI)) { + if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) { CPUser &U = CPUsers[CPUIndex]; if (!OffsetIsInRange(Offset, EndInsertOffset, U.MaxDisp, U.NegOk, U.IsSoImm)) { @@ -1159,9 +1199,23 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, EndInsertOffset += CPUsers[CPUIndex].CPEMI->getOperand(2).getImm(); CPUIndex++; } + + // Remember the last IT instruction. + if (MI->getOpcode() == ARM::t2IT) + LastIT = MI; } + DEBUG(errs() << "Split in middle of big block\n"); - NewMBB = SplitBlockBeforeInstr(prior(MI)); + --MI; + + // Avoid splitting an IT block. 
+ if (LastIT) { + unsigned PredReg = 0; + ARMCC::CondCodes CC = llvm::getITInstrPredicate(MI, PredReg); + if (CC != ARMCC::AL) + MI = LastIT; + } + NewMBB = SplitBlockBeforeInstr(MI); } } diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 9c62597b4323b..fc2e3c3fadaea 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -19,14 +19,21 @@ #include "ARMBaseInstrInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" - +#include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; namespace { class ARMExpandPseudo : public MachineFunctionPass { + // Constants for register spacing in NEON load/store instructions. + enum NEONRegSpacing { + SingleSpc, + EvenDblSpc, + OddDblSpc + }; + public: static char ID; - ARMExpandPseudo() : MachineFunctionPass(&ID) {} + ARMExpandPseudo() : MachineFunctionPass(ID) {} const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; @@ -41,6 +48,10 @@ namespace { void TransferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI); bool ExpandMBB(MachineBasicBlock &MBB); + void ExpandVLD(MachineBasicBlock::iterator &MBBI, unsigned Opc, + bool hasWriteBack, NEONRegSpacing RegSpc, unsigned NumRegs); + void ExpandVST(MachineBasicBlock::iterator &MBBI, unsigned Opc, + bool hasWriteBack, NEONRegSpacing RegSpc, unsigned NumRegs); }; char ARMExpandPseudo::ID = 0; } @@ -63,6 +74,129 @@ void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI, } } +/// ExpandVLD - Translate VLD pseudo instructions with Q, QQ or QQQQ register +/// operands to real VLD instructions with D register operands. +void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI, + unsigned Opc, bool hasWriteBack, + NEONRegSpacing RegSpc, unsigned NumRegs) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)); + unsigned OpIdx = 0; + + bool DstIsDead = MI.getOperand(OpIdx).isDead(); + unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + unsigned D0, D1, D2, D3; + if (RegSpc == SingleSpc) { + D0 = TRI->getSubReg(DstReg, ARM::dsub_0); + D1 = TRI->getSubReg(DstReg, ARM::dsub_1); + D2 = TRI->getSubReg(DstReg, ARM::dsub_2); + D3 = TRI->getSubReg(DstReg, ARM::dsub_3); + } else if (RegSpc == EvenDblSpc) { + D0 = TRI->getSubReg(DstReg, ARM::dsub_0); + D1 = TRI->getSubReg(DstReg, ARM::dsub_2); + D2 = TRI->getSubReg(DstReg, ARM::dsub_4); + D3 = TRI->getSubReg(DstReg, ARM::dsub_6); + } else { + assert(RegSpc == OddDblSpc && "unknown register spacing for VLD"); + D0 = TRI->getSubReg(DstReg, ARM::dsub_1); + D1 = TRI->getSubReg(DstReg, ARM::dsub_3); + D2 = TRI->getSubReg(DstReg, ARM::dsub_5); + D3 = TRI->getSubReg(DstReg, ARM::dsub_7); + } + MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 2) + MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 3) + MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); + + if (hasWriteBack) { + bool WBIsDead = MI.getOperand(OpIdx).isDead(); + unsigned WBReg = MI.getOperand(OpIdx++).getReg(); + MIB.addReg(WBReg, RegState::Define | getDeadRegState(WBIsDead)); + } + // Copy the addrmode6 operands. 
+ bool AddrIsKill = MI.getOperand(OpIdx).isKill(); + MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(AddrIsKill)); + MIB.addImm(MI.getOperand(OpIdx++).getImm()); + if (hasWriteBack) { + // Copy the am6offset operand. + bool OffsetIsKill = MI.getOperand(OpIdx).isKill(); + MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(OffsetIsKill)); + } + + MIB = AddDefaultPred(MIB); + TransferImpOps(MI, MIB, MIB); + // For an instruction writing the odd subregs, add an implicit use of the + // super-register because the even subregs were loaded separately. + if (RegSpc == OddDblSpc) + MIB.addReg(DstReg, RegState::Implicit); + // Add an implicit def for the super-register. + MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); + MI.eraseFromParent(); +} + +/// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register +/// operands to real VST instructions with D register operands. +void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI, + unsigned Opc, bool hasWriteBack, + NEONRegSpacing RegSpc, unsigned NumRegs) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)); + unsigned OpIdx = 0; + if (hasWriteBack) { + bool DstIsDead = MI.getOperand(OpIdx).isDead(); + unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + MIB.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)); + } + // Copy the addrmode6 operands. + bool AddrIsKill = MI.getOperand(OpIdx).isKill(); + MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(AddrIsKill)); + MIB.addImm(MI.getOperand(OpIdx++).getImm()); + if (hasWriteBack) { + // Copy the am6offset operand. + bool OffsetIsKill = MI.getOperand(OpIdx).isKill(); + MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(OffsetIsKill)); + } + + bool SrcIsKill = MI.getOperand(OpIdx).isKill(); + unsigned SrcReg = MI.getOperand(OpIdx).getReg(); + unsigned D0, D1, D2, D3; + if (RegSpc == SingleSpc) { + D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); + D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); + D2 = TRI->getSubReg(SrcReg, ARM::dsub_2); + D3 = TRI->getSubReg(SrcReg, ARM::dsub_3); + } else if (RegSpc == EvenDblSpc) { + D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); + D1 = TRI->getSubReg(SrcReg, ARM::dsub_2); + D2 = TRI->getSubReg(SrcReg, ARM::dsub_4); + D3 = TRI->getSubReg(SrcReg, ARM::dsub_6); + } else { + assert(RegSpc == OddDblSpc && "unknown register spacing for VST"); + D0 = TRI->getSubReg(SrcReg, ARM::dsub_1); + D1 = TRI->getSubReg(SrcReg, ARM::dsub_3); + D2 = TRI->getSubReg(SrcReg, ARM::dsub_5); + D3 = TRI->getSubReg(SrcReg, ARM::dsub_7); + } + + MIB.addReg(D0).addReg(D1); + if (NumRegs > 2) + MIB.addReg(D2); + if (NumRegs > 3) + MIB.addReg(D3); + MIB = AddDefaultPred(MIB); + TransferImpOps(MI, MIB, MIB); + if (SrcIsKill) + // Add an implicit kill for the super-reg. 
+ (*MIB).addRegisterKilled(SrcReg, TRI, true); + MI.eraseFromParent(); +} + bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { bool Modified = false; @@ -71,9 +205,13 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { MachineInstr &MI = *MBBI; MachineBasicBlock::iterator NMBBI = llvm::next(MBBI); + bool ModifiedOp = true; unsigned Opcode = MI.getOpcode(); switch (Opcode) { - default: break; + default: + ModifiedOp = false; + break; + case ARM::tLDRpci_pic: case ARM::t2LDRpci_pic: { unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic) @@ -92,10 +230,10 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { .addOperand(MI.getOperand(2)); TransferImpOps(MI, MIB1, MIB2); MI.eraseFromParent(); - Modified = true; break; } + case ARM::MOVi32imm: case ARM::t2MOVi32imm: { unsigned PredReg = 0; ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg); @@ -104,9 +242,13 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { const MachineOperand &MO = MI.getOperand(1); MachineInstrBuilder LO16, HI16; - LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::t2MOVi16), + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == ARM::MOVi32imm ? + ARM::MOVi16 : ARM::t2MOVi16), DstReg); - HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::t2MOVTi16)) + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == ARM::MOVi32imm ? + ARM::MOVTi16 : ARM::t2MOVTi16)) .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)) .addReg(DstReg); @@ -128,7 +270,6 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { HI16.addImm(Pred).addReg(PredReg); TransferImpOps(MI, LO16, HI16); MI.eraseFromParent(); - Modified = true; break; } @@ -155,9 +296,211 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { .addReg(OddSrc, getKillRegState(SrcIsKill))); TransferImpOps(MI, Even, Odd); MI.eraseFromParent(); - Modified = true; } + + case ARM::VLD1q8Pseudo: + ExpandVLD(MBBI, ARM::VLD1q8, false, SingleSpc, 2); break; + case ARM::VLD1q16Pseudo: + ExpandVLD(MBBI, ARM::VLD1q16, false, SingleSpc, 2); break; + case ARM::VLD1q32Pseudo: + ExpandVLD(MBBI, ARM::VLD1q32, false, SingleSpc, 2); break; + case ARM::VLD1q64Pseudo: + ExpandVLD(MBBI, ARM::VLD1q64, false, SingleSpc, 2); break; + case ARM::VLD1q8Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD1q8, true, SingleSpc, 2); break; + case ARM::VLD1q16Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD1q16, true, SingleSpc, 2); break; + case ARM::VLD1q32Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD1q32, true, SingleSpc, 2); break; + case ARM::VLD1q64Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD1q64, true, SingleSpc, 2); break; + + case ARM::VLD2d8Pseudo: + ExpandVLD(MBBI, ARM::VLD2d8, false, SingleSpc, 2); break; + case ARM::VLD2d16Pseudo: + ExpandVLD(MBBI, ARM::VLD2d16, false, SingleSpc, 2); break; + case ARM::VLD2d32Pseudo: + ExpandVLD(MBBI, ARM::VLD2d32, false, SingleSpc, 2); break; + case ARM::VLD2q8Pseudo: + ExpandVLD(MBBI, ARM::VLD2q8, false, SingleSpc, 4); break; + case ARM::VLD2q16Pseudo: + ExpandVLD(MBBI, ARM::VLD2q16, false, SingleSpc, 4); break; + case ARM::VLD2q32Pseudo: + ExpandVLD(MBBI, ARM::VLD2q32, false, SingleSpc, 4); break; + case ARM::VLD2d8Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD2d8, true, SingleSpc, 2); break; + case ARM::VLD2d16Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD2d16, true, SingleSpc, 2); break; + case ARM::VLD2d32Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD2d32, true, SingleSpc, 2); break; + case ARM::VLD2q8Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD2q8, true, SingleSpc, 4); break; + case ARM::VLD2q16Pseudo_UPD: + ExpandVLD(MBBI, 
ARM::VLD2q16, true, SingleSpc, 4); break; + case ARM::VLD2q32Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD2q32, true, SingleSpc, 4); break; + + case ARM::VLD3d8Pseudo: + ExpandVLD(MBBI, ARM::VLD3d8, false, SingleSpc, 3); break; + case ARM::VLD3d16Pseudo: + ExpandVLD(MBBI, ARM::VLD3d16, false, SingleSpc, 3); break; + case ARM::VLD3d32Pseudo: + ExpandVLD(MBBI, ARM::VLD3d32, false, SingleSpc, 3); break; + case ARM::VLD1d64TPseudo: + ExpandVLD(MBBI, ARM::VLD1d64T, false, SingleSpc, 3); break; + case ARM::VLD3d8Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD3d8_UPD, true, SingleSpc, 3); break; + case ARM::VLD3d16Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD3d16_UPD, true, SingleSpc, 3); break; + case ARM::VLD3d32Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD3d32_UPD, true, SingleSpc, 3); break; + case ARM::VLD1d64TPseudo_UPD: + ExpandVLD(MBBI, ARM::VLD1d64T_UPD, true, SingleSpc, 3); break; + case ARM::VLD3q8Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD3q8_UPD, true, EvenDblSpc, 3); break; + case ARM::VLD3q16Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD3q16_UPD, true, EvenDblSpc, 3); break; + case ARM::VLD3q32Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD3q32_UPD, true, EvenDblSpc, 3); break; + case ARM::VLD3q8oddPseudo_UPD: + ExpandVLD(MBBI, ARM::VLD3q8_UPD, true, OddDblSpc, 3); break; + case ARM::VLD3q16oddPseudo_UPD: + ExpandVLD(MBBI, ARM::VLD3q16_UPD, true, OddDblSpc, 3); break; + case ARM::VLD3q32oddPseudo_UPD: + ExpandVLD(MBBI, ARM::VLD3q32_UPD, true, OddDblSpc, 3); break; + + case ARM::VLD4d8Pseudo: + ExpandVLD(MBBI, ARM::VLD4d8, false, SingleSpc, 4); break; + case ARM::VLD4d16Pseudo: + ExpandVLD(MBBI, ARM::VLD4d16, false, SingleSpc, 4); break; + case ARM::VLD4d32Pseudo: + ExpandVLD(MBBI, ARM::VLD4d32, false, SingleSpc, 4); break; + case ARM::VLD1d64QPseudo: + ExpandVLD(MBBI, ARM::VLD1d64Q, false, SingleSpc, 4); break; + case ARM::VLD4d8Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD4d8_UPD, true, SingleSpc, 4); break; + case ARM::VLD4d16Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD4d16_UPD, true, SingleSpc, 4); break; + case ARM::VLD4d32Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD4d32_UPD, true, SingleSpc, 4); break; + case ARM::VLD1d64QPseudo_UPD: + ExpandVLD(MBBI, ARM::VLD1d64Q_UPD, true, SingleSpc, 4); break; + case ARM::VLD4q8Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD4q8_UPD, true, EvenDblSpc, 4); break; + case ARM::VLD4q16Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD4q16_UPD, true, EvenDblSpc, 4); break; + case ARM::VLD4q32Pseudo_UPD: + ExpandVLD(MBBI, ARM::VLD4q32_UPD, true, EvenDblSpc, 4); break; + case ARM::VLD4q8oddPseudo_UPD: + ExpandVLD(MBBI, ARM::VLD4q8_UPD, true, OddDblSpc, 4); break; + case ARM::VLD4q16oddPseudo_UPD: + ExpandVLD(MBBI, ARM::VLD4q16_UPD, true, OddDblSpc, 4); break; + case ARM::VLD4q32oddPseudo_UPD: + ExpandVLD(MBBI, ARM::VLD4q32_UPD, true, OddDblSpc, 4); break; + + case ARM::VST1q8Pseudo: + ExpandVST(MBBI, ARM::VST1q8, false, SingleSpc, 2); break; + case ARM::VST1q16Pseudo: + ExpandVST(MBBI, ARM::VST1q16, false, SingleSpc, 2); break; + case ARM::VST1q32Pseudo: + ExpandVST(MBBI, ARM::VST1q32, false, SingleSpc, 2); break; + case ARM::VST1q64Pseudo: + ExpandVST(MBBI, ARM::VST1q64, false, SingleSpc, 2); break; + case ARM::VST1q8Pseudo_UPD: + ExpandVST(MBBI, ARM::VST1q8_UPD, true, SingleSpc, 2); break; + case ARM::VST1q16Pseudo_UPD: + ExpandVST(MBBI, ARM::VST1q16_UPD, true, SingleSpc, 2); break; + case ARM::VST1q32Pseudo_UPD: + ExpandVST(MBBI, ARM::VST1q32_UPD, true, SingleSpc, 2); break; + case ARM::VST1q64Pseudo_UPD: + ExpandVST(MBBI, ARM::VST1q64_UPD, true, SingleSpc, 2); break; + + case ARM::VST2d8Pseudo: + ExpandVST(MBBI, ARM::VST2d8, false, 
SingleSpc, 2); break; + case ARM::VST2d16Pseudo: + ExpandVST(MBBI, ARM::VST2d16, false, SingleSpc, 2); break; + case ARM::VST2d32Pseudo: + ExpandVST(MBBI, ARM::VST2d32, false, SingleSpc, 2); break; + case ARM::VST2q8Pseudo: + ExpandVST(MBBI, ARM::VST2q8, false, SingleSpc, 4); break; + case ARM::VST2q16Pseudo: + ExpandVST(MBBI, ARM::VST2q16, false, SingleSpc, 4); break; + case ARM::VST2q32Pseudo: + ExpandVST(MBBI, ARM::VST2q32, false, SingleSpc, 4); break; + case ARM::VST2d8Pseudo_UPD: + ExpandVST(MBBI, ARM::VST2d8_UPD, true, SingleSpc, 2); break; + case ARM::VST2d16Pseudo_UPD: + ExpandVST(MBBI, ARM::VST2d16_UPD, true, SingleSpc, 2); break; + case ARM::VST2d32Pseudo_UPD: + ExpandVST(MBBI, ARM::VST2d32_UPD, true, SingleSpc, 2); break; + case ARM::VST2q8Pseudo_UPD: + ExpandVST(MBBI, ARM::VST2q8_UPD, true, SingleSpc, 4); break; + case ARM::VST2q16Pseudo_UPD: + ExpandVST(MBBI, ARM::VST2q16_UPD, true, SingleSpc, 4); break; + case ARM::VST2q32Pseudo_UPD: + ExpandVST(MBBI, ARM::VST2q32_UPD, true, SingleSpc, 4); break; + + case ARM::VST3d8Pseudo: + ExpandVST(MBBI, ARM::VST3d8, false, SingleSpc, 3); break; + case ARM::VST3d16Pseudo: + ExpandVST(MBBI, ARM::VST3d16, false, SingleSpc, 3); break; + case ARM::VST3d32Pseudo: + ExpandVST(MBBI, ARM::VST3d32, false, SingleSpc, 3); break; + case ARM::VST1d64TPseudo: + ExpandVST(MBBI, ARM::VST1d64T, false, SingleSpc, 3); break; + case ARM::VST3d8Pseudo_UPD: + ExpandVST(MBBI, ARM::VST3d8_UPD, true, SingleSpc, 3); break; + case ARM::VST3d16Pseudo_UPD: + ExpandVST(MBBI, ARM::VST3d16_UPD, true, SingleSpc, 3); break; + case ARM::VST3d32Pseudo_UPD: + ExpandVST(MBBI, ARM::VST3d32_UPD, true, SingleSpc, 3); break; + case ARM::VST1d64TPseudo_UPD: + ExpandVST(MBBI, ARM::VST1d64T_UPD, true, SingleSpc, 3); break; + case ARM::VST3q8Pseudo_UPD: + ExpandVST(MBBI, ARM::VST3q8_UPD, true, EvenDblSpc, 3); break; + case ARM::VST3q16Pseudo_UPD: + ExpandVST(MBBI, ARM::VST3q16_UPD, true, EvenDblSpc, 3); break; + case ARM::VST3q32Pseudo_UPD: + ExpandVST(MBBI, ARM::VST3q32_UPD, true, EvenDblSpc, 3); break; + case ARM::VST3q8oddPseudo_UPD: + ExpandVST(MBBI, ARM::VST3q8_UPD, true, OddDblSpc, 3); break; + case ARM::VST3q16oddPseudo_UPD: + ExpandVST(MBBI, ARM::VST3q16_UPD, true, OddDblSpc, 3); break; + case ARM::VST3q32oddPseudo_UPD: + ExpandVST(MBBI, ARM::VST3q32_UPD, true, OddDblSpc, 3); break; + + case ARM::VST4d8Pseudo: + ExpandVST(MBBI, ARM::VST4d8, false, SingleSpc, 4); break; + case ARM::VST4d16Pseudo: + ExpandVST(MBBI, ARM::VST4d16, false, SingleSpc, 4); break; + case ARM::VST4d32Pseudo: + ExpandVST(MBBI, ARM::VST4d32, false, SingleSpc, 4); break; + case ARM::VST1d64QPseudo: + ExpandVST(MBBI, ARM::VST1d64Q, false, SingleSpc, 4); break; + case ARM::VST4d8Pseudo_UPD: + ExpandVST(MBBI, ARM::VST4d8_UPD, true, SingleSpc, 4); break; + case ARM::VST4d16Pseudo_UPD: + ExpandVST(MBBI, ARM::VST4d16_UPD, true, SingleSpc, 4); break; + case ARM::VST4d32Pseudo_UPD: + ExpandVST(MBBI, ARM::VST4d32_UPD, true, SingleSpc, 4); break; + case ARM::VST1d64QPseudo_UPD: + ExpandVST(MBBI, ARM::VST1d64Q_UPD, true, SingleSpc, 4); break; + case ARM::VST4q8Pseudo_UPD: + ExpandVST(MBBI, ARM::VST4q8_UPD, true, EvenDblSpc, 4); break; + case ARM::VST4q16Pseudo_UPD: + ExpandVST(MBBI, ARM::VST4q16_UPD, true, EvenDblSpc, 4); break; + case ARM::VST4q32Pseudo_UPD: + ExpandVST(MBBI, ARM::VST4q32_UPD, true, EvenDblSpc, 4); break; + case ARM::VST4q8oddPseudo_UPD: + ExpandVST(MBBI, ARM::VST4q8_UPD, true, OddDblSpc, 4); break; + case ARM::VST4q16oddPseudo_UPD: + ExpandVST(MBBI, ARM::VST4q16_UPD, true, OddDblSpc, 4); break; + 
case ARM::VST4q32oddPseudo_UPD: + ExpandVST(MBBI, ARM::VST4q32_UPD, true, OddDblSpc, 4); break; } + + if (ModifiedOp) + Modified = true; MBBI = NMBBI; } diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp new file mode 100644 index 0000000000000..4892eae95833b --- /dev/null +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -0,0 +1,665 @@ +//===-- ARMFastISel.cpp - ARM FastISel implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the ARM-specific support for the FastISel class. Some +// of the target-specific code is generated by tablegen in the file +// ARMGenFastISel.inc, which is #included here. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMRegisterInfo.h" +#include "ARMTargetMachine.h" +#include "ARMSubtarget.h" +#include "llvm/CallingConv.h" +#include "llvm/DerivedTypes.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +static cl::opt<bool> +EnableARMFastISel("arm-fast-isel", + cl::desc("Turn on experimental ARM fast-isel support"), + cl::init(false), cl::Hidden); + +namespace { + +class ARMFastISel : public FastISel { + + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. + const ARMSubtarget *Subtarget; + const TargetMachine &TM; + const TargetInstrInfo &TII; + const TargetLowering &TLI; + const ARMFunctionInfo *AFI; + + // Convenience variable to avoid checking all the time. + bool isThumb; + + public: + explicit ARMFastISel(FunctionLoweringInfo &funcInfo) + : FastISel(funcInfo), + TM(funcInfo.MF->getTarget()), + TII(*TM.getInstrInfo()), + TLI(*TM.getTargetLowering()) { + Subtarget = &TM.getSubtarget<ARMSubtarget>(); + AFI = funcInfo.MF->getInfo<ARMFunctionInfo>(); + isThumb = AFI->isThumbFunction(); + } + + // Code from FastISel.cpp. 
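+  // These FastEmitInst_* overloads shadow the generic emitters in
+  // FastISel.cpp: the bodies below are near-copies that route every
+  // emitted instruction through AddOptionalDefs, so the predicate and
+  // optional-CC operands that ARM instructions expect get filled in.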
+ virtual unsigned FastEmitInst_(unsigned MachineInstOpcode, + const TargetRegisterClass *RC); + virtual unsigned FastEmitInst_r(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill); + virtual unsigned FastEmitInst_rr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill); + virtual unsigned FastEmitInst_ri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + uint64_t Imm); + virtual unsigned FastEmitInst_rf(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + const ConstantFP *FPImm); + virtual unsigned FastEmitInst_i(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm); + virtual unsigned FastEmitInst_rri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + uint64_t Imm); + virtual unsigned FastEmitInst_extractsubreg(MVT RetVT, + unsigned Op0, bool Op0IsKill, + uint32_t Idx); + + // Backend specific FastISel code. + virtual bool TargetSelectInstruction(const Instruction *I); + virtual unsigned TargetMaterializeConstant(const Constant *C); + + #include "ARMGenFastISel.inc" + + // Instruction selection routines. + virtual bool ARMSelectLoad(const Instruction *I); + virtual bool ARMSelectStore(const Instruction *I); + virtual bool ARMSelectBranch(const Instruction *I); + + // Utility routines. + private: + bool isTypeLegal(const Type *Ty, EVT &VT); + bool isLoadTypeLegal(const Type *Ty, EVT &VT); + bool ARMEmitLoad(EVT VT, unsigned &ResultReg, unsigned Reg, int Offset); + bool ARMEmitStore(EVT VT, unsigned SrcReg, unsigned Reg, int Offset); + bool ARMLoadAlloca(const Instruction *I); + bool ARMStoreAlloca(const Instruction *I, unsigned SrcReg); + bool ARMComputeRegOffset(const Value *Obj, unsigned &Reg, int &Offset); + bool ARMMaterializeConstant(const ConstantInt *Val, unsigned &Reg); + + bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR); + const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB); +}; + +} // end anonymous namespace + +// #include "ARMGenCallingConv.inc" + +// DefinesOptionalPredicate - This is different from DefinesPredicate in that +// we don't care about implicit defs here, just places we'll need to add a +// default CCReg argument. Sets CPSR if we're setting CPSR instead of CCR. +bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { + const TargetInstrDesc &TID = MI->getDesc(); + if (!TID.hasOptionalDef()) + return false; + + // Look to see if our OptionalDef is defining CPSR or CCR. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) continue; + if (MO.getReg() == ARM::CPSR) + *CPSR = true; + } + return true; +} + +// If the machine is predicable go ahead and add the predicate operands, if +// it needs default CC operands add those. +const MachineInstrBuilder & +ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { + MachineInstr *MI = &*MIB; + + // Do we use a predicate? + if (TII.isPredicable(MI)) + AddDefaultPred(MIB); + + // Do we optionally set a predicate? Preds is size > 0 iff the predicate + // defines CPSR. All other OptionalDefines in ARM are the CCR register. 
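+  // (AddDefaultT1CC marks CPSR as a def for Thumb1-style instructions,
+  // while AddDefaultCC appends a zero cc_out operand, i.e. "don't set
+  // flags".)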
+ bool CPSR = false; + if (DefinesOptionalPredicate(MI, &CPSR)) { + if (CPSR) + AddDefaultT1CC(MIB); + else + AddDefaultCC(MIB); + } + return MIB; +} + +unsigned ARMFastISel::FastEmitInst_(unsigned MachineInstOpcode, + const TargetRegisterClass* RC) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)); + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_rr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_ri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + uint64_t Imm) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addImm(Imm)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addImm(Imm)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_rf(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + const ConstantFP *FPImm) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addFPImm(FPImm)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addFPImm(FPImm)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_rri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, 
bool Op1IsKill, + uint64_t Imm) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addImm(Imm)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addImm(Imm)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_i(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addImm(Imm)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addImm(Imm)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_extractsubreg(MVT RetVT, + unsigned Op0, bool Op0IsKill, + uint32_t Idx) { + unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT)); + assert(TargetRegisterInfo::isVirtualRegister(Op0) && + "Cannot yet extract from physregs"); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(TargetOpcode::COPY), ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill), Idx)); + return ResultReg; +} + +unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) { + EVT VT = TLI.getValueType(C->getType(), true); + + // Only handle simple types. + if (!VT.isSimple()) return 0; + + // TODO: This should be safe for fp because they're just bits from the + // Constant. + // TODO: Theoretically we could materialize fp constants with instructions + // from VFP3. + + // MachineConstantPool wants an explicit alignment. + unsigned Align = TD.getPrefTypeAlignment(C->getType()); + if (Align == 0) { + // TODO: Figure out if this is correct. + Align = TD.getTypeAllocSize(C->getType()); + } + unsigned Idx = MCP.getConstantPoolIndex(C, Align); + + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + // Different addressing modes between ARM/Thumb2 for constant pool loads. + if (isThumb) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::t2LDRpci)) + .addReg(DestReg).addConstantPoolIndex(Idx)); + else + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::LDRcp)) + .addReg(DestReg).addConstantPoolIndex(Idx) + .addReg(0).addImm(0)); + + return DestReg; +} + +bool ARMFastISel::isTypeLegal(const Type *Ty, EVT &VT) { + VT = TLI.getValueType(Ty, true); + + // Only handle simple types. + if (VT == MVT::Other || !VT.isSimple()) return false; + + // Handle all legal types, i.e. a register that will directly hold this + // value. + return TLI.isTypeLegal(VT); +} + +bool ARMFastISel::isLoadTypeLegal(const Type *Ty, EVT &VT) { + if (isTypeLegal(Ty, VT)) return true; + + // If this is a type than can be sign or zero-extended to a basic operation + // go ahead and accept it now. + if (VT == MVT::i8 || VT == MVT::i16) + return true; + + return false; +} + +// Computes the Reg+Offset to get to an object. 
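+// The decomposition is deliberately simple: anything that is not a
+// frame-index alloca gets materialized into a base register, and any
+// non-zero offset is folded into that register up front (via
+// emitARMRegPlusImmediate / emitT2RegPlusImmediate) instead of being
+// encoded in the load/store addressing mode.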
+bool ARMFastISel::ARMComputeRegOffset(const Value *Obj, unsigned &Reg, + int &Offset) { + // Some boilerplate from the X86 FastISel. + const User *U = NULL; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(Obj)) { + // Don't walk into other basic blocks; it's possible we haven't + // visited them yet, so the instructions may not yet be assigned + // virtual registers. + if (FuncInfo.MBBMap[I->getParent()] != FuncInfo.MBB) + return false; + + Opcode = I->getOpcode(); + U = I; + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) { + Opcode = C->getOpcode(); + U = C; + } + + if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + switch (Opcode) { + default: + //errs() << "Failing Opcode is: " << *Op1 << "\n"; + break; + case Instruction::Alloca: { + assert(false && "Alloca should have been handled earlier!"); + return false; + } + } + + if (const GlobalValue *GV = dyn_cast<GlobalValue>(Obj)) { + //errs() << "Failing GV is: " << GV << "\n"; + (void)GV; + return false; + } + + // Try to get this in a register if nothing else has worked. + Reg = getRegForValue(Obj); + if (Reg == 0) return false; + + // Since the offset may be too large for the load instruction + // get the reg+offset into a register. + // TODO: Verify the additions work, otherwise we'll need to add the + // offset instead of 0 to the instructions and do all sorts of operand + // munging. + // TODO: Optimize this somewhat. + if (Offset != 0) { + ARMCC::CondCodes Pred = ARMCC::AL; + unsigned PredReg = 0; + + if (!isThumb) + emitARMRegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + Reg, Reg, Offset, Pred, PredReg, + static_cast<const ARMBaseInstrInfo&>(TII)); + else { + assert(AFI->isThumb2Function()); + emitT2RegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + Reg, Reg, Offset, Pred, PredReg, + static_cast<const ARMBaseInstrInfo&>(TII)); + } + } + + return true; +} + +bool ARMFastISel::ARMLoadAlloca(const Instruction *I) { + Value *Op0 = I->getOperand(0); + + // Verify it's an alloca. + if (const AllocaInst *AI = dyn_cast<AllocaInst>(Op0)) { + DenseMap<const AllocaInst*, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + + if (SI != FuncInfo.StaticAllocaMap.end()) { + TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); + unsigned ResultReg = createResultReg(RC); + TII.loadRegFromStackSlot(*FuncInfo.MBB, *FuncInfo.InsertPt, + ResultReg, SI->second, RC, + TM.getRegisterInfo()); + UpdateValueMap(I, ResultReg); + return true; + } + } + return false; +} + +bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, + unsigned Reg, int Offset) { + + assert(VT.isSimple() && "Non-simple types are invalid here!"); + unsigned Opc; + + switch (VT.getSimpleVT().SimpleTy) { + default: + assert(false && "Trying to emit for an unhandled type!"); + return false; + case MVT::i16: + Opc = isThumb ? ARM::tLDRH : ARM::LDRH; + VT = MVT::i32; + break; + case MVT::i8: + Opc = isThumb ? ARM::tLDRB : ARM::LDRB; + VT = MVT::i32; + break; + case MVT::i32: + Opc = isThumb ? ARM::tLDR : ARM::LDR; + break; + } + + ResultReg = createResultReg(TLI.getRegClassFor(VT)); + + // TODO: Fix the Addressing modes so that these can share some code. + // Since this is a Thumb1 load this will work in Thumb1 or 2 mode. 
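+  // Note the operand-order difference below: the Thumb form takes
+  // (base, imm, offset-reg) while the ARM addrmode2 form takes
+  // (base, offset-reg, imm); the zero register disables the
+  // register-offset part in both cases.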
+  if (isThumb)
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(Opc), ResultReg)
+                    .addReg(Reg).addImm(Offset).addReg(0));
+  else
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(Opc), ResultReg)
+                    .addReg(Reg).addReg(0).addImm(Offset));
+
+  return true;
+}
+
+bool ARMFastISel::ARMStoreAlloca(const Instruction *I, unsigned SrcReg) {
+  Value *Op1 = I->getOperand(1);
+
+  // Verify it's an alloca.
+  if (const AllocaInst *AI = dyn_cast<AllocaInst>(Op1)) {
+    DenseMap<const AllocaInst*, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(AI);
+
+    if (SI != FuncInfo.StaticAllocaMap.end()) {
+      TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
+      assert(SrcReg != 0 && "Nothing to store!");
+      TII.storeRegToStackSlot(*FuncInfo.MBB, *FuncInfo.InsertPt,
+                              SrcReg, true /*isKill*/, SI->second, RC,
+                              TM.getRegisterInfo());
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg,
+                               unsigned DstReg, int Offset) {
+  unsigned StrOpc;
+  switch (VT.getSimpleVT().SimpleTy) {
+    default: return false;
+    case MVT::i1:
+    case MVT::i8: StrOpc = isThumb ? ARM::tSTRB : ARM::STRB; break;
+    case MVT::i16: StrOpc = isThumb ? ARM::tSTRH : ARM::STRH; break;
+    case MVT::i32: StrOpc = isThumb ? ARM::tSTR : ARM::STR; break;
+    case MVT::f32:
+      if (!Subtarget->hasVFP2()) return false;
+      StrOpc = ARM::VSTRS;
+      break;
+    case MVT::f64:
+      if (!Subtarget->hasVFP2()) return false;
+      StrOpc = ARM::VSTRD;
+      break;
+  }
+
+  if (isThumb)
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(StrOpc), SrcReg)
+                    .addReg(DstReg).addImm(Offset).addReg(0));
+  else
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(StrOpc), SrcReg)
+                    .addReg(DstReg).addReg(0).addImm(Offset));
+
+  return true;
+}
+
+bool ARMFastISel::ARMSelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // Verify we have a legal type for the stored value before going further.
+  EVT VT;
+  if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0)
+    return false;
+
+  // If we're an alloca we know we have a frame index and can emit the store
+  // quickly.
+  if (ARMStoreAlloca(I, SrcReg))
+    return true;
+
+  // Our register and offset with innocuous defaults.
+  unsigned Reg = 0;
+  int Offset = 0;
+
+  // See if we can handle this as Reg + Offset.
+  if (!ARMComputeRegOffset(I->getOperand(1), Reg, Offset))
+    return false;
+
+  if (!ARMEmitStore(VT, SrcReg, Reg, Offset /* 0 */)) return false;
+
+  return true;
+}
+
+bool ARMFastISel::ARMSelectLoad(const Instruction *I) {
+  // If we're an alloca we know we have a frame index and can emit the load
+  // directly in short order.
+  if (ARMLoadAlloca(I))
+    return true;
+
+  // Verify we have a legal type before going any further.
+  EVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // Our register and offset with innocuous defaults.
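+  // (ARMComputeRegOffset folds any non-zero offset into Reg, which is
+  // why the emit helpers below are handed "Offset /* 0 */".)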
+  unsigned Reg = 0;
+  int Offset = 0;
+
+  // See if we can handle this as Reg + Offset.
+  if (!ARMComputeRegOffset(I->getOperand(0), Reg, Offset))
+    return false;
+
+  unsigned ResultReg;
+  if (!ARMEmitLoad(VT, ResultReg, Reg, Offset /* 0 */)) return false;
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::ARMSelectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // Simple branch support: compare the condition against zero and branch
+  // to the true block if it is set.
+  unsigned CondReg = getRegForValue(BI->getCondition());
+  if (CondReg == 0) return false;
+
+  unsigned CmpOpc = isThumb ? ARM::t2CMPri : ARM::CMPri;
+  unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc;
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
+                  .addReg(CondReg).addImm(0));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc))
+    .addMBB(TBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+  FastEmitBranch(FBB, DL);
+  FuncInfo.MBB->addSuccessor(TBB);
+  return true;
+}
+
+// TODO: SoftFP support.
+bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
+  // No Thumb-1 for now.
+  if (isThumb && !AFI->isThumb2Function()) return false;
+
+  switch (I->getOpcode()) {
+    case Instruction::Load:
+      return ARMSelectLoad(I);
+    case Instruction::Store:
+      return ARMSelectStore(I);
+    case Instruction::Br:
+      return ARMSelectBranch(I);
+    default: break;
+  }
+  return false;
+}
+
+namespace llvm {
+  llvm::FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo) {
+    if (EnableARMFastISel) return new ARMFastISel(funcInfo);
+    return 0;
+  }
+}
diff --git a/lib/Target/ARM/ARMGlobalMerge.cpp b/lib/Target/ARM/ARMGlobalMerge.cpp
new file mode 100644
index 0000000000000..85b0c6c248d02
--- /dev/null
+++ b/lib/Target/ARM/ARMGlobalMerge.cpp
@@ -0,0 +1,212 @@
+//===-- ARMGlobalMerge.cpp - Internal globals merging --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass merges globals with internal linkage into one. This way all the
+// globals merged into the single big one can be addressed using offsets from
+// the same base pointer (there is no need for a separate base pointer for
+// each of them). Such a transformation can significantly reduce register
+// pressure when many globals are involved.
+//
+// For example, consider the code which touches several global variables at
+// once:
+//
+// static int foo[N], bar[N], baz[N];
+//
+// for (i = 0; i < N; ++i) {
+//    foo[i] = bar[i] * baz[i];
+// }
+//
+// On ARM, the addresses of all 3 arrays must be kept in registers, so this
+// code has fairly high register pressure (loop body):
+//
+// ldr r1, [r5], #4
+// ldr r2, [r6], #4
+// mul r1, r2, r1
+// str r1, [r0], #4
+//
+// The pass converts the code to something like:
+//
+// static struct {
+//   int foo[N];
+//   int bar[N];
+//   int baz[N];
+// } merged;
+//
+// for (i = 0; i < N; ++i) {
+//    merged.foo[i] = merged.bar[i] * merged.baz[i];
+// }
+//
+// and in ARM code this becomes:
+//
+// ldr r0, [r5, #40]
+// ldr r1, [r5, #80]
+// mul r0, r1, r0
+// str r0, [r5], #4
+//
+// Note that we saved 2 registers here almost "for free".
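+//
+// The grouping performed by doMerge below is a simple greedy pass over the
+// size-sorted globals. As a rough standalone sketch of just that packing
+// logic (plain sizes instead of GlobalVariables, and kMaxOffset standing in
+// for TLI->getMaximalGlobalOffset()):
+//
+//   #include <cstddef>
+//   #include <vector>
+//
+//   std::vector<std::vector<size_t> >
+//   packGlobals(const std::vector<size_t> &Sizes, size_t kMaxOffset) {
+//     // Sizes is assumed already sorted, as doMerge sorts its globals.
+//     std::vector<std::vector<size_t> > Groups;
+//     for (size_t i = 0, e = Sizes.size(); i != e; ) {
+//       size_t j = i;
+//       size_t MergedSize = 0;
+//       std::vector<size_t> Group;
+//       // Keep packing while the running size is still under the limit;
+//       // like doMerge, this admits the element that crosses it.
+//       for (; MergedSize < kMaxOffset && j != e; ++j) {
+//         Group.push_back(Sizes[j]);
+//         MergedSize += Sizes[j];
+//       }
+//       Groups.push_back(Group);
+//       i = j;
+//     }
+//     return Groups;
+//   }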
+// ===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-global-merge" +#include "ARM.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Attributes.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLowering.h" +using namespace llvm; + +namespace { + class LLVM_LIBRARY_VISIBILITY ARMGlobalMerge : public FunctionPass { + /// TLI - Keep a pointer of a TargetLowering to consult for determining + /// target type sizes. + const TargetLowering *TLI; + + bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, + Module &M, bool) const; + + public: + static char ID; // Pass identification, replacement for typeid. + explicit ARMGlobalMerge(const TargetLowering *tli) + : FunctionPass(ID), TLI(tli) {} + + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function& F); + + const char *getPassName() const { + return "Merge internal globals"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } + + struct GlobalCmp { + const TargetData *TD; + + GlobalCmp(const TargetData *td): + TD(td) { } + + bool operator() (const GlobalVariable* GV1, + const GlobalVariable* GV2) { + const Type* Ty1 = cast<PointerType>(GV1->getType())->getElementType(); + const Type* Ty2 = cast<PointerType>(GV2->getType())->getElementType(); + + return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2)); + } + }; + }; +} // end anonymous namespace + +char ARMGlobalMerge::ID = 0; + +bool ARMGlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, + Module &M, bool isConst) const { + const TargetData *TD = TLI->getTargetData(); + + // FIXME: Infer the maximum possible offset depending on the actual users + // (these max offsets are different for the users inside Thumb or ARM + // functions) + unsigned MaxOffset = TLI->getMaximalGlobalOffset(); + + // FIXME: Find better heuristics + std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD)); + + const Type *Int32Ty = Type::getInt32Ty(M.getContext()); + + for (size_t i = 0, e = Globals.size(); i != e; ) { + size_t j = 0; + uint64_t MergedSize = 0; + std::vector<const Type*> Tys; + std::vector<Constant*> Inits; + for (j = i; MergedSize < MaxOffset && j != e; ++j) { + const Type* Ty = Globals[j]->getType()->getElementType(); + Tys.push_back(Ty); + Inits.push_back(Globals[j]->getInitializer()); + MergedSize += TD->getTypeAllocSize(Ty); + } + + StructType* MergedTy = StructType::get(M.getContext(), Tys); + Constant* MergedInit = ConstantStruct::get(MergedTy, Inits); + GlobalVariable* MergedGV = new GlobalVariable(M, MergedTy, isConst, + GlobalValue::InternalLinkage, + MergedInit, "merged"); + for (size_t k = i; k < j; ++k) { + SmallVector<Constant*, 2> Idx; + Idx.push_back(ConstantInt::get(Int32Ty, 0)); + Idx.push_back(ConstantInt::get(Int32Ty, k-i)); + + Constant* GEP = + ConstantExpr::getInBoundsGetElementPtr(MergedGV, + &Idx[0], Idx.size()); + + Globals[k]->replaceAllUsesWith(GEP); + Globals[k]->eraseFromParent(); + } + i = j; + } + + return true; +} + + +bool ARMGlobalMerge::doInitialization(Module& M) { + SmallVector<GlobalVariable*, 16> Globals, ConstGlobals; + const TargetData *TD = TLI->getTargetData(); + unsigned MaxOffset = TLI->getMaximalGlobalOffset(); + bool Changed 
= false; + + // Grab all non-const globals. + for (Module::global_iterator I = M.global_begin(), + E = M.global_end(); I != E; ++I) { + // Merge is safe for "normal" internal globals only + if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection()) + continue; + + // Ignore fancy-aligned globals for now. + if (I->getAlignment() != 0) + continue; + + // Ignore all 'special' globals. + if (I->getName().startswith("llvm.") || + I->getName().startswith(".llvm.")) + continue; + + if (TD->getTypeAllocSize(I->getType()) < MaxOffset) { + if (I->isConstant()) + ConstGlobals.push_back(I); + else + Globals.push_back(I); + } + } + + if (Globals.size() > 1) + Changed |= doMerge(Globals, M, false); + // FIXME: This currently breaks the EH processing due to way how the + // typeinfo detection works. We might want to detect the TIs and ignore + // them in the future. + + // if (ConstGlobals.size() > 1) + // Changed |= doMerge(ConstGlobals, M, true); + + return Changed; +} + +bool ARMGlobalMerge::runOnFunction(Function& F) { + return false; +} + +FunctionPass *llvm::createARMGlobalMergePass(const TargetLowering *tli) { + return new ARMGlobalMerge(tli); +} diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index c84d3ff813248..51a30c158dd1b 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -36,6 +36,11 @@ using namespace llvm; +static cl::opt<bool> +DisableShifterOp("disable-shifter-op", cl::Hidden, + cl::desc("Disable isel of shifter-op"), + cl::init(false)); + //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine /// instructions for SelectionDAG operations. @@ -113,6 +118,16 @@ public: bool SelectT2AddrModeSoReg(SDNode *Op, SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm); + inline bool Pred_so_imm(SDNode *inN) const { + ConstantSDNode *N = cast<ConstantSDNode>(inN); + return ARM_AM::getSOImmVal(N->getZExtValue()) != -1; + } + + inline bool Pred_t2_so_imm(SDNode *inN) const { + ConstantSDNode *N = cast<ConstantSDNode>(inN); + return ARM_AM::getT2SOImmVal(N->getZExtValue()) != -1; + } + // Include the pieces autogenerated from the target description. #include "ARMGenDAGISel.inc" @@ -220,6 +235,9 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDNode *Op, SDValue &BaseReg, SDValue &ShReg, SDValue &Opc) { + if (DisableShifterOp) + return false; + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N); // Don't match base register only case. That is matched to a separate @@ -463,7 +481,7 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N, bool ARMDAGToDAGISel::SelectAddrMode4(SDNode *Op, SDValue N, SDValue &Addr, SDValue &Mode) { Addr = N; - Mode = CurDAG->getTargetConstant(0, MVT::i32); + Mode = CurDAG->getTargetConstant(ARM_AM::getAM4ModeImm(ARM_AM::ia), MVT::i32); return true; } @@ -666,6 +684,9 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDNode *Op, SDValue N, bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDNode *Op, SDValue N, SDValue &BaseReg, SDValue &Opc) { + if (DisableShifterOp) + return false; + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N); // Don't match base register only case. That is matched to a separate @@ -1090,110 +1111,79 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, break; } + EVT ResTy; + if (NumVecs == 1) + ResTy = VT; + else { + unsigned ResTyElts = (NumVecs == 3) ? 
4 : NumVecs; + if (!is64BitVector) + ResTyElts *= 2; + ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts); + } + SDValue Pred = getAL(CurDAG); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue SuperReg; if (is64BitVector) { unsigned Opc = DOpcodes[OpcodeIndex]; const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; - std::vector<EVT> ResTys(NumVecs, VT); - ResTys.push_back(MVT::Other); - SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 5); - if (NumVecs < 2) + SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, Ops, 5); + if (NumVecs == 1) return VLd; - SDValue RegSeq; - SDValue V0 = SDValue(VLd, 0); - SDValue V1 = SDValue(VLd, 1); - - // Form a REG_SEQUENCE to force register allocation. - if (NumVecs == 2) - RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); - else { - SDValue V2 = SDValue(VLd, 2); - // If it's a vld3, form a quad D-register but discard the last part. - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : SDValue(VLd, 3); - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); - } - + SuperReg = SDValue(VLd, 0); assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { SDValue D = CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec, - dl, VT, RegSeq); + dl, VT, SuperReg); ReplaceUses(SDValue(N, Vec), D); } - ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, NumVecs)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); return NULL; } - EVT RegVT = GetNEONSubregVT(VT); if (NumVecs <= 2) { // Quad registers are directly supported for VLD1 and VLD2, // loading pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; - std::vector<EVT> ResTys(2 * NumVecs, RegVT); - ResTys.push_back(MVT::Other); - SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 5); - Chain = SDValue(VLd, 2 * NumVecs); + SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, Ops, 5); + if (NumVecs == 1) + return VLd; + + SuperReg = SDValue(VLd, 0); + Chain = SDValue(VLd, 1); - // Combine the even and odd subregs to produce the result. - if (NumVecs == 1) { - SDNode *Q = PairDRegs(VT, SDValue(VLd, 0), SDValue(VLd, 1)); - ReplaceUses(SDValue(N, 0), SDValue(Q, 0)); - } else { - SDValue QQ = SDValue(QuadDRegs(MVT::v4i64, - SDValue(VLd, 0), SDValue(VLd, 1), - SDValue(VLd, 2), SDValue(VLd, 3)), 0); - SDValue Q0 = CurDAG->getTargetExtractSubreg(ARM::qsub_0, dl, VT, QQ); - SDValue Q1 = CurDAG->getTargetExtractSubreg(ARM::qsub_1, dl, VT, QQ); - ReplaceUses(SDValue(N, 0), Q0); - ReplaceUses(SDValue(N, 1), Q1); - } } else { // Otherwise, quad registers are loaded with two separate instructions, // where one loads the even registers and the other loads the odd registers. - - std::vector<EVT> ResTys(NumVecs, RegVT); - ResTys.push_back(MemAddr.getValueType()); - ResTys.push_back(MVT::Other); + EVT AddrTy = MemAddr.getValueType(); // Load the even subregs. unsigned Opc = QOpcodes0[OpcodeIndex]; - const SDValue OpsA[] = { MemAddr, Align, Reg0, Pred, Reg0, Chain }; - SDNode *VLdA = CurDAG->getMachineNode(Opc, dl, ResTys, OpsA, 6); - Chain = SDValue(VLdA, NumVecs+1); + SDValue ImplDef = + SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0); + const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain }; + SDNode *VLdA = + CurDAG->getMachineNode(Opc, dl, ResTy, AddrTy, MVT::Other, OpsA, 7); + Chain = SDValue(VLdA, 2); // Load the odd subregs. 
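+      // The even-half load starts from an IMPLICIT_DEF of the wide result
+      // type; its result 0 is the partially-filled super-register and its
+      // result 1 the updated address, and both are threaded into the
+      // odd-half load below.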
Opc = QOpcodes1[OpcodeIndex]; - const SDValue OpsB[] = { SDValue(VLdA, NumVecs), - Align, Reg0, Pred, Reg0, Chain }; - SDNode *VLdB = CurDAG->getMachineNode(Opc, dl, ResTys, OpsB, 6); - Chain = SDValue(VLdB, NumVecs+1); - - SDValue V0 = SDValue(VLdA, 0); - SDValue V1 = SDValue(VLdB, 0); - SDValue V2 = SDValue(VLdA, 1); - SDValue V3 = SDValue(VLdB, 1); - SDValue V4 = SDValue(VLdA, 2); - SDValue V5 = SDValue(VLdB, 2); - SDValue V6 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), 0) - : SDValue(VLdA, 3); - SDValue V7 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), 0) - : SDValue(VLdB, 3); - SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V0, V1, V2, V3, - V4, V5, V6, V7), 0); - - // Extract out the 3 / 4 Q registers. - assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec, - dl, VT, RegSeq); - ReplaceUses(SDValue(N, Vec), Q); - } + const SDValue OpsB[] = { SDValue(VLdA, 1), Align, Reg0, SDValue(VLdA, 0), + Pred, Reg0, Chain }; + SDNode *VLdB = + CurDAG->getMachineNode(Opc, dl, ResTy, AddrTy, MVT::Other, OpsB, 7); + SuperReg = SDValue(VLdB, 0); + Chain = SDValue(VLdB, 2); + } + + // Extract out the Q registers. + assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec, + dl, VT, SuperReg); + ReplaceUses(SDValue(N, Vec), Q); } ReplaceUses(SDValue(N, NumVecs), Chain); return NULL; @@ -1235,12 +1225,14 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, SDValue Pred = getAL(CurDAG); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); - SmallVector<SDValue, 10> Ops; + SmallVector<SDValue, 7> Ops; Ops.push_back(MemAddr); Ops.push_back(Align); if (is64BitVector) { - if (NumVecs >= 2) { + if (NumVecs == 1) { + Ops.push_back(N->getOperand(3)); + } else { SDValue RegSeq; SDValue V0 = N->getOperand(0+3); SDValue V1 = N->getOperand(1+3); @@ -1257,111 +1249,61 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, : N->getOperand(3+3); RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); } - - // Now extract the D registers back out. - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, - RegSeq)); - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, - RegSeq)); - if (NumVecs > 2) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT, - RegSeq)); - if (NumVecs > 3) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT, - RegSeq)); - } else { - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(N->getOperand(Vec+3)); + Ops.push_back(RegSeq); } Ops.push_back(Pred); Ops.push_back(Reg0); // predicate register Ops.push_back(Chain); unsigned Opc = DOpcodes[OpcodeIndex]; - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+5); + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 6); } - EVT RegVT = GetNEONSubregVT(VT); if (NumVecs <= 2) { - // Quad registers are directly supported for VST1 and VST2, - // storing pairs of D regs. + // Quad registers are directly supported for VST1 and VST2. unsigned Opc = QOpcodes0[OpcodeIndex]; - if (NumVecs == 2) { - // First extract the pair of Q registers. + if (NumVecs == 1) { + Ops.push_back(N->getOperand(3)); + } else { + // Form a QQ register. 
SDValue Q0 = N->getOperand(3); SDValue Q1 = N->getOperand(4); - - // Form a QQ register. - SDValue QQ = SDValue(PairQRegs(MVT::v4i64, Q0, Q1), 0); - - // Now extract the D registers back out. - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, - QQ)); - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, - QQ)); - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, RegVT, - QQ)); - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, RegVT, - QQ)); - Ops.push_back(Pred); - Ops.push_back(Reg0); // predicate register - Ops.push_back(Chain); - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 5 + 4); - } else { - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, - N->getOperand(Vec+3))); - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, - N->getOperand(Vec+3))); - } - Ops.push_back(Pred); - Ops.push_back(Reg0); // predicate register - Ops.push_back(Chain); - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), - 5 + 2 * NumVecs); + Ops.push_back(SDValue(PairQRegs(MVT::v4i64, Q0, Q1), 0)); } + Ops.push_back(Pred); + Ops.push_back(Reg0); // predicate register + Ops.push_back(Chain); + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 6); } // Otherwise, quad registers are stored with two separate instructions, // where one stores the even registers and the other stores the odd registers. // Form the QQQQ REG_SEQUENCE. - SDValue V[8]; - for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { - V[i] = CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, - N->getOperand(Vec+3)); - V[i+1] = CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, - N->getOperand(Vec+3)); - } - if (NumVecs == 3) - V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - - SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], - V[4], V[5], V[6], V[7]), 0); + SDValue V0 = N->getOperand(0+3); + SDValue V1 = N->getOperand(1+3); + SDValue V2 = N->getOperand(2+3); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) + : N->getOperand(3+3); + SDValue RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); // Store the even D registers. - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); Ops.push_back(Reg0); // post-access address offset - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec*2, dl, - RegVT, RegSeq)); + Ops.push_back(RegSeq); Ops.push_back(Pred); Ops.push_back(Reg0); // predicate register Ops.push_back(Chain); unsigned Opc = QOpcodes0[OpcodeIndex]; SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); + MVT::Other, Ops.data(), 7); Chain = SDValue(VStA, 1); // Store the odd D registers. 
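+  // The second store reuses the address produced by the first (result 0 of
+  // VStA) and the same QQQQ register sequence; only the opcode and the
+  // chain operand change.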
Ops[0] = SDValue(VStA, 0); // MemAddr - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1+Vec*2, dl, - RegVT, RegSeq); - Ops[NumVecs+5] = Chain; + Ops[6] = Chain; Opc = QOpcodes1[OpcodeIndex]; SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); + MVT::Other, Ops.data(), 7); Chain = SDValue(VStB, 1); ReplaceUses(SDValue(N, 0), Chain); return NULL; @@ -1675,7 +1617,7 @@ SelectT2CMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, if (!T) return 0; - if (Predicate_t2_so_imm(TrueVal.getNode())) { + if (Pred_t2_so_imm(TrueVal.getNode())) { SDValue True = CurDAG->getTargetConstant(T->getZExtValue(), MVT::i32); SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag }; @@ -1692,7 +1634,7 @@ SelectARMCMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, if (!T) return 0; - if (Predicate_so_imm(TrueVal.getNode())) { + if (Pred_so_imm(TrueVal.getNode())) { SDValue True = CurDAG->getTargetConstant(T->getZExtValue(), MVT::i32); SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag }; @@ -1740,7 +1682,7 @@ SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) { } // Pattern: (ARMcmov:i32 GPR:i32:$false, - // (imm:i32)<<P:Predicate_so_imm>>:$true, + // (imm:i32)<<P:Pred_so_imm>>:$true, // (imm:i32):$cc) // Emits: (MOVCCi:i32 GPR:i32:$false, // (so_imm:i32 (imm:i32):$true), (imm:i32):$cc) @@ -2013,43 +1955,6 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ResNode = SelectARMIndexedLoad(N); if (ResNode) return ResNode; - - // VLDMQ must be custom-selected for "v2f64 load" to set the AM5Opc value. - if (Subtarget->hasVFP2() && - N->getValueType(0).getSimpleVT().SimpleTy == MVT::v2f64) { - SDValue Chain = N->getOperand(0); - SDValue AM5Opc = - CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::ia, 4), MVT::i32); - SDValue Pred = getAL(CurDAG); - SDValue PredReg = CurDAG->getRegister(0, MVT::i32); - SDValue Ops[] = { N->getOperand(1), AM5Opc, Pred, PredReg, Chain }; - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); - SDNode *Ret = CurDAG->getMachineNode(ARM::VLDMQ, dl, - MVT::v2f64, MVT::Other, Ops, 5); - cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); - return Ret; - } - // Other cases are autogenerated. - break; - } - case ISD::STORE: { - // VSTMQ must be custom-selected for "v2f64 store" to set the AM5Opc value. - if (Subtarget->hasVFP2() && - N->getOperand(1).getValueType().getSimpleVT().SimpleTy == MVT::v2f64) { - SDValue Chain = N->getOperand(0); - SDValue AM5Opc = - CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::ia, 4), MVT::i32); - SDValue Pred = getAL(CurDAG); - SDValue PredReg = CurDAG->getRegister(0, MVT::i32); - SDValue Ops[] = { N->getOperand(1), N->getOperand(2), - AM5Opc, Pred, PredReg, Chain }; - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); - SDNode *Ret = CurDAG->getMachineNode(ARM::VSTMQ, dl, MVT::Other, Ops, 6); - cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); - return Ret; - } // Other cases are autogenerated. 
break; } @@ -2206,39 +2111,40 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_neon_vld1: { unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, ARM::VLD1d32, ARM::VLD1d64 }; - unsigned QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, - ARM::VLD1q32, ARM::VLD1q64 }; + unsigned QOpcodes[] = { ARM::VLD1q8Pseudo, ARM::VLD1q16Pseudo, + ARM::VLD1q32Pseudo, ARM::VLD1q64Pseudo }; return SelectVLD(N, 1, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld2: { - unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, - ARM::VLD2d32, ARM::VLD1q64 }; - unsigned QOpcodes[] = { ARM::VLD2q8, ARM::VLD2q16, ARM::VLD2q32 }; + unsigned DOpcodes[] = { ARM::VLD2d8Pseudo, ARM::VLD2d16Pseudo, + ARM::VLD2d32Pseudo, ARM::VLD1q64Pseudo }; + unsigned QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo, + ARM::VLD2q32Pseudo }; return SelectVLD(N, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld3: { - unsigned DOpcodes[] = { ARM::VLD3d8, ARM::VLD3d16, - ARM::VLD3d32, ARM::VLD1d64T }; - unsigned QOpcodes0[] = { ARM::VLD3q8_UPD, - ARM::VLD3q16_UPD, - ARM::VLD3q32_UPD }; - unsigned QOpcodes1[] = { ARM::VLD3q8odd_UPD, - ARM::VLD3q16odd_UPD, - ARM::VLD3q32odd_UPD }; + unsigned DOpcodes[] = { ARM::VLD3d8Pseudo, ARM::VLD3d16Pseudo, + ARM::VLD3d32Pseudo, ARM::VLD1d64TPseudo }; + unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, + ARM::VLD3q16Pseudo_UPD, + ARM::VLD3q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD, + ARM::VLD3q16oddPseudo_UPD, + ARM::VLD3q32oddPseudo_UPD }; return SelectVLD(N, 3, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vld4: { - unsigned DOpcodes[] = { ARM::VLD4d8, ARM::VLD4d16, - ARM::VLD4d32, ARM::VLD1d64Q }; - unsigned QOpcodes0[] = { ARM::VLD4q8_UPD, - ARM::VLD4q16_UPD, - ARM::VLD4q32_UPD }; - unsigned QOpcodes1[] = { ARM::VLD4q8odd_UPD, - ARM::VLD4q16odd_UPD, - ARM::VLD4q32odd_UPD }; + unsigned DOpcodes[] = { ARM::VLD4d8Pseudo, ARM::VLD4d16Pseudo, + ARM::VLD4d32Pseudo, ARM::VLD1d64QPseudo }; + unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, + ARM::VLD4q16oddPseudo_UPD, + ARM::VLD4q32oddPseudo_UPD }; return SelectVLD(N, 4, DOpcodes, QOpcodes0, QOpcodes1); } @@ -2266,39 +2172,40 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_neon_vst1: { unsigned DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, ARM::VST1d32, ARM::VST1d64 }; - unsigned QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, - ARM::VST1q32, ARM::VST1q64 }; + unsigned QOpcodes[] = { ARM::VST1q8Pseudo, ARM::VST1q16Pseudo, + ARM::VST1q32Pseudo, ARM::VST1q64Pseudo }; return SelectVST(N, 1, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vst2: { - unsigned DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, - ARM::VST2d32, ARM::VST1q64 }; - unsigned QOpcodes[] = { ARM::VST2q8, ARM::VST2q16, ARM::VST2q32 }; + unsigned DOpcodes[] = { ARM::VST2d8Pseudo, ARM::VST2d16Pseudo, + ARM::VST2d32Pseudo, ARM::VST1q64Pseudo }; + unsigned QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo, + ARM::VST2q32Pseudo }; return SelectVST(N, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vst3: { - unsigned DOpcodes[] = { ARM::VST3d8, ARM::VST3d16, - ARM::VST3d32, ARM::VST1d64T }; - unsigned QOpcodes0[] = { ARM::VST3q8_UPD, - ARM::VST3q16_UPD, - ARM::VST3q32_UPD }; - unsigned QOpcodes1[] = { ARM::VST3q8odd_UPD, - ARM::VST3q16odd_UPD, - ARM::VST3q32odd_UPD }; + unsigned DOpcodes[] = { ARM::VST3d8Pseudo, ARM::VST3d16Pseudo, + ARM::VST3d32Pseudo, ARM::VST1d64TPseudo }; + unsigned QOpcodes0[] = { 
ARM::VST3q8Pseudo_UPD, + ARM::VST3q16Pseudo_UPD, + ARM::VST3q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD, + ARM::VST3q16oddPseudo_UPD, + ARM::VST3q32oddPseudo_UPD }; return SelectVST(N, 3, DOpcodes, QOpcodes0, QOpcodes1); } case Intrinsic::arm_neon_vst4: { - unsigned DOpcodes[] = { ARM::VST4d8, ARM::VST4d16, - ARM::VST4d32, ARM::VST1d64Q }; - unsigned QOpcodes0[] = { ARM::VST4q8_UPD, - ARM::VST4q16_UPD, - ARM::VST4q32_UPD }; - unsigned QOpcodes1[] = { ARM::VST4q8odd_UPD, - ARM::VST4q16odd_UPD, - ARM::VST4q32odd_UPD }; + unsigned DOpcodes[] = { ARM::VST4d8Pseudo, ARM::VST4d16Pseudo, + ARM::VST4d32Pseudo, ARM::VST1d64QPseudo }; + unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, + ARM::VST4q16Pseudo_UPD, + ARM::VST4q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, + ARM::VST4q16oddPseudo_UPD, + ARM::VST4q32oddPseudo_UPD }; return SelectVST(N, 4, DOpcodes, QOpcodes0, QOpcodes1); } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0091df753eb78..ce4a2c90689c2 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -55,7 +55,14 @@ STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt<bool> EnableARMTailCalls("arm-tail-calls", cl::Hidden, cl::desc("Generate tail calls (TEMPORARY OPTION)."), - cl::init(true)); + cl::init(false)); + +// This option should go away when Machine LICM is smart enough to hoist a +// reg-to-reg VDUP. +static cl::opt<bool> +EnableARMVDUPsplat("arm-vdup-splat", cl::Hidden, + cl::desc("Generate VDUP for integer constant splats (TEMPORARY OPTION)."), + cl::init(false)); static cl::opt<bool> EnableARMLongCalls("arm-long-calls", cl::Hidden, @@ -122,7 +129,10 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); + setLoadExtAction(ISD::SEXTLOAD, VT.getSimpleVT(), Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT.getSimpleVT(), Expand); } + setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); // Promote all bit-wise operations. if (VT.isInteger() && VT != PromotedBitwiseVT) { @@ -166,6 +176,7 @@ static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) : TargetLowering(TM, createTLOF(TM)) { Subtarget = &TM.getSubtarget<ARMSubtarget>(); + RegInfo = TM.getRegisterInfo(); if (Subtarget->isTargetDarwin()) { // Uses VFP for Thumb libfuncs if available. @@ -264,7 +275,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) addRegisterClass(MVT::i32, ARM::GPRRegisterClass); if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { addRegisterClass(MVT::f32, ARM::SPRRegisterClass); - addRegisterClass(MVT::f64, ARM::DPRRegisterClass); + if (!Subtarget->isFPOnlySP()) + addRegisterClass(MVT::f64, ARM::DPRRegisterClass); setTruncStoreAction(MVT::f64, MVT::f32, Expand); } @@ -310,9 +322,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + // Neon does not support some operations on v1i64 and v2i64 types. setOperationAction(ISD::MUL, MVT::v1i64, Expand); - setOperationAction(ISD::MUL, MVT::v2i64, Expand); + // Custom handling for some quad-vector types to detect VMULL. 
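+  // (Presumably the custom lowering pattern-matches a multiply of
+  // sign/zero-extended operands into the ARMISD::VMULLs / VMULLu nodes
+  // introduced elsewhere in this patch.)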
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
     setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
     setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
@@ -410,12 +427,10 @@
   // doesn't yet know how to not do that for SjLj.
   setExceptionSelectorRegister(ARM::R0);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
-  // Handle atomics directly for ARMv[67] (except for Thumb1), otherwise
-  // use the default expansion.
-  bool canHandleAtomics =
-    (Subtarget->hasV7Ops() ||
-     (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only()));
-  if (canHandleAtomics) {
+  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
+  // the default expansion.
+  if (Subtarget->hasDataBarrier() ||
+      (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only())) {
     // membarrier needs custom lowering; the rest are legal and handled
     // normally.
     setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
@@ -466,10 +481,12 @@
   }
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
-  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only())
+  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
     // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
     // iff target supports vfp2.
     setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);
+    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+  }
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -481,9 +498,9 @@
   setOperationAction(ISD::SETCC, MVT::i32, Expand);
   setOperationAction(ISD::SETCC, MVT::f32, Expand);
   setOperationAction(ISD::SETCC, MVT::f64, Expand);
-  setOperationAction(ISD::SELECT, MVT::i32, Expand);
-  setOperationAction(ISD::SELECT, MVT::f32, Expand);
-  setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  setOperationAction(ISD::SELECT, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT, MVT::f64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
@@ -530,6 +547,9 @@
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::MUL);
+  if (Subtarget->hasV6T2Ops())
+    setTargetDAGCombine(ISD::OR);
+
   setStackPointerRegisterToSaveRestore(ARM::SP);
   if (UseSoftFloat || Subtarget->isThumb1Only() || !Subtarget->hasVFP2())
@@ -547,6 +567,37 @@
   benefitFromCodePlacementOpt = true;
 }
+std::pair<const TargetRegisterClass*, uint8_t>
+ARMTargetLowering::findRepresentativeClass(EVT VT) const {
+  const TargetRegisterClass *RRC = 0;
+  uint8_t Cost = 1;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(VT);
+  // Use DPR as the representative register class for all floating point
+  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
+  // the cost is 1 for both f32 and f64.
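+  // The cost is a relative pressure weight in DPR units: one D register
+  // costs 1, a Q register 2, a QQ register 4, and a QQQQ register 8.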
+ case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: + case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: + RRC = ARM::DPRRegisterClass; + break; + case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: + case MVT::v4f32: case MVT::v2f64: + RRC = ARM::DPRRegisterClass; + Cost = 2; + break; + case MVT::v4i64: + RRC = ARM::DPRRegisterClass; + Cost = 4; + break; + case MVT::v8i64: + RRC = ARM::DPRRegisterClass; + Cost = 8; + break; + } + return std::make_pair(RRC, Cost); +} + const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { default: return 0; @@ -561,6 +612,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; + case ARMISD::AND: return "ARMISD::AND"; case ARMISD::CMP: return "ARMISD::CMP"; case ARMISD::CMPZ: return "ARMISD::CMPZ"; case ARMISD::CMPFP: return "ARMISD::CMPFP"; @@ -635,9 +687,12 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VZIP: return "ARMISD::VZIP"; case ARMISD::VUZP: return "ARMISD::VUZP"; case ARMISD::VTRN: return "ARMISD::VTRN"; + case ARMISD::VMULLs: return "ARMISD::VMULLs"; + case ARMISD::VMULLu: return "ARMISD::VMULLu"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::FMAX: return "ARMISD::FMAX"; case ARMISD::FMIN: return "ARMISD::FMIN"; + case ARMISD::BFI: return "ARMISD::BFI"; } } @@ -656,11 +711,23 @@ TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const { return TargetLowering::getRegClassFor(VT); } +// Create a fast isel object. +FastISel * +ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { + return ARM::createFastISel(funcInfo); +} + /// getFunctionAlignment - Return the Log2 alignment of this function. unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const { return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 1 : 2; } +/// getMaximalGlobalOffset - Returns the maximal possible offset which can +/// be used for loads / stores from the global. +unsigned ARMTargetLowering::getMaximalGlobalOffset() const { + return (Subtarget->isThumb1Only() ? 127 : 4095); +} + Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { unsigned NumVals = N->getNumValues(); if (!NumVals) @@ -688,6 +755,24 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { return Sched::RegPressure; } +unsigned +ARMTargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + switch (RC->getID()) { + default: + return 0; + case ARM::tGPRRegClassID: + return RegInfo->hasFP(MF) ? 4 : 5; + case ARM::GPRRegClassID: { + unsigned FP = RegInfo->hasFP(MF) ? 1 : 0; + return 10 - FP - (Subtarget->isR9Reserved() ? 1 : 0); + } + case ARM::SPRRegClassID: // Currently not used as 'rep' register class. 
+ case ARM::DPRRegClassID: + return 32 - 10; + } +} + //===----------------------------------------------------------------------===// // Lowering Code //===----------------------------------------------------------------------===// @@ -793,8 +878,9 @@ static bool f64AssignAAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCState &State, bool CanFail) { static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; + static const unsigned ShadowRegList[] = { ARM::R0, ARM::R1 }; - unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2); + unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2); if (Reg == 0) { // For the 2nd half of a v2f64, do not just fail. if (CanFail) @@ -812,6 +898,10 @@ static bool f64AssignAAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT, if (HiRegList[i] == Reg) break; + unsigned T = State.AllocateReg(LoRegList[i]); + (void)T; + assert(T == LoRegList[i] && "Could not allocate register"); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], LocVT, LocInfo)); @@ -1624,6 +1714,10 @@ static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); } +unsigned ARMTargetLowering::getJumpTableEncoding() const { + return MachineJumpTableInfo::EK_Inline; +} + SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -1917,17 +2011,19 @@ static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG, DebugLoc dl = Op.getDebugLoc(); SDValue Op5 = Op.getOperand(5); unsigned isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue(); - // v6 and v7 can both handle barriers directly, but need handled a bit - // differently. Thumb1 and pre-v6 ARM mode use a libcall instead and should + // Some subtargets which have dmb and dsb instructions can handle barriers + // directly. Some ARMv6 CPUs can support them with the help of the mcr + // instruction. Thumb1 and pre-v6 ARM mode use a libcall instead and should // never get here. unsigned Opc = isDeviceBarrier ? ARMISD::SYNCBARRIER : ARMISD::MEMBARRIER; - if (Subtarget->hasV7Ops()) + if (Subtarget->hasDataBarrier()) return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0)); - else if (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only()) + else { + assert(Subtarget->hasV6Ops() && !Subtarget->isThumb1Only() && + "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0), DAG.getConstant(0, MVT::i32)); - assert(0 && "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); - return SDValue(); + } } static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { @@ -1945,54 +2041,6 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { } SDValue -ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, - SelectionDAG &DAG) const { - SDNode *Node = Op.getNode(); - DebugLoc dl = Node->getDebugLoc(); - EVT VT = Node->getValueType(0); - SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); - SDValue Align = Op.getOperand(2); - - // Chain the dynamic stack allocation so that it doesn't modify the stack - // pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); - - unsigned AlignVal = cast<ConstantSDNode>(Align)->getZExtValue(); - unsigned StackAlign = getTargetMachine().getFrameInfo()->getStackAlignment(); - if (AlignVal > StackAlign) - // Do this now since selection pass cannot introduce new target - // independent node. - Align = DAG.getConstant(-(uint64_t)AlignVal, VT); - - // In Thumb1 mode, there isn't a "sub r, sp, r" instruction, we will end up - // using a "add r, sp, r" instead. Negate the size now so we don't have to - // do even more horrible hack later. - MachineFunction &MF = DAG.getMachineFunction(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - if (AFI->isThumb1OnlyFunction()) { - bool Negate = true; - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size); - if (C) { - uint32_t Val = C->getZExtValue(); - if (Val <= 508 && ((Val & 3) == 0)) - Negate = false; - } - if (Negate) - Size = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, VT), Size); - } - - SDVTList VTList = DAG.getVTList(VT, MVT::Other); - SDValue Ops1[] = { Chain, Size, Align }; - SDValue Res = DAG.getNode(ARMISD::DYN_ALLOC, dl, VTList, Ops1, 3); - Chain = Res.getValue(1); - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true), - DAG.getIntPtrConstant(0, true), SDValue()); - SDValue Ops2[] = { Res, Chain }; - return DAG.getMergeValues(Ops2, 2, dl); -} - -SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, DebugLoc dl) const { @@ -2229,28 +2277,28 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, default: break; case ISD::SETLT: case ISD::SETGE: - if (isLegalICmpImmediate(C-1)) { + if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; RHS = DAG.getConstant(C-1, MVT::i32); } break; case ISD::SETULT: case ISD::SETUGE: - if (C > 0 && isLegalICmpImmediate(C-1)) { + if (C != 0 && isLegalICmpImmediate(C-1)) { CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; RHS = DAG.getConstant(C-1, MVT::i32); } break; case ISD::SETLE: case ISD::SETGT: - if (isLegalICmpImmediate(C+1)) { + if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; RHS = DAG.getConstant(C+1, MVT::i32); } break; case ISD::SETULE: case ISD::SETUGT: - if (C < 0xffffffff && isLegalICmpImmediate(C+1)) { + if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { CC = (CC == ISD::SETULE) ? 
ISD::SETULT : ISD::SETUGE; RHS = DAG.getConstant(C+1, MVT::i32); } @@ -2287,6 +2335,52 @@ ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Flag, Cmp); } +SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + SDValue Cond = Op.getOperand(0); + SDValue SelectTrue = Op.getOperand(1); + SDValue SelectFalse = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); + + // Convert: + // + // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) + // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) + // + if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { + const ConstantSDNode *CMOVTrue = + dyn_cast<ConstantSDNode>(Cond.getOperand(0)); + const ConstantSDNode *CMOVFalse = + dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + + if (CMOVTrue && CMOVFalse) { + unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); + unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); + + SDValue True; + SDValue False; + if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { + True = SelectTrue; + False = SelectFalse; + } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { + True = SelectFalse; + False = SelectTrue; + } + + if (True.getNode() && False.getNode()) { + EVT VT = Cond.getValueType(); + SDValue ARMcc = Cond.getOperand(2); + SDValue CCR = Cond.getOperand(3); + SDValue Cmp = Cond.getOperand(4); + return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp); + } + } + } + + return DAG.getSelectCC(dl, Cond, + DAG.getConstant(0, Cond.getValueType()), + SelectTrue, SelectFalse, ISD::SETNE); +} + SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); @@ -2403,8 +2497,9 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { bool SeenZero = false; if (canChangeToInt(LHS, SeenZero, Subtarget) && canChangeToInt(RHS, SeenZero, Subtarget) && - // If one of the operand is zero, it's safe to ignore the NaN case. - (FiniteOnlyFPMath() || SeenZero)) { + // If one of the operands is zero, it's safe to ignore the NaN case since + // we only care about equality comparisons. + (SeenZero || (DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS)))) { // If unsafe fp math optimization is enabled and there are no other uses of // the CMP operands, and the condition code is EQ or NE, we can optimize it // to an integer comparison. @@ -2587,7 +2682,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ } // Return LR, which contains the return address. Mark it an implicit live-in. - unsigned Reg = MF.addLiveIn(ARM::LR, ARM::GPRRegisterClass); + unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } @@ -2730,6 +2825,24 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, return DAG.getMergeValues(Ops, 2, dl); } +SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, + SelectionDAG &DAG) const { + // The rounding mode is in bits 23:22 of the FPSCR. + // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0. + // The formula we use to implement this is (((FPSCR + (1 << 22)) >> 22) & 3) + // so that the shift and the AND get folded into a bitfield extract.
+ DebugLoc dl = Op.getDebugLoc(); + SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, + DAG.getConstant(Intrinsic::arm_get_fpscr, + MVT::i32)); + SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, + DAG.getConstant(1U << 22, MVT::i32)); + SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, + DAG.getConstant(22, MVT::i32)); + return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, + DAG.getConstant(3, MVT::i32)); +} + static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); @@ -3046,6 +3159,11 @@ static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT, bool &ReverseVEXT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); ReverseVEXT = false; + + // Assume that the first shuffle index is not UNDEF. Fail if it is. + if (M[0] < 0) + return false; + Imm = M[0]; // If this is a VEXT shuffle, the immediate value is the index of the first @@ -3061,6 +3179,7 @@ static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT, ReverseVEXT = true; } + if (M[i] < 0) continue; // ignore UNDEF indices if (ExpectedElt != static_cast<unsigned>(M[i])) return false; } @@ -3086,13 +3205,16 @@ static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT, unsigned NumElts = VT.getVectorNumElements(); unsigned BlockElts = M[0] + 1; + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSz; if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) return false; for (unsigned i = 0; i < NumElts; ++i) { - if ((unsigned) M[i] != - (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) + if (M[i] < 0) continue; // ignore UNDEF indices + if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) return false; } @@ -3108,8 +3230,8 @@ static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT, unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { - if ((unsigned) M[i] != i + WhichResult || - (unsigned) M[i+1] != i + NumElts + WhichResult) + if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || + (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult)) return false; } return true; @@ -3127,8 +3249,8 @@ static bool isVTRN_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT, unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { - if ((unsigned) M[i] != i + WhichResult || - (unsigned) M[i+1] != i + WhichResult) + if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || + (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult)) return false; } return true; @@ -3143,6 +3265,7 @@ static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT, unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i != NumElts; ++i) { + if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned) M[i] != 2 * i + WhichResult) return false; } @@ -3168,7 +3291,8 @@ static bool isVUZP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT, for (unsigned j = 0; j != 2; ++j) { unsigned Idx = WhichResult; for (unsigned i = 0; i != Half; ++i) { - if ((unsigned) M[i + j * Half] != Idx) + int MIdx = M[i + j * Half]; + if (MIdx >= 0 && (unsigned) MIdx != Idx) return false; Idx += 2; } @@ -3191,8 +3315,8 @@ static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT, WhichResult = (M[0] == 0 ? 
0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { - if ((unsigned) M[i] != Idx || - (unsigned) M[i+1] != Idx + NumElts) + if ((M[i] >= 0 && (unsigned) M[i] != Idx) || + (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts)) return false; Idx += 1; } @@ -3217,8 +3341,8 @@ static bool isVZIP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT, WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { - if ((unsigned) M[i] != Idx || - (unsigned) M[i+1] != Idx) + if ((M[i] >= 0 && (unsigned) M[i] != Idx) || + (M[i+1] >= 0 && (unsigned) M[i+1] != Idx)) return false; Idx += 1; } @@ -3230,9 +3354,30 @@ static bool isVZIP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT, return true; } +// If N is an integer constant that can be moved into a register in one +// instruction, return an SDValue of such a constant (will become a MOV +// instruction). Otherwise return null. +static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, + const ARMSubtarget *ST, DebugLoc dl) { + uint64_t Val; + if (!isa<ConstantSDNode>(N)) + return SDValue(); + Val = cast<ConstantSDNode>(N)->getZExtValue(); + + if (ST->isThumb1Only()) { + if (Val <= 255 || ~Val <= 255) + return DAG.getConstant(Val, MVT::i32); + } else { + if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) + return DAG.getConstant(Val, MVT::i32); + } + return SDValue(); +} + // If this is a case we can't handle, return null and let the default // expansion code take care of it. -static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -3292,15 +3437,41 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { if (isOnlyLowElement) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); - // If all elements are constants, fall back to the default expansion, which - // will generate a load from the constant pool. + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + + if (EnableARMVDUPsplat) { + // Use VDUP for non-constant splats. For f32 constant splats, reduce to + // i32 and try again. + if (usesOnlyOneValue && EltSize <= 32) { + if (!isConstant) + return DAG.getNode(ARMISD::VDUP, dl, VT, Value); + if (VT.getVectorElementType().isFloatingPoint()) { + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, + Op.getOperand(i))); + SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &Ops[0], + NumElts); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + LowerBUILD_VECTOR(Val, DAG, ST)); + } + SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); + if (Val.getNode()) + return DAG.getNode(ARMISD::VDUP, dl, VT, Val); + } + } + + // If all elements are constants and the case above didn't get hit, fall back + // to the default expansion, which will generate a load from the constant + // pool. if (isConstant) return SDValue(); - // Use VDUP for non-constant splats. - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - if (usesOnlyOneValue && EltSize <= 32) - return DAG.getNode(ARMISD::VDUP, dl, VT, Value); + if (!EnableARMVDUPsplat) { + // Use VDUP for non-constant splats. 
+ if (usesOnlyOneValue && EltSize <= 32) + return DAG.getNode(ARMISD::VDUP, dl, VT, Value); + } // Vectors with 32- or 64-bit elements can be built by directly assigning // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands @@ -3585,6 +3756,51 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val); } +/// SkipExtension - For a node that is either a SIGN_EXTEND, ZERO_EXTEND, or +/// an extending load, return the unextended value. +static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) + return N->getOperand(0); + LoadSDNode *LD = cast<LoadSDNode>(N); + return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(), + LD->getBasePtr(), LD->getSrcValue(), + LD->getSrcValueOffset(), LD->isVolatile(), + LD->isNonTemporal(), LD->getAlignment()); +} + +static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { + // Multiplications are only custom-lowered for 128-bit vectors so that + // VMULL can be detected. Otherwise v2i64 multiplications are not legal. + EVT VT = Op.getValueType(); + assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL"); + SDNode *N0 = Op.getOperand(0).getNode(); + SDNode *N1 = Op.getOperand(1).getNode(); + unsigned NewOpc = 0; + if ((N0->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N0)) && + (N1->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N1))) { + NewOpc = ARMISD::VMULLs; + } else if ((N0->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N0)) && + (N1->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N1))) { + NewOpc = ARMISD::VMULLu; + } else if (VT.getSimpleVT().SimpleTy == MVT::v2i64) { + // Fall through to expand this. It is not legal. + return SDValue(); + } else { + // Other vector multiplications are legal. + return Op; + } + + // Legalize to a VMULL instruction. + DebugLoc DL = Op.getDebugLoc(); + SDValue Op0 = SkipExtension(N0, DAG); + SDValue Op1 = SkipExtension(N1, DAG); + + assert(Op0.getValueType().is64BitVector() && + Op1.getValueType().is64BitVector() && + "unexpected types for extended operands to VMULL"); + return DAG.getNode(NewOpc, DL, VT, Op0, Op1); +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); @@ -3594,10 +3810,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return Subtarget->isTargetDarwin() ? 
LowerGlobalAddressDarwin(Op, DAG) : LowerGlobalAddressELF(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); - case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG, Subtarget); case ISD::SINT_TO_FP: @@ -3621,10 +3837,12 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::VSETCC: return LowerVSETCC(Op, DAG); - case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); + case ISD::MUL: return LowerMUL(Op, DAG); } return SDValue(); } @@ -4002,78 +4220,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } - - case ARM::tANDsp: - case ARM::tADDspr_: - case ARM::tSUBspi_: - case ARM::t2SUBrSPi_: - case ARM::t2SUBrSPi12_: - case ARM::t2SUBrSPs_: { - MachineFunction *MF = BB->getParent(); - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); - bool DstIsDead = MI->getOperand(0).isDead(); - bool SrcIsKill = MI->getOperand(1).isKill(); - - if (SrcReg != ARM::SP) { - // Copy the source to SP from virtual register. - const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(SrcReg); - unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) - ? ARM::tMOVtgpr2gpr : ARM::tMOVgpr2gpr; - BuildMI(*BB, MI, dl, TII->get(CopyOpc), ARM::SP) - .addReg(SrcReg, getKillRegState(SrcIsKill)); - } - - unsigned OpOpc = 0; - bool NeedPred = false, NeedCC = false, NeedOp3 = false; - switch (MI->getOpcode()) { - default: - llvm_unreachable("Unexpected pseudo instruction!"); - case ARM::tANDsp: - OpOpc = ARM::tAND; - NeedPred = true; - break; - case ARM::tADDspr_: - OpOpc = ARM::tADDspr; - break; - case ARM::tSUBspi_: - OpOpc = ARM::tSUBspi; - break; - case ARM::t2SUBrSPi_: - OpOpc = ARM::t2SUBrSPi; - NeedPred = true; NeedCC = true; - break; - case ARM::t2SUBrSPi12_: - OpOpc = ARM::t2SUBrSPi12; - NeedPred = true; - break; - case ARM::t2SUBrSPs_: - OpOpc = ARM::t2SUBrSPs; - NeedPred = true; NeedCC = true; NeedOp3 = true; - break; - } - MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(OpOpc), ARM::SP); - if (OpOpc == ARM::tAND) - AddDefaultT1CC(MIB); - MIB.addReg(ARM::SP); - MIB.addOperand(MI->getOperand(2)); - if (NeedOp3) - MIB.addOperand(MI->getOperand(3)); - if (NeedPred) - AddDefaultPred(MIB); - if (NeedCC) - AddDefaultCC(MIB); - - // Copy the result from SP to virtual register. - const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(DstReg); - unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) - ? ARM::tMOVgpr2tgpr : ARM::tMOVgpr2gpr; - BuildMI(*BB, MI, dl, TII->get(CopyOpc)) - .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)) - .addReg(ARM::SP); - MI->eraseFromParent(); // The pseudo instruction is gone now. 
- return BB; - } } } @@ -4141,30 +4287,42 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, return SDValue(); } -/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. -static SDValue PerformADDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - // added by evan in r37685 with no testcase. - SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); - +/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with +/// operands N0 and N1. This is a helper for PerformADDCombine that is +/// called with the default operands, and if that fails, with commuted +/// operands. +static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI) { // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) { SDValue Result = combineSelectAndUse(N, N0, N1, DCI); if (Result.getNode()) return Result; } - if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) { - SDValue Result = combineSelectAndUse(N, N1, N0, DCI); - if (Result.getNode()) return Result; - } - return SDValue(); } +/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. +/// +static SDValue PerformADDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // First try with the default operand order. + SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI); + if (Result.getNode()) + return Result; + + // If that didn't work, try again with the operands commuted. + return PerformADDCombineWithOperands(N, N1, N0, DCI); +} + /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. +/// static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - // added by evan in r37685 with no testcase. - SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) { @@ -4231,6 +4389,105 @@ static SDValue PerformMULCombine(SDNode *N, return SDValue(); } +/// PerformORCombine - Target-specific dag combine xforms for ISD::OR +static SDValue PerformORCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when + // reasonable. + + // BFI is only available on V6T2+ + if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + // 1) or (and A, mask), val => ARMbfi A, val, mask + // iff (val & mask) == val + // + // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask + // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) + // && CountPopulation_32(mask) == CountPopulation_32(~mask2) + // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) + // && CountPopulation_32(mask) == CountPopulation_32(~mask2) + // (i.e., copy a bitfield value into another bitfield of the same width) + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32) + return SDValue(); + + + // The value and the mask need to be constants so we can verify this is + // actually a bitfield set. 
If the mask is 0xffff, we can do better + // via a movt instruction, so don't use BFI in that case. + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (!C) + return SDValue(); + unsigned Mask = C->getZExtValue(); + if (Mask == 0xffff) + return SDValue(); + SDValue Res; + // Case (1): or (and A, mask), val => ARMbfi A, val, mask + if ((C = dyn_cast<ConstantSDNode>(N1))) { + unsigned Val = C->getZExtValue(); + if (!ARM::isBitFieldInvertedMask(Mask) || (Val & ~Mask) != Val) + return SDValue(); + Val >>= CountTrailingZeros_32(~Mask); + + Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0), + DAG.getConstant(Val, MVT::i32), + DAG.getConstant(Mask, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + } else if (N1.getOpcode() == ISD::AND) { + // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask + C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); + if (!C) + return SDValue(); + unsigned Mask2 = C->getZExtValue(); + + if (ARM::isBitFieldInvertedMask(Mask) && + ARM::isBitFieldInvertedMask(~Mask2) && + (CountPopulation_32(Mask) == CountPopulation_32(~Mask2))) { + // The pack halfword instruction works better for masks that fit it, + // so use that when it's available. + if (Subtarget->hasT2ExtractPack() && + (Mask == 0xffff || Mask == 0xffff0000)) + return SDValue(); + // 2a + unsigned lsb = CountTrailingZeros_32(Mask2); + Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), + DAG.getConstant(lsb, MVT::i32)); + Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0), Res, + DAG.getConstant(Mask, MVT::i32)); + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + } else if (ARM::isBitFieldInvertedMask(~Mask) && + ARM::isBitFieldInvertedMask(Mask2) && + (CountPopulation_32(~Mask) == CountPopulation_32(Mask2))) { + // The pack halfword instruction works better for masks that fit it, + // so use that when it's available. + if (Subtarget->hasT2ExtractPack() && + (Mask2 == 0xffff || Mask2 == 0xffff0000)) + return SDValue(); + // 2b + unsigned lsb = CountTrailingZeros_32(Mask); + Res = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), + DAG.getConstant(lsb, MVT::i32)); + Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, + DAG.getConstant(Mask2, MVT::i32)); + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + } + } + + return SDValue(); +} + /// PerformVMOVRRDCombine - Target-specific dag combine xforms for /// ARMISD::VMOVRRD. static SDValue PerformVMOVRRDCombine(SDNode *N, @@ -4561,7 +4818,7 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { // If the target supports NEON, try to use vmax/vmin instructions for f32 - // selects like "x < y ? x : y". Unless the FiniteOnlyFPMath option is set, + // selects like "x < y ? x : y". Unless the NoNaNsFPMath option is set, // be careful about NaNs: NEON's vmax/vmin return NaN if either operand is // a NaN; only do the transformation when it matches that behavior. 
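Stepping back to the BFI combine above: the following minimal standalone sketch sanity-checks the case (1) rewrite. It is not the in-tree code; isBitFieldInvertedMask restates the ARM::isBitFieldInvertedMask helper added later in this patch, while bfiRef and the constants in main are hypothetical, chosen only to illustrate that when Mask is a bit-field inverted mask and (Val & ~Mask) == Val, the expression (A & Mask) | Val is exactly a bit-field insert of Val >> lsb into A.

#include <cassert>
#include <cstdint>

// Restatement of the isBitFieldInvertedMask predicate used by the combine:
// 1's may appear only on the "outsides"; the "inside" run of bits must be 0's.
static bool isBitFieldInvertedMask(uint32_t v) {
  if (v == 0xffffffffu)
    return false;
  unsigned lsb = 0, msb = 31;
  while (v & (1u << msb)) --msb;       // skip high "outside" 1's
  while (v & (1u << lsb)) ++lsb;       // skip low "outside" 1's
  for (unsigned i = lsb; i <= msb; ++i)
    if (v & (1u << i))
      return false;                    // a 1 inside the field breaks the shape
  return true;
}

// Assumed reference semantics for "ARMbfi A, Src, Mask": keep the bits of A
// selected by Mask and insert Src into the contiguous zero field.
static uint32_t bfiRef(uint32_t A, uint32_t Src, uint32_t Mask) {
  unsigned lsb = 0;
  while (Mask & (1u << lsb)) ++lsb;    // first zero bit is the field's lsb
  return (A & Mask) | ((Src << lsb) & ~Mask);
}

int main() {
  const uint32_t Mask = 0xff0000ffu;   // zeros in bits 8..23
  const uint32_t A = 0xdeadbeefu;
  const uint32_t Val = 0x00123400u;    // satisfies (Val & ~Mask) == Val
  assert(isBitFieldInvertedMask(Mask));
  const unsigned lsb = 8;              // CountTrailingZeros_32(~Mask)
  assert(((A & Mask) | Val) == bfiRef(A, Val >> lsb, Mask));
  return 0;
}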
@@ -4648,6 +4905,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::ADD: return PerformADDCombine(N, DCI); case ISD::SUB: return PerformSUBCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); + case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); @@ -5379,6 +5637,21 @@ int ARM::getVFPf64Imm(const APFloat &FPImm) { return ((int)Sign << 7) | (Exp << 4) | Mantissa; } +bool ARM::isBitFieldInvertedMask(unsigned v) { + if (v == 0xffffffff) + return false; + // There can be 1's on either or both "outsides"; all the "inside" + // bits must be 0's. + unsigned int lsb = 0, msb = 31; + while (v & (1 << msb)) --msb; + while (v & (1 << lsb)) ++lsb; + for (unsigned int i = lsb; i <= msb; ++i) { + if (v & (1 << i)) + return false; + } + return true; +} + /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 128b72e1e743e..ba9ea7f15e7b2 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -17,6 +17,8 @@ #include "ARMSubtarget.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/CallingConvLower.h" #include <vector> @@ -45,6 +47,8 @@ namespace llvm { PIC_ADD, // Add with a PC operand and a PIC label. + AND, // ARM "and" instruction that sets the 's' flag in CPSR. + CMP, // ARM compare instructions. CMPZ, // ARM compare that sets only Z flag. CMPFP, // ARM VFP compare instruction, sets FPSCR. @@ -80,7 +84,7 @@ namespace llvm { MEMBARRIER, // Memory barrier SYNCBARRIER, // Memory sync barrier - + VCEQ, // Vector compare equal. VCGE, // Vector compare greater than or equal. VCGEU, // Vector compare unsigned greater than or equal. @@ -141,6 +145,10 @@ namespace llvm { VUZP, // unzip (deinterleave) VTRN, // transpose + // Vector multiply long: + VMULLs, // ...signed + VMULLu, // ...unsigned + // Operands of the standard BUILD_VECTOR node are not legalized, which // is fine if BUILD_VECTORs are always lowered to shuffles or other // operations, but for ARM some BUILD_VECTORs are legal as-is and their @@ -150,7 +158,10 @@ // Floating-point max and min: FMAX, - FMIN + FMIN, + + // Bit-field insert + BFI }; } @@ -162,6 +173,7 @@ /// returns -1. int getVFPf32Imm(const APFloat &FPImm); int getVFPf64Imm(const APFloat &FPImm); + bool isBitFieldInvertedMask(unsigned v); } //===--------------------------------------------------------------------===// @@ -171,6 +183,8 @@ public: explicit ARMTargetLowering(TargetMachine &TM); + virtual unsigned getJumpTableEncoding() const; + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; /// ReplaceNodeResults - Replace the results of node with an illegal result @@ -255,8 +269,19 @@ /// getFunctionAlignment - Return the Log2 alignment of this function. virtual unsigned getFunctionAlignment(const Function *F) const; + /// getMaximalGlobalOffset - Returns the maximal possible offset which can + /// be used for loads / stores from the global.
+ virtual unsigned getMaximalGlobalOffset() const; + + /// createFastISel - This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. + virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const; + Sched::Preference getSchedulingPreference(SDNode *N) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const; + bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; @@ -265,11 +290,17 @@ namespace llvm { /// materialize the FP immediate as a load from a constant pool. virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + protected: + std::pair<const TargetRegisterClass*, uint8_t> + findRepresentativeClass(EVT VT) const; + private: /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can /// make the right decision when generating code for different targets. const ARMSubtarget *Subtarget; + const TargetRegisterInfo *RegInfo; + /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created. /// unsigned ARMPCLabelIndex; @@ -310,14 +341,15 @@ namespace llvm { SelectionDAG &DAG) const; SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, @@ -377,6 +409,10 @@ namespace llvm { unsigned BinOpcode) const; }; + + namespace ARM { + FastISel *createFastISel(FunctionLoweringInfo &funcInfo); + } } #endif // ARMISELLOWERING_H diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index ac568e75ccc46..113cfffe61f94 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -36,37 +36,38 @@ def LdStMulFrm : Format<10>; def LdStExFrm : Format<11>; def ArithMiscFrm : Format<12>; -def ExtFrm : Format<13>; - -def VFPUnaryFrm : Format<14>; -def VFPBinaryFrm : Format<15>; -def VFPConv1Frm : Format<16>; -def VFPConv2Frm : Format<17>; -def VFPConv3Frm : Format<18>; -def VFPConv4Frm : Format<19>; -def VFPConv5Frm : Format<20>; -def VFPLdStFrm : Format<21>; -def VFPLdStMulFrm : Format<22>; -def VFPMiscFrm : Format<23>; - -def ThumbFrm : Format<24>; -def MiscFrm : Format<25>; - -def NGetLnFrm : Format<26>; -def NSetLnFrm : Format<27>; -def NDupFrm : Format<28>; -def NLdStFrm : Format<29>; -def N1RegModImmFrm: Format<30>; -def N2RegFrm : Format<31>; -def NVCVTFrm : Format<32>; -def NVDupLnFrm : Format<33>; -def N2RegVShLFrm : Format<34>; -def N2RegVShRFrm : Format<35>; -def N3RegFrm : Format<36>; -def N3RegVShFrm : Format<37>; -def NVExtFrm : Format<38>; -def NVMulSLFrm : Format<39>; -def NVTBLFrm : Format<40>; +def SatFrm : Format<13>; +def ExtFrm : Format<14>; + +def VFPUnaryFrm : Format<15>; +def VFPBinaryFrm : Format<16>; +def VFPConv1Frm : Format<17>; 
+def VFPConv2Frm : Format<18>; +def VFPConv3Frm : Format<19>; +def VFPConv4Frm : Format<20>; +def VFPConv5Frm : Format<21>; +def VFPLdStFrm : Format<22>; +def VFPLdStMulFrm : Format<23>; +def VFPMiscFrm : Format<24>; + +def ThumbFrm : Format<25>; +def MiscFrm : Format<26>; + +def NGetLnFrm : Format<27>; +def NSetLnFrm : Format<28>; +def NDupFrm : Format<29>; +def NLdStFrm : Format<30>; +def N1RegModImmFrm: Format<31>; +def N2RegFrm : Format<32>; +def NVCVTFrm : Format<33>; +def NVDupLnFrm : Format<34>; +def N2RegVShLFrm : Format<35>; +def N2RegVShRFrm : Format<36>; +def N3RegFrm : Format<37>; +def N3RegVShFrm : Format<38>; +def NVExtFrm : Format<39>; +def NVMulSLFrm : Format<40>; +def NVTBLFrm : Format<41>; // Misc flags. @@ -87,21 +88,21 @@ class Xform16Bit { bit canXformTo16Bit = 1; } class AddrMode<bits<4> val> { bits<4> Value = val; } -def AddrModeNone : AddrMode<0>; -def AddrMode1 : AddrMode<1>; -def AddrMode2 : AddrMode<2>; -def AddrMode3 : AddrMode<3>; -def AddrMode4 : AddrMode<4>; -def AddrMode5 : AddrMode<5>; -def AddrMode6 : AddrMode<6>; -def AddrModeT1_1 : AddrMode<7>; -def AddrModeT1_2 : AddrMode<8>; -def AddrModeT1_4 : AddrMode<9>; -def AddrModeT1_s : AddrMode<10>; -def AddrModeT2_i12: AddrMode<11>; -def AddrModeT2_i8 : AddrMode<12>; -def AddrModeT2_so : AddrMode<13>; -def AddrModeT2_pc : AddrMode<14>; +def AddrModeNone : AddrMode<0>; +def AddrMode1 : AddrMode<1>; +def AddrMode2 : AddrMode<2>; +def AddrMode3 : AddrMode<3>; +def AddrMode4 : AddrMode<4>; +def AddrMode5 : AddrMode<5>; +def AddrMode6 : AddrMode<6>; +def AddrModeT1_1 : AddrMode<7>; +def AddrModeT1_2 : AddrMode<8>; +def AddrModeT1_4 : AddrMode<9>; +def AddrModeT1_s : AddrMode<10>; +def AddrModeT2_i12 : AddrMode<11>; +def AddrModeT2_i8 : AddrMode<12>; +def AddrModeT2_so : AddrMode<13>; +def AddrModeT2_pc : AddrMode<14>; def AddrModeT2_i8s4 : AddrMode<15>; // Instruction size. @@ -137,11 +138,17 @@ def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains // ARM special operands. // +def CondCodeOperand : AsmOperandClass { + let Name = "CondCode"; + let SuperClasses = []; +} + // ARM Predicate operand. Default to 14 = always (AL). Second part is CC // register whose default is 0 (no register). def pred : PredicateOperand<OtherVT, (ops i32imm, CCR), (ops (i32 14), (i32 zero_reg))> { let PrintMethod = "printPredicateOperand"; + let ParserMatchClass = CondCodeOperand; } // Conditional code result for instructions whose 's' bit is set, e.g. subs. @@ -240,6 +247,7 @@ class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, let Pattern = pattern; list<Predicate> Predicates = [IsARM]; } + // A few are not predicable class InoP<dag oops, dag iops, AddrMode am, SizeFlagVal sz, IndexMode im, Format f, InstrItinClass itin, @@ -254,9 +262,9 @@ class InoP<dag oops, dag iops, AddrMode am, SizeFlagVal sz, list<Predicate> Predicates = [IsARM]; } -// Same as I except it can optionally modify CPSR. Note it's modeled as -// an input operand since by default it's a zero register. It will -// become an implicit def once it's "flipped". +// Same as I except it can optionally modify CPSR. Note it's modeled as an input +// operand since by default it's a zero register. It will become an implicit def +// once it's "flipped". 
class sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, IndexMode im, Format f, InstrItinClass itin, string opc, string asm, string cstr, @@ -313,7 +321,7 @@ class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, } class ABXIx2<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> - : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, BrMiscFrm, itin, + : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, Pseudo, itin, asm, "", pattern>; // BR_JT instructions @@ -322,16 +330,14 @@ class JTI<dag oops, dag iops, InstrItinClass itin, : XI<oops, iops, AddrModeNone, SizeSpecial, IndexModeNone, BrMiscFrm, itin, asm, "", pattern>; - // Atomic load/store instructions - class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, LdStExFrm, itin, opc, asm, "", pattern> { let Inst{27-23} = 0b00011; let Inst{22-21} = opcod; - let Inst{20} = 1; + let Inst{20} = 1; let Inst{11-0} = 0b111110011111; } class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, @@ -340,7 +346,7 @@ class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, opc, asm, "", pattern> { let Inst{27-23} = 0b00011; let Inst{22-21} = opcod; - let Inst{20} = 0; + let Inst{20} = 0; let Inst{11-4} = 0b11111001; } @@ -350,21 +356,21 @@ class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, : I<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin, opc, asm, "", pattern> { let Inst{24-21} = opcod; - let Inst{27-26} = {0,0}; + let Inst{27-26} = 0b00; } class AsI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> : sI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin, opc, asm, "", pattern> { let Inst{24-21} = opcod; - let Inst{27-26} = {0,0}; + let Inst{27-26} = 0b00; } class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, string asm, list<dag> pattern> : XI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { let Inst{24-21} = opcod; - let Inst{27-26} = {0,0}; + let Inst{27-26} = 0b00; } class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> @@ -377,7 +383,7 @@ class AI2<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, opc, asm, "", pattern> { - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } // loads @@ -389,7 +395,7 @@ class AI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 0; // W bit let Inst{22} = 0; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, string asm, list<dag> pattern> @@ -399,7 +405,7 @@ class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 0; // W bit let Inst{22} = 0; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } class AI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> @@ -409,7 +415,7 @@ class AI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 0; // W bit let Inst{22} = 1; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, string asm, list<dag> pattern> @@ -419,7 +425,7 @@ class 
AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 0; // W bit let Inst{22} = 1; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } // stores @@ -431,7 +437,7 @@ class AI2stw<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 0; // W bit let Inst{22} = 0; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } class AXI2stw<dag oops, dag iops, Format f, InstrItinClass itin, string asm, list<dag> pattern> @@ -441,7 +447,7 @@ class AXI2stw<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 0; // W bit let Inst{22} = 0; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } class AI2stb<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> @@ -451,7 +457,7 @@ class AI2stb<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 0; // W bit let Inst{22} = 1; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } class AXI2stb<dag oops, dag iops, Format f, InstrItinClass itin, string asm, list<dag> pattern> @@ -461,7 +467,7 @@ class AXI2stb<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 0; // W bit let Inst{22} = 1; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } // Pre-indexed loads @@ -473,7 +479,7 @@ class AI2ldwpr<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 1; // W bit let Inst{22} = 0; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } class AI2ldbpr<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> @@ -483,7 +489,7 @@ class AI2ldbpr<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 1; // W bit let Inst{22} = 1; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } // Pre-indexed stores @@ -495,7 +501,7 @@ class AI2stwpr<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 1; // W bit let Inst{22} = 0; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } class AI2stbpr<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> @@ -505,7 +511,7 @@ class AI2stbpr<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 1; // W bit let Inst{22} = 1; // B bit let Inst{24} = 1; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } // Post-indexed loads @@ -517,7 +523,7 @@ class AI2ldwpo<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 0; // W bit let Inst{22} = 0; // B bit let Inst{24} = 0; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } class AI2ldbpo<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> @@ -527,7 +533,7 @@ class AI2ldbpo<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 0; // W bit let Inst{22} = 1; // B bit let Inst{24} = 0; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } // Post-indexed stores @@ -539,7 +545,7 @@ class AI2stwpo<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 0; // W bit let Inst{22} = 0; // B bit let Inst{24} = 0; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } class AI2stbpo<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> @@ -549,7 +555,7 @@ class AI2stbpo<dag 
oops, dag iops, Format f, InstrItinClass itin, let Inst{21} = 0; // W bit let Inst{22} = 1; // B bit let Inst{24} = 0; // P bit - let Inst{27-26} = {0,1}; + let Inst{27-26} = 0b01; } // addrmode3 instructions @@ -977,7 +983,7 @@ class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3, Encoding { let Inst{31-27} = opcod1; let Inst{15-14} = opcod2; - let Inst{12} = opcod3; + let Inst{12} = opcod3; } // BR_JT instructions @@ -1099,13 +1105,13 @@ class T1Special<bits<4> opcode> : Encoding16 { // A6.2.4 Load/store single data item encoding. class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 { let Inst{15-12} = opA; - let Inst{11-9} = opB; + let Inst{11-9} = opB; } -class T1LdSt<bits<3> opB> : T1LoadStore<0b0101, opB>; +class T1LdSt<bits<3> opB> : T1LoadStore<0b0101, opB>; class T1LdSt4Imm<bits<3> opB> : T1LoadStore<0b0110, opB>; // Immediate, 4 bytes class T1LdSt1Imm<bits<3> opB> : T1LoadStore<0b0111, opB>; // Immediate, 1 byte class T1LdSt2Imm<bits<3> opB> : T1LoadStore<0b1000, opB>; // Immediate, 2 bytes -class T1LdStSP<bits<3> opB> : T1LoadStore<0b1001, opB>; // SP relative +class T1LdStSP<bits<3> opB> : T1LoadStore<0b1001, opB>; // SP relative // A6.2.5 Miscellaneous 16-bit instructions encoding. class T1Misc<bits<7> opcode> : Encoding16 { @@ -1125,9 +1131,10 @@ class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, list<Predicate> Predicates = [IsThumb2]; } -// Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as -// an input operand since by default it's a zero register. It will -// become an implicit def once it's "flipped". +// Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as an +// input operand since by default it's a zero register. It will become an +// implicit def once it's "flipped". +// // FIXME: This uses unified syntax so {s} comes before {p}. We should make it // more consistent. class Thumb2sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, @@ -1185,11 +1192,11 @@ class T2Ii8s4<bit P, bit W, bit load, dag oops, dag iops, InstrItinClass itin, pattern> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b00; - let Inst{24} = P; - let Inst{23} = ?; // The U bit. - let Inst{22} = 1; - let Inst{21} = W; - let Inst{20} = load; + let Inst{24} = P; + let Inst{23} = ?; // The U bit. + let Inst{22} = 1; + let Inst{21} = W; + let Inst{20} = load; } class T2sI<dag oops, dag iops, InstrItinClass itin, @@ -1225,14 +1232,14 @@ class T2Iidxldst<bit signed, bits<2> opcod, bit load, bit pre, list<Predicate> Predicates = [IsThumb2]; let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; - let Inst{24} = signed; - let Inst{23} = 0; + let Inst{24} = signed; + let Inst{23} = 0; let Inst{22-21} = opcod; - let Inst{20} = load; - let Inst{11} = 1; + let Inst{20} = load; + let Inst{11} = 1; // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed - let Inst{10} = pre; // The P bit. - let Inst{8} = 1; // The W bit. + let Inst{10} = pre; // The P bit. + let Inst{8} = 1; // The W bit. } // Helper class for disassembly only @@ -1243,9 +1250,9 @@ class T2I_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, dag iops, : T2I<oops, iops, itin, opc, asm, pattern> { let Inst{31-27} = 0b11111; let Inst{26-24} = 0b011; - let Inst{23} = long; + let Inst{23} = long; let Inst{22-20} = op22_20; - let Inst{7-4} = op7_4; + let Inst{7-4} = op7_4; } // Tv5Pat - Same as Pat<>, but requires V5T Thumb mode. 
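The W/B/P bit comments running through the AI2* load/store classes above follow one fixed scheme; here is a small hypothetical sketch (plain C++, not part of the patch) of how those addrmode2 control bits compose: bits 27-26 are always 0b01, P (bit 24) is set for offset and pre-indexed forms and clear for post-indexed ones, W (bit 21) requests base writeback, and B (bit 22) selects a byte rather than a word transfer.

#include <cassert>
#include <cstdint>

// Packs only the addrmode2 bits that the AI2* classes above pin down; the
// remaining fields (registers, offset, condition) are filled in elsewhere.
static uint32_t addrMode2Bits(bool P, bool W, bool B) {
  uint32_t Inst = 1u << 26;            // let Inst{27-26} = 0b01;
  if (P) Inst |= 1u << 24;             // P bit: offset / pre-indexed addressing
  if (B) Inst |= 1u << 22;             // B bit: byte access
  if (W) Inst |= 1u << 21;             // W bit: base writeback
  return Inst;
}

int main() {
  uint32_t Offset = addrMode2Bits(true, false, false);   // AI2ldw / AI2stw
  uint32_t Pre    = addrMode2Bits(true, true, false);    // AI2ldwpr / AI2stwpr
  uint32_t Post   = addrMode2Bits(false, false, false);  // AI2ldwpo / AI2stwpo
  assert((Offset >> 26) == 1);                           // Inst{27-26} == 0b01
  assert((Pre & (1u << 21)) && !(Post & (1u << 24)));
  return 0;
}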
@@ -1325,9 +1332,9 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, } // Load / store multiple -class AXDI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, +class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin, string asm, string cstr, list<dag> pattern> - : VFPXI<oops, iops, AddrMode5, Size4Bytes, im, + : VFPXI<oops, iops, AddrMode4, Size4Bytes, im, VFPLdStMulFrm, itin, asm, cstr, pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-25} = 0b110; @@ -1337,9 +1344,9 @@ class AXDI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, let D = VFPNeonDomain; } -class AXSI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, +class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin, string asm, string cstr, list<dag> pattern> - : VFPXI<oops, iops, AddrMode5, Size4Bytes, im, + : VFPXI<oops, iops, AddrMode4, Size4Bytes, im, VFPLdStMulFrm, itin, asm, cstr, pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-25} = 0b110; @@ -1367,8 +1374,8 @@ class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, let Inst{27-23} = opcod1; let Inst{21-20} = opcod2; let Inst{11-8} = 0b1011; - let Inst{6} = op6; - let Inst{4} = op4; + let Inst{6} = op6; + let Inst{4} = op4; } // Double precision, binary, VML[AS] (for additional predicate) @@ -1379,12 +1386,11 @@ class ADbI_vmlX<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, let Inst{27-23} = opcod1; let Inst{21-20} = opcod2; let Inst{11-8} = 0b1011; - let Inst{6} = op6; - let Inst{4} = op4; + let Inst{6} = op6; + let Inst{4} = op4; list<Predicate> Predicates = [HasVFP2, UseVMLx]; } - // Single precision, unary class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, @@ -1415,8 +1421,8 @@ class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, let Inst{27-23} = opcod1; let Inst{21-20} = opcod2; let Inst{11-8} = 0b1010; - let Inst{6} = op6; - let Inst{4} = op4; + let Inst{6} = op6; + let Inst{4} = op4; } // Single precision binary, if no NEON @@ -1521,10 +1527,18 @@ class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, : NeonI<oops, iops, AddrMode6, IndexModeNone, NLdStFrm, itin, opc, dt, asm, cstr, pattern> { let Inst{31-24} = 0b11110100; - let Inst{23} = op23; + let Inst{23} = op23; let Inst{21-20} = op21_20; - let Inst{11-8} = op11_8; - let Inst{7-4} = op7_4; + let Inst{11-8} = op11_8; + let Inst{7-4} = op7_4; +} + +class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr> + : InstARM<AddrMode6, Size4Bytes, IndexModeNone, Pseudo, NeonDomain, cstr, + itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + list<Predicate> Predicates = [HasNEON]; } class NDataI<dag oops, dag iops, Format f, InstrItinClass itin, @@ -1548,13 +1562,13 @@ class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, string opc, string dt, string asm, string cstr, list<dag> pattern> : NDataI<oops, iops, N1RegModImmFrm, itin, opc, dt, asm, cstr, pattern> { - let Inst{23} = op23; + let Inst{23} = op23; let Inst{21-19} = op21_19; - let Inst{11-8} = op11_8; - let Inst{7} = op7; - let Inst{6} = op6; - let Inst{5} = op5; - let Inst{4} = op4; + let Inst{11-8} = op11_8; + let Inst{7} = op7; + let Inst{6} = op6; + let Inst{5} = op5; + let Inst{4} = op4; } // NEON 2 vector register format. 
@@ -1567,9 +1581,9 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, let Inst{21-20} = op21_20; let Inst{19-18} = op19_18; let Inst{17-16} = op17_16; - let Inst{11-7} = op11_7; - let Inst{6} = op6; - let Inst{4} = op4; + let Inst{11-7} = op11_7; + let Inst{6} = op6; + let Inst{4} = op4; } // Same as N2V except it doesn't have a datatype suffix. @@ -1582,9 +1596,9 @@ class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, let Inst{21-20} = op21_20; let Inst{19-18} = op19_18; let Inst{17-16} = op17_16; - let Inst{11-7} = op11_7; - let Inst{6} = op6; - let Inst{4} = op4; + let Inst{11-7} = op11_7; + let Inst{6} = op6; + let Inst{4} = op4; } // NEON 2 vector register with immediate. @@ -1592,12 +1606,12 @@ class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { - let Inst{24} = op24; - let Inst{23} = op23; + let Inst{24} = op24; + let Inst{23} = op23; let Inst{11-8} = op11_8; - let Inst{7} = op7; - let Inst{6} = op6; - let Inst{4} = op4; + let Inst{7} = op7; + let Inst{6} = op6; + let Inst{4} = op4; } // NEON 3 vector register format. @@ -1605,12 +1619,12 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { - let Inst{24} = op24; - let Inst{23} = op23; + let Inst{24} = op24; + let Inst{23} = op23; let Inst{21-20} = op21_20; - let Inst{11-8} = op11_8; - let Inst{6} = op6; - let Inst{4} = op4; + let Inst{11-8} = op11_8; + let Inst{6} = op6; + let Inst{4} = op4; } // Same as N3V except it doesn't have a data type suffix. @@ -1619,12 +1633,12 @@ class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> : NDataXI<oops, iops, f, itin, opc, asm, cstr, pattern> { - let Inst{24} = op24; - let Inst{23} = op23; + let Inst{24} = op24; + let Inst{23} = op23; let Inst{21-20} = op21_20; - let Inst{11-8} = op11_8; - let Inst{6} = op6; - let Inst{4} = op4; + let Inst{11-8} = op11_8; + let Inst{6} = op6; + let Inst{4} = op4; } // NEON VMOVs between scalar and core registers. 
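Before moving on to the scalar/core-register forms, note that the N3V field placement above is mechanical enough to render in plain C++; the sketch below is illustration only (hypothetical helper, not part of the patch), packing just the bits the TableGen class pins down.

#include <cstdint>

// Mirrors the 'let Inst{...}' assignments of class N3V; the fixed NEON
// encoding prefix comes from the parent NDataI class and is omitted here.
static uint32_t packN3V(unsigned op24, unsigned op23, unsigned op21_20,
                        unsigned op11_8, unsigned op6, unsigned op4) {
  return (op24 << 24) | (op23 << 23) | (op21_20 << 20) |
         (op11_8 << 8) | (op6 << 6) | (op4 << 4);
}

int main() {
  // e.g. a 3-register form with op21_20 = 0b10 and op11_8 = 0b1000.
  return packN3V(0, 0, 2, 8, 1, 0) ==
                 ((2u << 20) | (8u << 8) | (1u << 6)) ? 0 : 1;
}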
@@ -1634,9 +1648,9 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, : InstARM<AddrModeNone, Size4Bytes, IndexModeNone, f, GenericDomain, "", itin> { let Inst{27-20} = opcod1; - let Inst{11-8} = opcod2; - let Inst{6-5} = opcod3; - let Inst{4} = 1; + let Inst{11-8} = opcod2; + let Inst{6-5} = opcod3; + let Inst{4} = 1; let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); @@ -1670,9 +1684,9 @@ class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops, let Inst{24-23} = 0b11; let Inst{21-20} = 0b11; let Inst{19-16} = op19_16; - let Inst{11-7} = 0b11000; - let Inst{6} = op6; - let Inst{4} = 0; + let Inst{11-7} = 0b11000; + let Inst{6} = op6; + let Inst{4} = 0; } // NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 51fc1522485fa..e66f9b9ad0ac5 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -44,6 +44,10 @@ def SDT_ARMBCC_i64 : SDTypeProfile<0, 6, SDTCisVT<3, i32>, SDTCisVT<4, i32>, SDTCisVT<5, OtherVT>]>; +def SDT_ARMAnd : SDTypeProfile<1, 2, + [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; + def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, @@ -54,13 +58,16 @@ def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>; -def SDT_ARMMEMBARRIERV7 : SDTypeProfile<0, 0, []>; -def SDT_ARMSYNCBARRIERV7 : SDTypeProfile<0, 0, []>; -def SDT_ARMMEMBARRIERV6 : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def SDT_ARMSYNCBARRIERV6 : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDT_ARMMEMBARRIER : SDTypeProfile<0, 0, []>; +def SDT_ARMSYNCBARRIER : SDTypeProfile<0, 0, []>; +def SDT_ARMMEMBARRIERMCR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDT_ARMSYNCBARRIERMCR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; + // Node definitions. 
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>; @@ -99,11 +106,14 @@ def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT, def ARMBcci64 : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64, [SDNPHasChain]>; +def ARMand : SDNode<"ARMISD::AND", SDT_ARMAnd, + [SDNPOutFlag]>; + def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp, [SDNPOutFlag]>; def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp, - [SDNPOutFlag,SDNPCommutative]>; + [SDNPOutFlag, SDNPCommutative]>; def ARMpic_add : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>; @@ -117,51 +127,54 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP", SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain]>; -def ARMMemBarrierV7 : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIERV7, - [SDNPHasChain]>; -def ARMSyncBarrierV7 : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERV7, - [SDNPHasChain]>; -def ARMMemBarrierV6 : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIERV6, - [SDNPHasChain]>; -def ARMSyncBarrierV6 : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERV6, - [SDNPHasChain]>; +def ARMMemBarrier : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER, + [SDNPHasChain]>; +def ARMSyncBarrier : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIER, + [SDNPHasChain]>; +def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIERMCR, + [SDNPHasChain]>; +def ARMSyncBarrierMCR : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERMCR, + [SDNPHasChain]>; def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, [SDNPHasChain, SDNPOptInFlag, SDNPVariadic]>; + +def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; + //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. 
// -def HasV4T : Predicate<"Subtarget->hasV4TOps()">; -def NoV4T : Predicate<"!Subtarget->hasV4TOps()">; -def HasV5T : Predicate<"Subtarget->hasV5TOps()">; -def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">; -def HasV6 : Predicate<"Subtarget->hasV6Ops()">; -def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">; -def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; -def HasV7 : Predicate<"Subtarget->hasV7Ops()">; -def NoVFP : Predicate<"!Subtarget->hasVFP2()">; -def HasVFP2 : Predicate<"Subtarget->hasVFP2()">; -def HasVFP3 : Predicate<"Subtarget->hasVFP3()">; -def HasNEON : Predicate<"Subtarget->hasNEON()">; -def HasDivide : Predicate<"Subtarget->hasDivide()">; +def HasV4T : Predicate<"Subtarget->hasV4TOps()">; +def NoV4T : Predicate<"!Subtarget->hasV4TOps()">; +def HasV5T : Predicate<"Subtarget->hasV5TOps()">; +def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">; +def HasV6 : Predicate<"Subtarget->hasV6Ops()">; +def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">; +def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; +def HasV7 : Predicate<"Subtarget->hasV7Ops()">; +def NoVFP : Predicate<"!Subtarget->hasVFP2()">; +def HasVFP2 : Predicate<"Subtarget->hasVFP2()">; +def HasVFP3 : Predicate<"Subtarget->hasVFP3()">; +def HasNEON : Predicate<"Subtarget->hasNEON()">; +def HasDivide : Predicate<"Subtarget->hasDivide()">; def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">; -def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; +def HasDB : Predicate<"Subtarget->hasDataBarrier()">; +def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">; -def IsThumb : Predicate<"Subtarget->isThumb()">; -def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">; -def IsThumb2 : Predicate<"Subtarget->isThumb2()">; -def IsARM : Predicate<"!Subtarget->isThumb()">; -def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; -def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">; +def IsThumb : Predicate<"Subtarget->isThumb()">; +def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">; +def IsThumb2 : Predicate<"Subtarget->isThumb2()">; +def IsARM : Predicate<"!Subtarget->isThumb()">; +def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; +def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">; // FIXME: Eventually this will be just "hasV6T2Ops". -def UseMovt : Predicate<"Subtarget->useMovt()">; -def DontUseMovt : Predicate<"!Subtarget->useMovt()">; - -def UseVMLx : Predicate<"Subtarget->useVMLx()">; +def UseMovt : Predicate<"Subtarget->useMovt()">; +def DontUseMovt : Predicate<"!Subtarget->useMovt()">; +def UseVMLx : Predicate<"Subtarget->useVMLx()">; //===----------------------------------------------------------------------===// // ARM Flag Definitions. @@ -221,29 +234,12 @@ def sext_16_node : PatLeaf<(i32 GPR:$a), [{ /// e.g., 0xf000ffff def bf_inv_mask_imm : Operand<i32>, PatLeaf<(imm), [{ - uint32_t v = (uint32_t)N->getZExtValue(); - if (v == 0xffffffff) - return 0; - // there can be 1's on either or both "outsides", all the "inside" - // bits must be 0's - unsigned int lsb = 0, msb = 31; - while (v & (1 << msb)) --msb; - while (v & (1 << lsb)) ++lsb; - for (unsigned int i = lsb; i <= msb; ++i) { - if (v & (1 << i)) - return 0; - } - return 1; + return ARM::isBitFieldInvertedMask(N->getZExtValue()); }] > { let PrintMethod = "printBitfieldInvMaskImmOperand"; } /// Split a 32-bit immediate into two 16 bit parts. 
-def lo16 : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() & 0xffff, - MVT::i32); -}]>; - def hi16 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32); }]>; @@ -306,6 +302,13 @@ def pclabel : Operand<i32> { let PrintMethod = "printPCLabel"; } +// shift_imm: An integer that encodes a shift amount and the type of shift +// (currently either asr or lsl) using the same encoding used for the +// immediates in so_reg operands. +def shift_imm : Operand<i32> { + let PrintMethod = "printShiftImmOperand"; +} + // shifter_operand operands: so_reg and so_imm. def so_reg : Operand<i32>, // reg reg imm ComplexPattern<i32, 3, "SelectShifterOperandReg", @@ -319,10 +322,7 @@ def so_reg : Operand<i32>, // reg reg imm // represented in the imm field in the same 12-bit form that they are encoded // into so_imm instructions: the 8-bit immediate is the least significant bits // [bits 0-7], the 4-bit shift amount is the next 4 bits [bits 8-11]. -def so_imm : Operand<i32>, - PatLeaf<(imm), [{ - return ARM_AM::getSOImmVal(N->getZExtValue()) != -1; - }]> { +def so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_so_imm(N); }]> { let PrintMethod = "printSOImmOperand"; } @@ -452,11 +452,15 @@ include "ARMInstrFormats.td" /// binop that produces a value. multiclass AsI1_bin_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { + // The register-immediate version is re-materializable. This is useful + // in particular for taking the address of a local. + let isReMaterializable = 1 in { def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, IIC_iALUi, opc, "\t$dst, $a, $b", [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> { let Inst{25} = 1; } + } def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, IIC_iALUr, opc, "\t$dst, $a, $b", [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> { @@ -502,7 +506,7 @@ multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, /// AI1_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test /// patterns. Similar to AsI1_bin_irs except the instruction does not produce /// a explicit result, only implicitly set CPSR. 
-let Defs = [CPSR] in { +let isCompare = 1, Defs = [CPSR] in { multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { def ri : AI1<opcod, (outs), (ins GPR:$a, so_imm:$b), DPFrm, IIC_iCMPi, @@ -1117,7 +1121,7 @@ let isBranch = 1, isTerminator = 1 in { let isNotDuplicable = 1, isIndirectBranch = 1 in { def BR_JTr : JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id), - IIC_Br, "mov\tpc, $target \n$jt", + IIC_Br, "mov\tpc, $target$jt", [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]> { let Inst{11-4} = 0b00000000; let Inst{15-12} = 0b1111; @@ -1127,7 +1131,7 @@ let isBranch = 1, isTerminator = 1 in { } def BR_JTm : JTI<(outs), (ins addrmode2:$target, jtblock_operand:$jt, i32imm:$id), - IIC_Br, "ldr\tpc, $target \n$jt", + IIC_Br, "ldr\tpc, $target$jt", [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt, imm:$id)]> { let Inst{15-12} = 0b1111; @@ -1139,7 +1143,7 @@ let isBranch = 1, isTerminator = 1 in { } def BR_JTadd : JTI<(outs), (ins GPR:$target, GPR:$idx, jtblock_operand:$jt, i32imm:$id), - IIC_Br, "add\tpc, $target, $idx \n$jt", + IIC_Br, "add\tpc, $target, $idx$jt", [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt, imm:$id)]> { let Inst{15-12} = 0b1111; @@ -1573,8 +1577,12 @@ defm UXTH : AI_unary_rrot<0b01101111, defm UXTB16 : AI_unary_rrot<0b01101100, "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; -def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF), - (UXTB16r_rot GPR:$Src, 24)>; +// FIXME: This pattern incorrectly assumes the shl operator is a rotate. +// The transformation should probably be done as a combiner action +// instead so we can include a check for masking back in the upper +// eight bits of the source into the lower eight bits of the result. +//def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF), +// (UXTB16r_rot GPR:$Src, 24)>; def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF), (UXTB16r_rot GPR:$Src, 8)>; @@ -1631,16 +1639,24 @@ defm ADCS : AI1_adde_sube_s_irs<0b0101, "adcs", defm SBCS : AI1_adde_sube_s_irs<0b0110, "sbcs", BinOpFrag<(sube_live_carry node:$LHS, node:$RHS) >>; -// These don't define reg/reg forms, because they are handled above. def RSBri : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm, - IIC_iALUi, "rsb", "\t$dst, $a, $b", - [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]> { + IIC_iALUi, "rsb", "\t$dst, $a, $b", + [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]> { let Inst{25} = 1; } +// The reg/reg form is only defined for the disassembler; for codegen it is +// equivalent to SUBrr. +def RSBrr : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, + IIC_iALUr, "rsb", "\t$dst, $a, $b", + [/* For disassembly only; pattern left blank */]> { + let Inst{25} = 0; + let Inst{11-4} = 0b00000000; +} + def RSBrs : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, - IIC_iALUsr, "rsb", "\t$dst, $a, $b", - [(set GPR:$dst, (sub so_reg:$b, GPR:$a))]> { + IIC_iALUsr, "rsb", "\t$dst, $a, $b", + [(set GPR:$dst, (sub so_reg:$b, GPR:$a))]> { let Inst{25} = 0; } @@ -1667,6 +1683,14 @@ def RSCri : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), Requires<[IsARM]> { let Inst{25} = 1; } +// The reg/reg form is only defined for the disassembler; for codegen it is +// equivalent to SUBrr. 
+def RSCrr : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, GPR:$b), + DPFrm, IIC_iALUr, "rsc", "\t$dst, $a, $b", + [/* For disassembly only; pattern left blank */]> { + let Inst{25} = 0; + let Inst{11-4} = 0b00000000; +} def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm, IIC_iALUsr, "rsc", "\t$dst, $a, $b", [(set GPR:$dst, (sube_dead_carry so_reg:$b, GPR:$a))]>, @@ -1716,24 +1740,26 @@ def : ARMPat<(adde GPR:$src, so_imm_not:$imm), // ARM Arithmetic Instruction -- for disassembly only // GPR:$dst = GPR:$a op GPR:$b -class AAI<bits<8> op27_20, bits<4> op7_4, string opc> +class AAI<bits<8> op27_20, bits<4> op7_4, string opc, + list<dag> pattern = [/* For disassembly only; pattern left blank */]> : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, IIC_iALUr, - opc, "\t$dst, $a, $b", - [/* For disassembly only; pattern left blank */]> { + opc, "\t$dst, $a, $b", pattern> { let Inst{27-20} = op27_20; let Inst{7-4} = op7_4; } // Saturating add/subtract -- for disassembly only -def QADD : AAI<0b00010000, 0b0101, "qadd">; +def QADD : AAI<0b00010000, 0b0101, "qadd", + [(set GPR:$dst, (int_arm_qadd GPR:$a, GPR:$b))]>; def QADD16 : AAI<0b01100010, 0b0001, "qadd16">; def QADD8 : AAI<0b01100010, 0b1001, "qadd8">; def QASX : AAI<0b01100010, 0b0011, "qasx">; def QDADD : AAI<0b00010100, 0b0101, "qdadd">; def QDSUB : AAI<0b00010110, 0b0101, "qdsub">; def QSAX : AAI<0b01100010, 0b0101, "qsax">; -def QSUB : AAI<0b00010010, 0b0101, "qsub">; +def QSUB : AAI<0b00010010, 0b0101, "qsub", + [(set GPR:$dst, (int_arm_qsub GPR:$a, GPR:$b))]>; def QSUB16 : AAI<0b01100010, 0b0111, "qsub16">; def QSUB8 : AAI<0b01100010, 0b1111, "qsub8">; def UQADD16 : AAI<0b01100110, 0b0001, "uqadd16">; @@ -1793,54 +1819,45 @@ def USADA8 : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), // Signed/Unsigned saturate -- for disassembly only -def SSATlsl : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, i32imm:$shamt), - DPFrm, NoItinerary, "ssat", "\t$dst, $bit_pos, $a, lsl $shamt", - [/* For disassembly only; pattern left blank */]> { - let Inst{27-21} = 0b0110101; - let Inst{6-4} = 0b001; -} - -def SSATasr : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, i32imm:$shamt), - DPFrm, NoItinerary, "ssat", "\t$dst, $bit_pos, $a, asr $shamt", - [/* For disassembly only; pattern left blank */]> { +def SSAT : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, shift_imm:$sh), + SatFrm, NoItinerary, "ssat", "\t$dst, $bit_pos, $a$sh", + [/* For disassembly only; pattern left blank */]> { let Inst{27-21} = 0b0110101; - let Inst{6-4} = 0b101; + let Inst{5-4} = 0b01; } -def SSAT16 : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), DPFrm, +def SSAT16 : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), SatFrm, NoItinerary, "ssat16", "\t$dst, $bit_pos, $a", [/* For disassembly only; pattern left blank */]> { let Inst{27-20} = 0b01101010; let Inst{7-4} = 0b0011; } -def USATlsl : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, i32imm:$shamt), - DPFrm, NoItinerary, "usat", "\t$dst, $bit_pos, $a, lsl $shamt", - [/* For disassembly only; pattern left blank */]> { - let Inst{27-21} = 0b0110111; - let Inst{6-4} = 0b001; -} - -def USATasr : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, i32imm:$shamt), - DPFrm, NoItinerary, "usat", "\t$dst, $bit_pos, $a, asr $shamt", - [/* For disassembly only; pattern left blank */]> { +def USAT : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a, shift_imm:$sh), + SatFrm, NoItinerary, "usat", "\t$dst, $bit_pos, $a$sh", + [/* For disassembly only; pattern left blank */]> { let Inst{27-21} = 
0b0110111; - let Inst{6-4} = 0b101; + let Inst{5-4} = 0b01; } -def USAT16 : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), DPFrm, +def USAT16 : AI<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), SatFrm, NoItinerary, "usat16", "\t$dst, $bit_pos, $a", [/* For disassembly only; pattern left blank */]> { let Inst{27-20} = 0b01101110; let Inst{7-4} = 0b0011; } +def : ARMV6Pat<(int_arm_ssat GPR:$a, imm:$pos), (SSAT imm:$pos, GPR:$a, 0)>; +def : ARMV6Pat<(int_arm_usat GPR:$a, imm:$pos), (USAT imm:$pos, GPR:$a, 0)>; + //===----------------------------------------------------------------------===// // Bitwise Instructions. // defm AND : AsI1_bin_irs<0b0000, "and", BinOpFrag<(and node:$LHS, node:$RHS)>, 1>; +defm ANDS : AI1_bin_s_irs<0b0000, "and", + BinOpFrag<(ARMand node:$LHS, node:$RHS)>, 1>; defm ORR : AsI1_bin_irs<0b1100, "orr", BinOpFrag<(or node:$LHS, node:$RHS)>, 1>; defm EOR : AsI1_bin_irs<0b0001, "eor", @@ -1858,11 +1875,11 @@ def BFC : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm), } // A8.6.18 BFI - Bitfield insert (Encoding A1) -// Added for disassembler with the pattern field purposely left blank. -def BFI : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm), +def BFI : I<(outs GPR:$dst), (ins GPR:$src, GPR:$val, bf_inv_mask_imm:$imm), AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi, - "bfi", "\t$dst, $src, $imm", "", - [/* For disassembly only; pattern left blank */]>, + "bfi", "\t$dst, $val, $imm", "$src = $dst", + [(set GPR:$dst, (ARMbfi GPR:$src, GPR:$val, + bf_inv_mask_imm:$imm))]>, Requires<[IsARM, HasV6T2]> { let Inst{27-21} = 0b0111110; let Inst{6-4} = 0b001; // Rn: Inst{3-0} != 15 @@ -2232,11 +2249,20 @@ def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, let Inst{19-16} = 0b1111; } +def lsl_shift_imm : SDNodeXForm<imm, [{ + unsigned Sh = ARM_AM::getSORegOpc(ARM_AM::lsl, N->getZExtValue()); + return CurDAG->getTargetConstant(Sh, MVT::i32); +}]>; + +def lsl_amt : PatLeaf<(i32 imm), [{ + return (N->getZExtValue() < 32); +}], lsl_shift_imm>; + def PKHBT : AMiscA1I<0b01101000, (outs GPR:$dst), - (ins GPR:$src1, GPR:$src2, i32imm:$shamt), - IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2, lsl $shamt", + (ins GPR:$src1, GPR:$src2, shift_imm:$sh), + IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2$sh", [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF), - (and (shl GPR:$src2, (i32 imm:$shamt)), + (and (shl GPR:$src2, lsl_amt:$sh), 0xFFFF0000)))]>, Requires<[IsARM, HasV6]> { let Inst{6-4} = 0b001; @@ -2245,26 +2271,37 @@ def PKHBT : AMiscA1I<0b01101000, (outs GPR:$dst), // Alternate cases for PKHBT where identities eliminate some nodes. def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)), (PKHBT GPR:$src1, GPR:$src2, 0)>; -def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)), - (PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>; +def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$sh)), + (PKHBT GPR:$src1, GPR:$src2, (lsl_shift_imm imm16_31:$sh))>; + +def asr_shift_imm : SDNodeXForm<imm, [{ + unsigned Sh = ARM_AM::getSORegOpc(ARM_AM::asr, N->getZExtValue()); + return CurDAG->getTargetConstant(Sh, MVT::i32); +}]>; +def asr_amt : PatLeaf<(i32 imm), [{ + return (N->getZExtValue() <= 32); +}], asr_shift_imm>; +// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and +// will match the pattern below. 
def PKHTB : AMiscA1I<0b01101000, (outs GPR:$dst), - (ins GPR:$src1, GPR:$src2, i32imm:$shamt), - IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2, asr $shamt", + (ins GPR:$src1, GPR:$src2, shift_imm:$sh), + IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2$sh", [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000), - (and (sra GPR:$src2, imm16_31:$shamt), - 0xFFFF)))]>, Requires<[IsARM, HasV6]> { + (and (sra GPR:$src2, asr_amt:$sh), + 0xFFFF)))]>, + Requires<[IsARM, HasV6]> { let Inst{6-4} = 0b101; } // Alternate cases for PKHTB where identities eliminate some nodes. Note that // a shift amount of 0 is *not legal* here, it is PKHBT instead. -def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, (i32 16))), - (PKHTB GPR:$src1, GPR:$src2, 16)>; +def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, imm16_31:$sh)), + (PKHTB GPR:$src1, GPR:$src2, (asr_shift_imm imm16_31:$sh))>; def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), - (and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)), - (PKHTB GPR:$src1, GPR:$src2, imm1_15:$shamt)>; + (and (srl GPR:$src2, imm1_15:$sh), 0xFFFF)), + (PKHTB GPR:$src1, GPR:$src2, (asr_shift_imm imm1_15:$sh))>; //===----------------------------------------------------------------------===// // Comparison Instructions... @@ -2272,8 +2309,52 @@ def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), defm CMP : AI1_cmp_irs<0b1010, "cmp", BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; -//FIXME: Disable CMN, as CCodes are backwards from compare expectations -// Compare-to-zero still works out, just not the relationals + +// FIXME: There seems to be a (potential) hardware bug with the CMN instruction +// and comparison with 0. These two pieces of code should give identical +// results: +// +// rsbs r1, r1, 0 +// cmp r0, r1 +// mov r0, #0 +// it ls +// mov r0, #1 +// +// and: +// +// cmn r0, r1 +// mov r0, #0 +// it ls +// mov r0, #1 +// +// However, the CMN gives the *opposite* result when r1 is 0. This is because +// the carry flag is set in the CMP case but not in the CMN case. In short, the +// CMP instruction doesn't perform a truncate of the (logical) NOT of 0 plus the +// value of r0 and the carry bit (because the "carry bit" parameter to +// AddWithCarry is defined as 1 in this case, the carry flag will always be set +// when r0 >= 0). The CMN instruction doesn't perform a NOT of 0 so there is +// never a "carry" when this AddWithCarry is performed (because the "carry bit" +// parameter to AddWithCarry is defined as 0). +// +// The AddWithCarry in the CMP case seems to be relying upon the identity: +// +// ~x + 1 = -x +// +// However when x is 0 and unsigned, this doesn't hold: +// +// x = 0 +// ~x = 0xFFFF FFFF +// ~x + 1 = 0x1 0000 0000 +// (-x = 0) != (0x1 0000 0000 = ~x + 1) +// +// Therefore, we should disable *all* versions of CMN, especially when comparing +// against zero, until we can limit when the CMN instruction is used (when we +// know that the RHS is not 0) or when we have a hardware fix for this. +// +// (See the ARM docs for the "AddWithCarry" pseudo-code.) +// +// This is related to <rdar://problem/7569620>. 
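The comment above traces the CMP/CMN disagreement to the carry-out of AddWithCarry. A minimal C++ model of the ARM ARM pseudo-code (the struct and function names here are illustrative, not LLVM or ARM APIs) shows the flag difference directly:

#include <cstdint>
#include <cstdio>

// AddWithCarry from the ARM ARM pseudo-code: 32-bit result plus carry-out.
struct AWCResult { uint32_t Result; bool Carry; };

static AWCResult AddWithCarry(uint32_t X, uint32_t Y, bool CarryIn) {
  uint64_t Sum = (uint64_t)X + (uint64_t)Y + (CarryIn ? 1 : 0);
  return { (uint32_t)Sum, Sum > 0xFFFFFFFFULL };
}

int main() {
  uint32_t R0 = 5, R1 = 0;
  AWCResult Cmp = AddWithCarry(R0, ~R1, true);  // CMP r0, r1 = r0 + ~r1 + 1
  AWCResult Cmn = AddWithCarry(R0, R1, false);  // CMN r0, r1 = r0 + r1 + 0
  // With R1 == 0, ~R1 + 1 wraps to 2^32, so CMP always produces a carry
  // while CMN never does; C-dependent conditions (LS, HI, ...) then flip.
  printf("CMP C=%d, CMN C=%d\n", Cmp.Carry, Cmn.Carry);
  return 0;
}

This prints CMP C=1, CMN C=0, matching the "opposite result" described in the comment.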
+// //defm CMN : AI1_cmp_irs<0b1011, "cmn", // BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; @@ -2298,8 +2379,8 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm), let usesCustomInserter = 1, isBranch = 1, isTerminator = 1, Defs = [CPSR] in { def BCCi64 : PseudoInst<(outs), - (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst), - IIC_Br, + (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst), + IIC_Br, "${:comment} B\t$dst GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, imm:$cc", [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>; @@ -2346,102 +2427,63 @@ def MOVCCi : AI1<0b1101, (outs GPR:$dst), // memory barriers protect the atomic sequences let hasSideEffects = 1 in { -def Int_MemBarrierV7 : AInoP<(outs), (ins), - Pseudo, NoItinerary, - "dmb", "", - [(ARMMemBarrierV7)]>, - Requires<[IsARM, HasV7]> { +def DMBsy : AInoP<(outs), (ins), MiscFrm, NoItinerary, "dmb", "", + [(ARMMemBarrier)]>, Requires<[IsARM, HasDB]> { let Inst{31-4} = 0xf57ff05; // FIXME: add support for options other than a full system DMB // See DMB disassembly-only variants below. let Inst{3-0} = 0b1111; } -def Int_SyncBarrierV7 : AInoP<(outs), (ins), - Pseudo, NoItinerary, - "dsb", "", - [(ARMSyncBarrierV7)]>, - Requires<[IsARM, HasV7]> { +def DSBsy : AInoP<(outs), (ins), MiscFrm, NoItinerary, "dsb", "", + [(ARMSyncBarrier)]>, Requires<[IsARM, HasDB]> { let Inst{31-4} = 0xf57ff04; // FIXME: add support for options other than a full system DSB // See DSB disassembly-only variants below. let Inst{3-0} = 0b1111; } -def Int_MemBarrierV6 : AInoP<(outs), (ins GPR:$zero), - Pseudo, NoItinerary, +def DMB_MCR : AInoP<(outs), (ins GPR:$zero), MiscFrm, NoItinerary, "mcr", "\tp15, 0, $zero, c7, c10, 5", - [(ARMMemBarrierV6 GPR:$zero)]>, + [(ARMMemBarrierMCR GPR:$zero)]>, Requires<[IsARM, HasV6]> { // FIXME: add support for options other than a full system DMB // FIXME: add encoding } -def Int_SyncBarrierV6 : AInoP<(outs), (ins GPR:$zero), - Pseudo, NoItinerary, +def DSB_MCR : AInoP<(outs), (ins GPR:$zero), MiscFrm, NoItinerary, "mcr", "\tp15, 0, $zero, c7, c10, 4", - [(ARMSyncBarrierV6 GPR:$zero)]>, + [(ARMSyncBarrierMCR GPR:$zero)]>, Requires<[IsARM, HasV6]> { // FIXME: add support for options other than a full system DSB // FIXME: add encoding } } -// Helper class for multiclass MemB -- for disassembly only -class AMBI<string opc, string asm> - : AInoP<(outs), (ins), MiscFrm, NoItinerary, opc, asm, - [/* For disassembly only; pattern left blank */]>, - Requires<[IsARM, HasV7]> { - let Inst{31-20} = 0xf57; -} - -multiclass MemB<bits<4> op7_4, string opc> { - - def st : AMBI<opc, "\tst"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b1110; - } - - def ish : AMBI<opc, "\tish"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b1011; - } - - def ishst : AMBI<opc, "\tishst"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b1010; - } - - def nsh : AMBI<opc, "\tnsh"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b0111; - } - - def nshst : AMBI<opc, "\tnshst"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b0110; - } +// Memory Barrier Operations Variants -- for disassembly only - def osh : AMBI<opc, "\tosh"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b0011; - } +def memb_opt : Operand<i32> { + let PrintMethod = "printMemBOption"; +} - def oshst : AMBI<opc, "\toshst"> { - let Inst{7-4} = op7_4; - let Inst{3-0} = 0b0010; - } +class AMBI<bits<4> op7_4, string opc> + : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, opc, "\t$opt", + [/* For disassembly only; pattern 
left blank */]>, + Requires<[IsARM, HasDB]> { + let Inst{31-8} = 0xf57ff0; + let Inst{7-4} = op7_4; } // These DMB variants are for disassembly only. -defm DMB : MemB<0b0101, "dmb">; +def DMBvar : AMBI<0b0101, "dmb">; // These DSB variants are for disassembly only. -defm DSB : MemB<0b0100, "dsb">; +def DSBvar : AMBI<0b0100, "dsb">; // ISB has only full system option -- for disassembly only -def ISBsy : AMBI<"isb", ""> { - let Inst{7-4} = 0b0110; +def ISBsy : AInoP<(outs), (ins), MiscFrm, NoItinerary, "isb", "", []>, + Requires<[IsARM, HasDB]> { + let Inst{31-4} = 0xf57ff06; let Inst{3-0} = 0b1111; } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 7f7eb980abe83..4d2f1169061ff 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -93,6 +93,11 @@ def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>; def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>; def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>; +def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisSameAs<1, 2>]>; +def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>; +def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>; + def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>]>; def NEONfmax : SDNode<"ARMISD::FMAX", SDTARMFMAX>; @@ -100,14 +105,14 @@ def NEONfmin : SDNode<"ARMISD::FMIN", SDTARMFMAX>; def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{ ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); - unsigned EltBits; + unsigned EltBits = 0; uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); return (EltBits == 32 && EltVal == 0); }]>; def NEONimmAllOnesV: PatLeaf<(NEONvmovImm (i32 timm)), [{ ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); - unsigned EltBits; + unsigned EltBits = 0; uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); return (EltBits == 8 && EltVal == 0xff); }]>; @@ -124,15 +129,16 @@ def nModImm : Operand<i32> { // NEON load / store instructions //===----------------------------------------------------------------------===// -let mayLoad = 1, neverHasSideEffects = 1 in { // Use vldmia to load a Q register as a D register pair. // This is equivalent to VLDMD except that it has a Q register operand // instead of a pair of D registers. def VLDMQ - : AXDI5<(outs QPR:$dst), (ins addrmode5:$addr, pred:$p), + : AXDI4<(outs QPR:$dst), (ins addrmode4:$addr, pred:$p), IndexModeNone, IIC_fpLoadm, - "vldm${addr:submode}${p}\t${addr:base}, ${dst:dregpair}", "", []>; + "vldm${addr:submode}${p}\t$addr, ${dst:dregpair}", "", + [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]>; +let mayLoad = 1, neverHasSideEffects = 1 in { // Use vld1 to load a Q register as a D register pair. // This alternative to VLDMQ allows an alignment to be specified. // This is equivalent to VLD1q64 except that it has a Q register operand. @@ -141,15 +147,16 @@ def VLD1q IIC_VLD1, "vld1", "64", "${dst:dregpair}, $addr", "", []>; } // mayLoad = 1, neverHasSideEffects = 1 -let mayStore = 1, neverHasSideEffects = 1 in { // Use vstmia to store a Q register as a D register pair. // This is equivalent to VSTMD except that it has a Q register operand // instead of a pair of D registers. 
def VSTMQ - : AXDI5<(outs), (ins QPR:$src, addrmode5:$addr, pred:$p), + : AXDI4<(outs), (ins QPR:$src, addrmode4:$addr, pred:$p), IndexModeNone, IIC_fpStorem, - "vstm${addr:submode}${p}\t${addr:base}, ${src:dregpair}", "", []>; + "vstm${addr:submode}${p}\t$addr, ${src:dregpair}", "", + [(store (v2f64 QPR:$src), addrmode4:$addr)]>; +let mayStore = 1, neverHasSideEffects = 1 in { // Use vst1 to store a Q register as a D register pair. // This alternative to VSTMQ allows an alignment to be specified. // This is equivalent to VST1q64 except that it has a Q register operand. @@ -160,6 +167,25 @@ def VST1q let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { +// Classes for VLD* pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VLDQPseudo + : PseudoNLdSt<(outs QPR:$dst), (ins addrmode6:$addr), IIC_VST, "">; +class VLDQWBPseudo + : PseudoNLdSt<(outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VST, + "$addr.addr = $wb">; +class VLDQQPseudo + : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr), IIC_VST, "">; +class VLDQQWBPseudo + : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VST, + "$addr.addr = $wb">; +class VLDQQQQWBPseudo + : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), IIC_VST, + "$addr.addr = $wb, $src = $dst">; + // VLD1 : Vector Load (multiple single elements) class VLD1D<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), @@ -180,6 +206,11 @@ def VLD1q16 : VLD1Q<0b0100, "16">; def VLD1q32 : VLD1Q<0b1000, "32">; def VLD1q64 : VLD1Q<0b1100, "64">; +def VLD1q8Pseudo : VLDQPseudo; +def VLD1q16Pseudo : VLDQPseudo; +def VLD1q32Pseudo : VLDQPseudo; +def VLD1q64Pseudo : VLDQPseudo; + // ...with address register writeback: class VLD1DWB<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst, GPR:$wb), @@ -202,6 +233,11 @@ def VLD1q16_UPD : VLD1QWB<0b0100, "16">; def VLD1q32_UPD : VLD1QWB<0b1000, "32">; def VLD1q64_UPD : VLD1QWB<0b1100, "64">; +def VLD1q8Pseudo_UPD : VLDQWBPseudo; +def VLD1q16Pseudo_UPD : VLDQWBPseudo; +def VLD1q32Pseudo_UPD : VLDQWBPseudo; +def VLD1q64Pseudo_UPD : VLDQWBPseudo; + // ...with 3 registers (some of these are only for the disassembler): class VLD1D3<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), @@ -222,6 +258,9 @@ def VLD1d16T_UPD : VLD1D3WB<0b0100, "16">; def VLD1d32T_UPD : VLD1D3WB<0b1000, "32">; def VLD1d64T_UPD : VLD1D3WB<0b1100, "64">; +def VLD1d64TPseudo : VLDQQPseudo; +def VLD1d64TPseudo_UPD : VLDQQWBPseudo; + // ...with 4 registers (some of these are only for the disassembler): class VLD1D4<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), @@ -244,6 +283,9 @@ def VLD1d16Q_UPD : VLD1D4WB<0b0100, "16">; def VLD1d32Q_UPD : VLD1D4WB<0b1000, "32">; def VLD1d64Q_UPD : VLD1D4WB<0b1100, "64">; +def VLD1d64QPseudo : VLDQQPseudo; +def VLD1d64QPseudo_UPD : VLDQQWBPseudo; + // VLD2 : Vector Load (multiple 2-element structures) class VLD2D<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2), @@ -263,6 +305,14 @@ def VLD2q8 : VLD2Q<0b0000, "8">; def VLD2q16 : VLD2Q<0b0100, "16">; def VLD2q32 : VLD2Q<0b1000, "32">; +def VLD2d8Pseudo : VLDQPseudo; +def VLD2d16Pseudo : VLDQPseudo; +def VLD2d32Pseudo : VLDQPseudo; + +def VLD2q8Pseudo : VLDQQPseudo; +def VLD2q16Pseudo : 
VLDQQPseudo; +def VLD2q32Pseudo : VLDQQPseudo; + // ...with address register writeback: class VLD2DWB<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), @@ -284,6 +334,14 @@ def VLD2q8_UPD : VLD2QWB<0b0000, "8">; def VLD2q16_UPD : VLD2QWB<0b0100, "16">; def VLD2q32_UPD : VLD2QWB<0b1000, "32">; +def VLD2d8Pseudo_UPD : VLDQWBPseudo; +def VLD2d16Pseudo_UPD : VLDQWBPseudo; +def VLD2d32Pseudo_UPD : VLDQWBPseudo; + +def VLD2q8Pseudo_UPD : VLDQQWBPseudo; +def VLD2q16Pseudo_UPD : VLDQQWBPseudo; +def VLD2q32Pseudo_UPD : VLDQQWBPseudo; + // ...with double-spaced registers (for disassembly only): def VLD2b8 : VLD2D<0b1001, 0b0000, "8">; def VLD2b16 : VLD2D<0b1001, 0b0100, "16">; @@ -302,6 +360,10 @@ def VLD3d8 : VLD3D<0b0100, 0b0000, "8">; def VLD3d16 : VLD3D<0b0100, 0b0100, "16">; def VLD3d32 : VLD3D<0b0100, 0b1000, "32">; +def VLD3d8Pseudo : VLDQQPseudo; +def VLD3d16Pseudo : VLDQQPseudo; +def VLD3d32Pseudo : VLDQQPseudo; + // ...with address register writeback: class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, @@ -314,6 +376,10 @@ def VLD3d8_UPD : VLD3DWB<0b0100, 0b0000, "8">; def VLD3d16_UPD : VLD3DWB<0b0100, 0b0100, "16">; def VLD3d32_UPD : VLD3DWB<0b0100, 0b1000, "32">; +def VLD3d8Pseudo_UPD : VLDQQWBPseudo; +def VLD3d16Pseudo_UPD : VLDQQWBPseudo; +def VLD3d32Pseudo_UPD : VLDQQWBPseudo; + // ...with double-spaced registers (non-updating versions for disassembly only): def VLD3q8 : VLD3D<0b0101, 0b0000, "8">; def VLD3q16 : VLD3D<0b0101, 0b0100, "16">; @@ -322,10 +388,14 @@ def VLD3q8_UPD : VLD3DWB<0b0101, 0b0000, "8">; def VLD3q16_UPD : VLD3DWB<0b0101, 0b0100, "16">; def VLD3q32_UPD : VLD3DWB<0b0101, 0b1000, "32">; +def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo; +def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo; +def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo; + // ...alternate versions to be allocated odd register numbers: -def VLD3q8odd_UPD : VLD3DWB<0b0101, 0b0000, "8">; -def VLD3q16odd_UPD : VLD3DWB<0b0101, 0b0100, "16">; -def VLD3q32odd_UPD : VLD3DWB<0b0101, 0b1000, "32">; +def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo; // VLD4 : Vector Load (multiple 4-element structures) class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -338,6 +408,10 @@ def VLD4d8 : VLD4D<0b0000, 0b0000, "8">; def VLD4d16 : VLD4D<0b0000, 0b0100, "16">; def VLD4d32 : VLD4D<0b0000, 0b1000, "32">; +def VLD4d8Pseudo : VLDQQPseudo; +def VLD4d16Pseudo : VLDQQPseudo; +def VLD4d32Pseudo : VLDQQPseudo; + // ...with address register writeback: class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, @@ -350,6 +424,10 @@ def VLD4d8_UPD : VLD4DWB<0b0000, 0b0000, "8">; def VLD4d16_UPD : VLD4DWB<0b0000, 0b0100, "16">; def VLD4d32_UPD : VLD4DWB<0b0000, 0b1000, "32">; +def VLD4d8Pseudo_UPD : VLDQQWBPseudo; +def VLD4d16Pseudo_UPD : VLDQQWBPseudo; +def VLD4d32Pseudo_UPD : VLDQQWBPseudo; + // ...with double-spaced registers (non-updating versions for disassembly only): def VLD4q8 : VLD4D<0b0001, 0b0000, "8">; def VLD4q16 : VLD4D<0b0001, 0b0100, "16">; @@ -358,10 +436,14 @@ def VLD4q8_UPD : VLD4DWB<0b0001, 0b0000, "8">; def VLD4q16_UPD : VLD4DWB<0b0001, 0b0100, "16">; def VLD4q32_UPD : VLD4DWB<0b0001, 0b1000, "32">; +def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo; +def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo; +def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo; + // ...alternate versions to be allocated odd register numbers: -def VLD4q8odd_UPD : VLD4DWB<0b0001, 
0b0000, "8">; -def VLD4q16odd_UPD : VLD4DWB<0b0001, 0b0100, "16">; -def VLD4q32odd_UPD : VLD4DWB<0b0001, 0b1000, "32">; +def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo; // VLD1LN : Vector Load (single element to one lane) // FIXME: Not yet implemented. @@ -486,6 +568,25 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32">; let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { +// Classes for VST* pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VSTQPseudo + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src), IIC_VST, "">; +class VSTQWBPseudo + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src), IIC_VST, + "$addr.addr = $wb">; +class VSTQQPseudo + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), IIC_VST, "">; +class VSTQQWBPseudo + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src), IIC_VST, + "$addr.addr = $wb">; +class VSTQQQQWBPseudo + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), IIC_VST, + "$addr.addr = $wb">; + // VST1 : Vector Store (multiple single elements) class VST1D<bits<4> op7_4, string Dt> : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST, @@ -505,6 +606,11 @@ def VST1q16 : VST1Q<0b0100, "16">; def VST1q32 : VST1Q<0b1000, "32">; def VST1q64 : VST1Q<0b1100, "64">; +def VST1q8Pseudo : VSTQPseudo; +def VST1q16Pseudo : VSTQPseudo; +def VST1q32Pseudo : VSTQPseudo; +def VST1q64Pseudo : VSTQPseudo; + // ...with address register writeback: class VST1DWB<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0111, op7_4, (outs GPR:$wb), @@ -525,6 +631,11 @@ def VST1q16_UPD : VST1QWB<0b0100, "16">; def VST1q32_UPD : VST1QWB<0b1000, "32">; def VST1q64_UPD : VST1QWB<0b1100, "64">; +def VST1q8Pseudo_UPD : VSTQWBPseudo; +def VST1q16Pseudo_UPD : VSTQWBPseudo; +def VST1q32Pseudo_UPD : VSTQWBPseudo; +def VST1q64Pseudo_UPD : VSTQWBPseudo; + // ...with 3 registers (some of these are only for the disassembler): class VST1D3<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0110, op7_4, (outs), @@ -547,6 +658,9 @@ def VST1d16T_UPD : VST1D3WB<0b0100, "16">; def VST1d32T_UPD : VST1D3WB<0b1000, "32">; def VST1d64T_UPD : VST1D3WB<0b1100, "64">; +def VST1d64TPseudo : VSTQQPseudo; +def VST1d64TPseudo_UPD : VSTQQWBPseudo; + // ...with 4 registers (some of these are only for the disassembler): class VST1D4<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0010, op7_4, (outs), @@ -570,6 +684,9 @@ def VST1d16Q_UPD : VST1D4WB<0b0100, "16">; def VST1d32Q_UPD : VST1D4WB<0b1000, "32">; def VST1d64Q_UPD : VST1D4WB<0b1100, "64">; +def VST1d64QPseudo : VSTQQPseudo; +def VST1d64QPseudo_UPD : VSTQQWBPseudo; + // VST2 : Vector Store (multiple 2-element structures) class VST2D<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs), @@ -589,6 +706,14 @@ def VST2q8 : VST2Q<0b0000, "8">; def VST2q16 : VST2Q<0b0100, "16">; def VST2q32 : VST2Q<0b1000, "32">; +def VST2d8Pseudo : VSTQPseudo; +def VST2d16Pseudo : VSTQPseudo; +def VST2d32Pseudo : VSTQPseudo; + +def VST2q8Pseudo : VSTQQPseudo; +def VST2q16Pseudo : VSTQQPseudo; +def VST2q32Pseudo : VSTQQPseudo; + // ...with address register writeback: class VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), @@ -610,6 +735,14 @@ def VST2q8_UPD : VST2QWB<0b0000, "8">; def VST2q16_UPD : VST2QWB<0b0100, 
"16">; def VST2q32_UPD : VST2QWB<0b1000, "32">; +def VST2d8Pseudo_UPD : VSTQWBPseudo; +def VST2d16Pseudo_UPD : VSTQWBPseudo; +def VST2d32Pseudo_UPD : VSTQWBPseudo; + +def VST2q8Pseudo_UPD : VSTQQWBPseudo; +def VST2q16Pseudo_UPD : VSTQQWBPseudo; +def VST2q32Pseudo_UPD : VSTQQWBPseudo; + // ...with double-spaced registers (for disassembly only): def VST2b8 : VST2D<0b1001, 0b0000, "8">; def VST2b16 : VST2D<0b1001, 0b0100, "16">; @@ -628,6 +761,10 @@ def VST3d8 : VST3D<0b0100, 0b0000, "8">; def VST3d16 : VST3D<0b0100, 0b0100, "16">; def VST3d32 : VST3D<0b0100, 0b1000, "32">; +def VST3d8Pseudo : VSTQQPseudo; +def VST3d16Pseudo : VSTQQPseudo; +def VST3d32Pseudo : VSTQQPseudo; + // ...with address register writeback: class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), @@ -640,6 +777,10 @@ def VST3d8_UPD : VST3DWB<0b0100, 0b0000, "8">; def VST3d16_UPD : VST3DWB<0b0100, 0b0100, "16">; def VST3d32_UPD : VST3DWB<0b0100, 0b1000, "32">; +def VST3d8Pseudo_UPD : VSTQQWBPseudo; +def VST3d16Pseudo_UPD : VSTQQWBPseudo; +def VST3d32Pseudo_UPD : VSTQQWBPseudo; + // ...with double-spaced registers (non-updating versions for disassembly only): def VST3q8 : VST3D<0b0101, 0b0000, "8">; def VST3q16 : VST3D<0b0101, 0b0100, "16">; @@ -648,10 +789,14 @@ def VST3q8_UPD : VST3DWB<0b0101, 0b0000, "8">; def VST3q16_UPD : VST3DWB<0b0101, 0b0100, "16">; def VST3q32_UPD : VST3DWB<0b0101, 0b1000, "32">; +def VST3q8Pseudo_UPD : VSTQQQQWBPseudo; +def VST3q16Pseudo_UPD : VSTQQQQWBPseudo; +def VST3q32Pseudo_UPD : VSTQQQQWBPseudo; + // ...alternate versions to be allocated odd register numbers: -def VST3q8odd_UPD : VST3DWB<0b0101, 0b0000, "8">; -def VST3q16odd_UPD : VST3DWB<0b0101, 0b0100, "16">; -def VST3q32odd_UPD : VST3DWB<0b0101, 0b1000, "32">; +def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo; +def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo; +def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo; // VST4 : Vector Store (multiple 4-element structures) class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -664,6 +809,10 @@ def VST4d8 : VST4D<0b0000, 0b0000, "8">; def VST4d16 : VST4D<0b0000, 0b0100, "16">; def VST4d32 : VST4D<0b0000, 0b1000, "32">; +def VST4d8Pseudo : VSTQQPseudo; +def VST4d16Pseudo : VSTQQPseudo; +def VST4d32Pseudo : VSTQQPseudo; + // ...with address register writeback: class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), @@ -676,6 +825,10 @@ def VST4d8_UPD : VST4DWB<0b0000, 0b0000, "8">; def VST4d16_UPD : VST4DWB<0b0000, 0b0100, "16">; def VST4d32_UPD : VST4DWB<0b0000, 0b1000, "32">; +def VST4d8Pseudo_UPD : VSTQQWBPseudo; +def VST4d16Pseudo_UPD : VSTQQWBPseudo; +def VST4d32Pseudo_UPD : VSTQQWBPseudo; + // ...with double-spaced registers (non-updating versions for disassembly only): def VST4q8 : VST4D<0b0001, 0b0000, "8">; def VST4q16 : VST4D<0b0001, 0b0100, "16">; @@ -684,10 +837,14 @@ def VST4q8_UPD : VST4DWB<0b0001, 0b0000, "8">; def VST4q16_UPD : VST4DWB<0b0001, 0b0100, "16">; def VST4q32_UPD : VST4DWB<0b0001, 0b1000, "32">; +def VST4q8Pseudo_UPD : VSTQQQQWBPseudo; +def VST4q16Pseudo_UPD : VSTQQQQWBPseudo; +def VST4q32Pseudo_UPD : VSTQQQQWBPseudo; + // ...alternate versions to be allocated odd register numbers: -def VST4q8odd_UPD : VST4DWB<0b0001, 0b0000, "8">; -def VST4q16odd_UPD : VST4DWB<0b0001, 0b0100, "16">; -def VST4q32odd_UPD : VST4DWB<0b0001, 0b1000, "32">; +def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo; +def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo; +def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo; // VST1LN : Vector Store 
(single element from one lane) // FIXME: Not yet implemented. @@ -879,6 +1036,15 @@ class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>; +// Narrow 2-register operations. +class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyD, ValueType TyQ, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$dst), + (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", + [(set DPR:$dst, (TyD (OpNode (TyQ QPR:$src))))]>; + // Narrow 2-register intrinsics. class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, @@ -888,14 +1054,14 @@ class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>; -// Long 2-register intrinsics (currently only used for VMOVL). -class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, - bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, - ValueType TyQ, ValueType TyD, Intrinsic IntOp> +// Long 2-register operations (currently only used for VMOVL). +class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$dst), (ins DPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src))))]>; + [(set QPR:$dst, (TyQ (OpNode (TyD DPR:$src))))]>; // 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register. class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt> @@ -1150,6 +1316,24 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, (ResTy (NEONvduplane (OpTy DPR_8:$src3), imm:$lane)))))))]>; +// Neon Intrinsic-Op instructions (VABA): double- and quad-register. +class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, Intrinsic IntOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", + [(set DPR:$dst, (Ty (OpNode DPR:$src1, + (Ty (IntOp (Ty DPR:$src2), (Ty DPR:$src3))))))]>; +class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, Intrinsic IntOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", + [(set QPR:$dst, (Ty (OpNode QPR:$src1, + (Ty (IntOp (Ty QPR:$src2), (Ty QPR:$src3))))))]>; + // Neon 3-argument intrinsics, both double- and quad-register. // The destination register is also used as the first source operand register. class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, @@ -1169,6 +1353,53 @@ class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2), (OpTy QPR:$src3))))]>; +// Long Multiply-Add/Sub operations. 
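As a reader's aid: the classes that follow match an add or subtract wrapped around a widening multiply, e.g. a vmlal.s16 selects as add(QPR, NEONvmulls(DPR, DPR)). A rough per-lane C++ model of what the v4i32/v4i16 instance computes (the function name is illustrative):

#include <cstdint>

// Per-lane model of VMLAL.S16: Acc[i] += sext32(A[i]) * sext32(B[i]).
// A 4-lane loop stands in for the v4i32/v4i16 vector types in the pattern.
static void vmlal_s16(int32_t Acc[4], const int16_t A[4], const int16_t B[4]) {
  for (int i = 0; i < 4; ++i)
    Acc[i] += (int32_t)A[i] * (int32_t)B[i];
}

The N3VLMulOpSL/N3VLMulOpSL16 variants below compute the same thing with the second multiplicand broadcast from a single lane (NEONvduplane).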
+class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", + [(set QPR:$dst, (OpNode (TyQ QPR:$src1), + (TyQ (MulOp (TyD DPR:$src2), + (TyD DPR:$src3)))))]>; +class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> + : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), + (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", + [(set QPR:$dst, + (OpNode (TyQ QPR:$src1), + (TyQ (MulOp (TyD DPR:$src2), + (TyD (NEONvduplane (TyD DPR_VFP2:$src3), + imm:$lane))))))]>; +class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> + : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), + (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", + [(set QPR:$dst, + (OpNode (TyQ QPR:$src1), + (TyQ (MulOp (TyD DPR:$src2), + (TyD (NEONvduplane (TyD DPR_8:$src3), + imm:$lane))))))]>; + +// Long Intrinsic-Op vector operations with explicit extend (VABAL). +class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp, + SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", + [(set QPR:$dst, (OpNode (TyQ QPR:$src1), + (TyQ (ExtOp (TyD (IntOp (TyD DPR:$src2), + (TyD DPR:$src3)))))))]>; + // Neon Long 3-argument intrinsic. The destination register is // a quad-register and is also used as the first source operand register. class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, @@ -1217,6 +1448,61 @@ class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, let isCommutable = Commutable; } +// Long 3-register operations. 
+class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src1, $src2", "", + [(set QPR:$dst, (TyQ (OpNode (TyD DPR:$src1), (TyD DPR:$src2))))]> { + let isCommutable = Commutable; +} +class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode> + : N3V<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + [(set QPR:$dst, + (TyQ (OpNode (TyD DPR:$src1), + (TyD (NEONvduplane (TyD DPR_VFP2:$src2),imm:$lane)))))]>; +class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode> + : N3V<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + [(set QPR:$dst, + (TyQ (OpNode (TyD DPR:$src1), + (TyD (NEONvduplane (TyD DPR_8:$src2), imm:$lane)))))]>; + +// Long 3-register operations with explicitly extended operands. +class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp, + bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src1, $src2", "", + [(set QPR:$dst, (OpNode (TyQ (ExtOp (TyD DPR:$src1))), + (TyQ (ExtOp (TyD DPR:$src2)))))]> { + let isCommutable = Commutable; +} + +// Long 3-register intrinsics with explicit extend (VABDL). +class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp, + bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, + OpcodeStr, Dt, "$dst, $src1, $src2", "", + [(set QPR:$dst, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$src1), + (TyD DPR:$src2))))))]> { + let isCommutable = Commutable; +} + // Long 3-register intrinsics. class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, @@ -1248,14 +1534,15 @@ class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, (OpTy (NEONvduplane (OpTy DPR_8:$src2), imm:$lane)))))]>; -// Wide 3-register intrinsics. -class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, - Intrinsic IntOp, bit Commutable> +// Wide 3-register operations. 
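For orientation: the "long" classes above widen both 64-bit operands before the operation, while the "wide" class below keeps the first operand at full width and widens only the second. In per-lane C++ terms (names illustrative):

#include <cstdint>

// Long, e.g. VADDL.U8:  Q[i] = zext16(D1[i]) + zext16(D2[i])
static void vaddl_u8(uint16_t Q[8], const uint8_t D1[8], const uint8_t D2[8]) {
  for (int i = 0; i < 8; ++i)
    Q[i] = (uint16_t)D1[i] + (uint16_t)D2[i];
}

// Wide, e.g. VADDW.U8:  Q[i] = Q1[i] + zext16(D2[i])
static void vaddw_u8(uint16_t Q[8], const uint16_t Q1[8], const uint8_t D2[8]) {
  for (int i = 0; i < 8; ++i)
    Q[i] = Q1[i] + (uint16_t)D2[i];
}

This mirrors the (OpNode (ExtOp $src1), (ExtOp $src2)) versus (OpNode $src1, (ExtOp $src2)) distinction in the N3VLExt and N3VW patterns.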
+class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, + SDNode OpNode, SDNode ExtOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), N3RegFrm, IIC_VSUBiD, OpcodeStr, Dt, "$dst, $src1, $src2", "", - [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> { + [(set QPR:$dst, (OpNode (TyQ QPR:$src1), + (TyQ (ExtOp (TyD DPR:$src2)))))]> { let isCommutable = Commutable; } @@ -1488,6 +1775,23 @@ multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, } +// Neon Narrowing 2-register vector operations, +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + def v8i8 : N2VN<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, OpNode>; + def v4i16 : N2VN<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, OpNode>; + def v2i32 : N2VN<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, OpNode>; +} + // Neon Narrowing 2-register vector intrinsics, // source operand element sizes of 16, 32 and 64 bits: multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, @@ -1508,14 +1812,14 @@ multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, // Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL). // source operand element sizes of 16, 32 and 64 bits: -multiclass N2VLInt_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4, - string OpcodeStr, string Dt, Intrinsic IntOp> { - def v8i16 : N2VLInt<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD, - OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>; - def v4i32 : N2VLInt<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, - OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; - def v2i64 : N2VLInt<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, - OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; +multiclass N2VL_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4, + string OpcodeStr, string Dt, SDNode OpNode> { + def v8i16 : N2VL<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode>; + def v4i32 : N2VL<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode>; + def v2i64 : N2VL<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode>; } @@ -1607,6 +1911,47 @@ multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4, } +// Neon Long 3-register vector operations. 
+ +multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + SDNode OpNode, bit Commutable = 0> { + def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, OpNode, Commutable>; + def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, OpNode, Commutable>; + def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, OpNode, Commutable>; +} + +multiclass N3VLSL_HS<bit op24, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + def v4i16 : N3VLSL16<op24, 0b01, op11_8, itin, OpcodeStr, + !strconcat(Dt, "16"), v4i32, v4i16, OpNode>; + def v2i32 : N3VLSL<op24, 0b10, op11_8, itin, OpcodeStr, + !strconcat(Dt, "32"), v2i64, v2i32, OpNode>; +} + +multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + SDNode OpNode, SDNode ExtOp, bit Commutable = 0> { + def v8i16 : N3VLExt<op24, op23, 0b00, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, OpNode, ExtOp, Commutable>; + def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, OpNode, ExtOp, Commutable>; + def v2i64 : N3VLExt<op24, op23, 0b10, op11_8, op4, itin32, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, OpNode, ExtOp, Commutable>; +} + // Neon Long 3-register vector intrinsics. // First with only element sizes of 16 and 32 bits: @@ -1643,21 +1988,36 @@ multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, v8i16, v8i8, IntOp, Commutable>; } +// ....with explicit extend (VABDL). 
+multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + Intrinsic IntOp, SDNode ExtOp, bit Commutable = 0> { + def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, IntOp, ExtOp, Commutable>; + def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, IntOp, ExtOp, Commutable>; + def v2i64 : N3VLIntExt<op24, op23, 0b10, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, IntOp, ExtOp, Commutable>; +} + // Neon Wide 3-register vector intrinsics, // source operand element sizes of 8, 16 and 32 bits: -multiclass N3VWInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, - string OpcodeStr, string Dt, - Intrinsic IntOp, bit Commutable = 0> { - def v8i16 : N3VWInt<op24, op23, 0b00, op11_8, op4, - OpcodeStr, !strconcat(Dt, "8"), - v8i16, v8i8, IntOp, Commutable>; - def v4i32 : N3VWInt<op24, op23, 0b01, op11_8, op4, - OpcodeStr, !strconcat(Dt, "16"), - v4i32, v4i16, IntOp, Commutable>; - def v2i64 : N3VWInt<op24, op23, 0b10, op11_8, op4, - OpcodeStr, !strconcat(Dt, "32"), - v2i64, v2i32, IntOp, Commutable>; +multiclass N3VW_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, + SDNode OpNode, SDNode ExtOp, bit Commutable = 0> { + def v8i16 : N3VW<op24, op23, 0b00, op11_8, op4, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, OpNode, ExtOp, Commutable>; + def v4i32 : N3VW<op24, op23, 0b01, op11_8, op4, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, OpNode, ExtOp, Commutable>; + def v2i64 : N3VW<op24, op23, 0b10, op11_8, op4, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, OpNode, ExtOp, Commutable>; } @@ -1700,6 +2060,29 @@ multiclass N3VMulOpSL_HS<bits<4> op11_8, mul, ShOp>; } +// Neon Intrinsic-Op vector operations, +// element sizes of 8, 16 and 32 bits: +multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, Intrinsic IntOp, + SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "8"), v8i8, IntOp, OpNode>; + def v4i16 : N3VDIntOp<op24, op23, 0b01, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp, OpNode>; + def v2i32 : N3VDIntOp<op24, op23, 0b10, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp, OpNode>; + + // 128-bit vector types. 
+ def v16i8 : N3VQIntOp<op24, op23, 0b00, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "8"), v16i8, IntOp, OpNode>; + def v8i16 : N3VQIntOp<op24, op23, 0b01, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "16"), v8i16, IntOp, OpNode>; + def v4i32 : N3VQIntOp<op24, op23, 0b10, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "32"), v4i32, IntOp, OpNode>; +} + // Neon 3-argument intrinsics, // element sizes of 8, 16 and 32 bits: multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, @@ -1723,6 +2106,29 @@ multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, } +// Neon Long Multiply-Op vector operations, +// element sizes of 8, 16 and 32 bits: +multiclass N3VLMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, SDNode MulOp, + SDNode OpNode> { + def v8i16 : N3VLMulOp<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr, + !strconcat(Dt, "8"), v8i16, v8i8, MulOp, OpNode>; + def v4i32 : N3VLMulOp<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr, + !strconcat(Dt, "16"), v4i32, v4i16, MulOp, OpNode>; + def v2i64 : N3VLMulOp<op24, op23, 0b10, op11_8, op4, itin32, OpcodeStr, + !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>; +} + +multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr, + string Dt, SDNode MulOp, SDNode OpNode> { + def v4i16 : N3VLMulOpSL16<op24, 0b01, op11_8, IIC_VMACi16D, OpcodeStr, + !strconcat(Dt,"16"), v4i32, v4i16, MulOp, OpNode>; + def v2i32 : N3VLMulOpSL<op24, 0b10, op11_8, IIC_VMACi32D, OpcodeStr, + !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>; +} + + // Neon Long 3-argument intrinsics. // First with only element sizes of 16 and 32 bits: @@ -1752,6 +2158,21 @@ multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>; } +// ....with explicit extend (VABAL). 
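
The multiply-op multiclasses above express VMLAL/VMLSL as an add or sub wrapped around the NEONvmulls/NEONvmullu widening-multiply node, and the IntExtOp variant below does the same for VABAL with an explicit zext of the vabd intrinsic. A per-lane sketch (illustrative names; wrapping arithmetic as in the vector ops):

#include <cstdint>

// VMLAL.S16: Qd = add(Qd, vmull.s16(Dn, Dm)) per lane, modulo 2^32.
int32_t vmlal_s16_lane(int32_t acc, int16_t a, int16_t b) {
  return (int32_t)((uint32_t)acc + (uint32_t)((int32_t)a * (int32_t)b));
}

// VMLSL.S16: Qd = sub(Qd, vmull.s16(Dn, Dm)) per lane, modulo 2^32.
int32_t vmlsl_s16_lane(int32_t acc, int16_t a, int16_t b) {
  return (int32_t)((uint32_t)acc - (uint32_t)((int32_t)a * (int32_t)b));
}

// VABAL.U8: Qd = add(Qd, zext(vabd(Dn, Dm))) per lane; the absolute
// difference is non-negative, so zext is correct for both signednesses.
uint16_t vabal_u8_lane(uint16_t acc, uint8_t a, uint8_t b) {
  return acc + (uint16_t)(a > b ? a - b : b - a);
}
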
+multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + Intrinsic IntOp, SDNode ExtOp, SDNode OpNode> { + def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, + IntOp, ExtOp, OpNode>; + def v4i32 : N3VLIntExtOp<op24, op23, 0b01, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, + IntOp, ExtOp, OpNode>; + def v2i64 : N3VLIntExtOp<op24, op23, 0b10, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, + IntOp, ExtOp, OpNode>; +} + // Neon 2-register vector intrinsics, // element sizes of 8, 16 and 32 bits: @@ -1996,13 +2417,13 @@ def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32", def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32", v4f32, v4f32, fadd, 1>; // VADDL : Vector Add Long (Q = D + D) -defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, - "vaddl", "s", int_arm_neon_vaddls, 1>; -defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, - "vaddl", "u", int_arm_neon_vaddlu, 1>; +defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, + "vaddl", "s", add, sext, 1>; +defm VADDLu : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, + "vaddl", "u", add, zext, 1>; // VADDW : Vector Add Wide (Q = Q + D) -defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw", "s", int_arm_neon_vaddws, 0>; -defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw", "u", int_arm_neon_vaddwu, 0>; +defm VADDWs : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>; +defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>; // VHADD : Vector Halving Add defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, @@ -2113,16 +2534,14 @@ def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D) -defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, - "vmull", "s", int_arm_neon_vmulls, 1>; -defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, - "vmull", "u", int_arm_neon_vmullu, 1>; +defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, + "vmull", "s", NEONvmulls, 1>; +defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, + "vmull", "u", NEONvmullu, 1>; def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8", v8i16, v8i8, int_arm_neon_vmullp, 1>; -defm VMULLsls : N3VLIntSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", - int_arm_neon_vmulls>; -defm VMULLslu : N3VLIntSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", - int_arm_neon_vmullu>; +defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>; +defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>; // VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D) defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D, @@ -2172,13 +2591,13 @@ def : Pat<(v4f32 (fadd (v4f32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VMLAL : Vector Multiply Accumulate Long (Q += D * D) -defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, - "vmlal", "s", int_arm_neon_vmlals>; -defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, - "vmlal", "u", int_arm_neon_vmlalu>; +defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlal", "s", NEONvmulls, add>; +defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlal", "u", NEONvmullu, add>; -defm VMLALsls : 
N3VLInt3SL_HS<0, 0b0010, "vmlal", "s", int_arm_neon_vmlals>; -defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal", "u", int_arm_neon_vmlalu>; +defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>; +defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>; // VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D) defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, @@ -2224,13 +2643,13 @@ def : Pat<(v4f32 (fsub (v4f32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VMLSL : Vector Multiply Subtract Long (Q -= D * D) -defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, - "vmlsl", "s", int_arm_neon_vmlsls>; -defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, - "vmlsl", "u", int_arm_neon_vmlslu>; +defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlsl", "s", NEONvmulls, sub>; +defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlsl", "u", NEONvmullu, sub>; -defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl", "s", int_arm_neon_vmlsls>; -defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl", "u", int_arm_neon_vmlslu>; +defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>; +defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>; // VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D) defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, @@ -2247,13 +2666,13 @@ def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32", def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32", v4f32, v4f32, fsub, 0>; // VSUBL : Vector Subtract Long (Q = D - D) -defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, - "vsubl", "s", int_arm_neon_vsubls, 1>; -defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, - "vsubl", "u", int_arm_neon_vsublu, 1>; +defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, + "vsubl", "s", sub, sext, 0>; +defm VSUBLu : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, + "vsubl", "u", sub, zext, 0>; // VSUBW : Vector Subtract Wide (Q = Q - D) -defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw", "s", int_arm_neon_vsubws, 0>; -defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw", "u", int_arm_neon_vsubwu, 0>; +defm VSUBWs : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>; +defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>; // VHSUB : Vector Halving Subtract defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, @@ -2469,32 +2888,32 @@ def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, // VABD : Vector Absolute Difference defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vabd", "s", int_arm_neon_vabds, 0>; + "vabd", "s", int_arm_neon_vabds, 1>; defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vabd", "u", int_arm_neon_vabdu, 0>; + "vabd", "u", int_arm_neon_vabdu, 1>; def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND, - "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 0>; + "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>; def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ, - "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 0>; + "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>; // VABDL : Vector Absolute Difference Long (Q = | D - D |) -defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vabdl", "s", 
int_arm_neon_vabdls, 0>; -defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vabdl", "u", int_arm_neon_vabdlu, 0>; +defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, + "vabdl", "s", int_arm_neon_vabds, zext, 1>; +defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, + "vabdl", "u", int_arm_neon_vabdu, zext, 1>; // VABA : Vector Absolute Difference and Accumulate -defm VABAs : N3VInt3_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, - "vaba", "s", int_arm_neon_vabas>; -defm VABAu : N3VInt3_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ, - "vaba", "u", int_arm_neon_vabau>; +defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, + "vaba", "s", int_arm_neon_vabds, add>; +defm VABAu : N3VIntOp_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ, + "vaba", "u", int_arm_neon_vabdu, add>; // VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |) -defm VABALs : N3VLInt3_QHS<0,1,0b0101,0, IIC_VABAD, IIC_VABAD, - "vabal", "s", int_arm_neon_vabals>; -defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, IIC_VABAD, IIC_VABAD, - "vabal", "u", int_arm_neon_vabalu>; +defm VABALs : N3VLIntExtOp_QHS<0,1,0b0101,0, IIC_VABAD, + "vabal", "s", int_arm_neon_vabds, zext, add>; +defm VABALu : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD, + "vabal", "u", int_arm_neon_vabdu, zext, add>; // Vector Maximum and Minimum. @@ -3113,8 +3532,8 @@ def VDUPfqf : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 1, 0, [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>; // VMOVN : Vector Narrowing Move -defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVD, - "vmovn", "i", int_arm_neon_vmovn>; +defm VMOVN : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVD, + "vmovn", "i", trunc>; // VQMOVN : Vector Saturating Narrowing Move defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD, "vqmovn", "s", int_arm_neon_vqmovns>; @@ -3123,10 +3542,8 @@ defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD, defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD, "vqmovun", "s", int_arm_neon_vqmovnsu>; // VMOVL : Vector Lengthening Move -defm VMOVLs : N2VLInt_QHS<0b01,0b10100,0,1, "vmovl", "s", - int_arm_neon_vmovls>; -defm VMOVLu : N2VLInt_QHS<0b11,0b10100,0,1, "vmovl", "u", - int_arm_neon_vmovlu>; +defm VMOVLs : N2VL_QHS<0b01,0b10100,0,1, "vmovl", "s", sext>; +defm VMOVLu : N2VL_QHS<0b11,0b10100,0,1, "vmovl", "u", zext>; // Vector Conversions. diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index bc0790dccbb5f..a13ff12327491 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -221,9 +221,13 @@ def tADDrPCi : T1I<(outs tGPR:$dst), (ins t_imm_s4:$rhs), IIC_iALUi, T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 // ADD rd, sp, #imm8 +// This is rematerializable, which is particularly useful for taking the +// address of locals. +let isReMaterializable = 1 in { def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, t_imm_s4:$rhs), IIC_iALUi, "add\t$dst, $sp, $rhs", []>, T1Encoding<{1,0,1,0,1,?}>; // A6.2 & A8.6.8 +} // ADD sp, sp, #imm7 def tADDspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi, @@ -251,19 +255,6 @@ def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, let Inst{2-0} = 0b101; } -// Pseudo instruction that will expand into a tSUBspi + a copy. -let usesCustomInserter = 1 in { // Expanded after instruction selection. 
-def tSUBspi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), - NoItinerary, "${:comment} sub\t$dst, $rhs", []>; - -def tADDspr_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), - NoItinerary, "${:comment} add\t$dst, $rhs", []>; - -let Defs = [CPSR] in -def tANDsp : PseudoInst<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), - NoItinerary, "${:comment} and\t$dst, $rhs", []>; -} // usesCustomInserter - //===----------------------------------------------------------------------===// // Control Flow Instructions. // @@ -378,7 +369,7 @@ let isBranch = 1, isTerminator = 1 in { def tBR_JTr : T1JTI<(outs), (ins tGPR:$target, jtblock_operand:$jt, i32imm:$id), - IIC_Br, "mov\tpc, $target\n\t.align\t2\n$jt", + IIC_Br, "mov\tpc, $target\n\t.align\t2$jt", [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]>, Encoding16 { let Inst{15-7} = 0b010001101; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index bbe675e81ab1d..6ba0a44be4700 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -32,7 +32,7 @@ def t2_so_reg : Operand<i32>, // reg imm ComplexPattern<i32, 2, "SelectT2ShifterOperandReg", [shl,srl,sra,rotr]> { let PrintMethod = "printT2SOOperand"; - let MIOperandInfo = (ops GPR, i32imm); + let MIOperandInfo = (ops rGPR, i32imm); } // t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value @@ -51,10 +51,7 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ // represented in the imm field in the same 12-bit form that they are encoded // into t2_so_imm instructions: the 8-bit immediate is the least significant // bits [bits 0-7], the 4-bit shift/splat amount is the next 4 bits [bits 8-11]. -def t2_so_imm : Operand<i32>, - PatLeaf<(imm), [{ - return ARM_AM::getT2SOImmVal((uint32_t)N->getZExtValue()) != -1; -}]>; +def t2_so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_t2_so_imm(N); }]>; // t2_so_imm_not - Match an immediate that is a complement // of a t2_so_imm. 
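
The rewritten t2_so_imm PatLeaf defers to Pred_t2_so_imm, which wraps the same ARM_AM::getT2SOImmVal test that the removed inline predicate called. For reference, a standalone paraphrase of the Thumb-2 modified-immediate rule that test implements (a sketch, not the LLVM routine):

#include <cstdint>

bool isThumb2ModifiedImm(uint32_t V) {
  if (V <= 0xFF) return true;                     // plain 8-bit value
  uint32_t B = V & 0xFF;
  if (V == ((B << 16) | B)) return true;          // 0x00XY00XY splat
  if (V == ((B << 24) | (B << 8))) return true;   // 0xXY00XY00 splat
  if (V == B * 0x01010101u) return true;          // 0xXYXYXYXY splat
  for (unsigned Rot = 8; Rot < 32; ++Rot) {       // ROR of 0b1bcdefgh
    uint32_t R = (V << Rot) | (V >> (32 - Rot));
    if (R <= 0xFF && (R & 0x80)) return true;
  }
  return false;
}
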
@@ -162,7 +159,7 @@ def t2am_imm8s4_offset : Operand<i32> { def t2addrmode_so_reg : Operand<i32>, ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> { let PrintMethod = "printT2AddrModeSoRegOperand"; - let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); + let MIOperandInfo = (ops GPR:$base, rGPR:$offsreg, i32imm:$offsimm); } @@ -176,9 +173,9 @@ def t2addrmode_so_reg : Operand<i32>, multiclass T2I_un_irs<bits<4> opcod, string opc, PatFrag opnode, bit Cheap = 0, bit ReMat = 0> { // shifted imm - def i : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi, + def i : T2sI<(outs rGPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi, opc, "\t$dst, $src", - [(set GPR:$dst, (opnode t2_so_imm:$src))]> { + [(set rGPR:$dst, (opnode t2_so_imm:$src))]> { let isAsCheapAsAMove = Cheap; let isReMaterializable = ReMat; let Inst{31-27} = 0b11110; @@ -189,9 +186,9 @@ multiclass T2I_un_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{15} = 0; } // register - def r : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr, + def r : T2sI<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVr, opc, ".w\t$dst, $src", - [(set GPR:$dst, (opnode GPR:$src))]> { + [(set rGPR:$dst, (opnode rGPR:$src))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; @@ -202,9 +199,9 @@ multiclass T2I_un_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{5-4} = 0b00; // type } // shifted register - def s : T2sI<(outs GPR:$dst), (ins t2_so_reg:$src), IIC_iMOVsi, + def s : T2sI<(outs rGPR:$dst), (ins t2_so_reg:$src), IIC_iMOVsi, opc, ".w\t$dst, $src", - [(set GPR:$dst, (opnode t2_so_reg:$src))]> { + [(set rGPR:$dst, (opnode t2_so_reg:$src))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; @@ -217,11 +214,11 @@ multiclass T2I_un_irs<bits<4> opcod, string opc, PatFrag opnode, /// binary operation that produces a value. These are predicable and can be /// changed to modify CPSR. 
multiclass T2I_bin_irs<bits<4> opcod, string opc, PatFrag opnode,
- bit Commutable = 0, string wide =""> {
+ bit Commutable = 0, string wide = ""> {
 // shifted imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
+ def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
 opc, "\t$dst, $lhs, $rhs",
- [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> {
+ [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_imm:$rhs))]> {
 let Inst{31-27} = 0b11110;
 let Inst{25} = 0;
 let Inst{24-21} = opcod;
@@ -229,9 +226,9 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc, PatFrag opnode,
 let Inst{15} = 0;
 }
 // register
- def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, rGPR:$rhs), IIC_iALUr,
 opc, !strconcat(wide, "\t$dst, $lhs, $rhs"),
- [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> {
+ [(set rGPR:$dst, (opnode rGPR:$lhs, rGPR:$rhs))]> {
 let isCommutable = Commutable;
 let Inst{31-27} = 0b11101;
 let Inst{26-25} = 0b01;
@@ -242,9 +239,9 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc, PatFrag opnode,
 let Inst{5-4} = 0b00; // type
 }
 // shifted register
- def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
+ def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
 opc, !strconcat(wide, "\t$dst, $lhs, $rhs"),
- [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> {
+ [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_reg:$rhs))]> {
 let Inst{31-27} = 0b11101;
 let Inst{26-25} = 0b01;
 let Inst{24-21} = opcod;
@@ -259,23 +256,35 @@ multiclass T2I_bin_w_irs<bits<4> opcod, string opc, PatFrag opnode,
 T2I_bin_irs<opcod, opc, opnode, Commutable, ".w">;
 
-/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are
-/// reversed. It doesn't define the 'rr' form since it's handled by its
-/// T2I_bin_irs counterpart.
-multiclass T2I_rbin_is<bits<4> opcod, string opc, PatFrag opnode> {
+/// T2I_rbin_irs - Same as T2I_bin_irs except the order of operands is
+/// reversed. The 'rr' form is only defined for the disassembler; for codegen
+/// it is equivalent to the T2I_bin_irs counterpart.
+multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
 // shifted imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs), IIC_iALUi,
+ def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_imm:$lhs), IIC_iALUi,
 opc, ".w\t$dst, $rhs, $lhs",
- [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]> {
+ [(set rGPR:$dst, (opnode t2_so_imm:$lhs, rGPR:$rhs))]> {
 let Inst{31-27} = 0b11110;
 let Inst{25} = 0;
 let Inst{24-21} = opcod;
 let Inst{20} = ?; // The S bit.
 let Inst{15} = 0;
 }
+ // register
+ def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, rGPR:$lhs), IIC_iALUr,
+ opc, "\t$dst, $rhs, $lhs",
+ [/* For disassembly only; pattern left blank */]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ let Inst{20} = ?; // The S bit.
+ let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } // shifted register - def rs : T2sI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi, + def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi, opc, "\t$dst, $rhs, $lhs", - [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]> { + [(set rGPR:$dst, (opnode t2_so_reg:$lhs, rGPR:$rhs))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; @@ -289,9 +298,9 @@ let Defs = [CPSR] in { multiclass T2I_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, + def ri : T2I<(outs rGPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs", - [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> { + [(set rGPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = opcod; @@ -299,9 +308,9 @@ multiclass T2I_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{15} = 0; } // register - def rr : T2I<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + def rr : T2I<(outs rGPR:$dst), (ins GPR:$lhs, rGPR:$rhs), IIC_iALUr, !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs", - [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> { + [(set rGPR:$dst, (opnode GPR:$lhs, rGPR:$rhs))]> { let isCommutable = Commutable; let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; @@ -312,9 +321,9 @@ multiclass T2I_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + def rs : T2I<(outs rGPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs", - [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> { + [(set rGPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = opcod; @@ -328,9 +337,12 @@ multiclass T2I_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode, multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, + // The register-immediate version is re-materializable. This is useful + // in particular for taking the address of a local. + let isReMaterializable = 1 in { + def ri : T2sI<(outs rGPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, opc, ".w\t$dst, $lhs, $rhs", - [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> { + [(set rGPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24} = 1; @@ -338,10 +350,11 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode, let Inst{20} = 0; // The S bit. 
let Inst{15} = 0; } + } // 12-bit imm - def ri12 : T2I<(outs GPR:$dst), (ins GPR:$lhs, imm0_4095:$rhs), IIC_iALUi, + def ri12 : T2I<(outs rGPR:$dst), (ins GPR:$lhs, imm0_4095:$rhs), IIC_iALUi, !strconcat(opc, "w"), "\t$dst, $lhs, $rhs", - [(set GPR:$dst, (opnode GPR:$lhs, imm0_4095:$rhs))]> { + [(set rGPR:$dst, (opnode GPR:$lhs, imm0_4095:$rhs))]> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24} = 0; @@ -350,9 +363,9 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode, let Inst{15} = 0; } // register - def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + def rr : T2sI<(outs rGPR:$dst), (ins GPR:$lhs, rGPR:$rhs), IIC_iALUr, opc, ".w\t$dst, $lhs, $rhs", - [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> { + [(set rGPR:$dst, (opnode GPR:$lhs, rGPR:$rhs))]> { let isCommutable = Commutable; let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; @@ -364,9 +377,9 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode, let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + def rs : T2sI<(outs rGPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, opc, ".w\t$dst, $lhs, $rhs", - [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> { + [(set rGPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24} = 1; @@ -382,9 +395,9 @@ let Uses = [CPSR] in { multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, + def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, opc, "\t$dst, $lhs, $rhs", - [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>, + [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_imm:$rhs))]>, Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; @@ -393,9 +406,9 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{15} = 0; } // register - def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr, + def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, rGPR:$rhs), IIC_iALUr, opc, ".w\t$dst, $lhs, $rhs", - [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>, + [(set rGPR:$dst, (opnode rGPR:$lhs, rGPR:$rhs))]>, Requires<[IsThumb2]> { let isCommutable = Commutable; let Inst{31-27} = 0b11101; @@ -407,9 +420,9 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, + def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi, opc, ".w\t$dst, $lhs, $rhs", - [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>, + [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_reg:$rhs))]>, Requires<[IsThumb2]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; @@ -423,9 +436,9 @@ let Defs = [CPSR] in { multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, + def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_imm:$rhs), IIC_iALUi, opc, "\t$dst, $lhs, $rhs", - [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>, + [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_imm:$rhs))]>, Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; @@ -434,9 +447,9 @@ multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{15} = 0; } // 
register
- def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+ def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, rGPR:$rhs), IIC_iALUr,
 opc, ".w\t$dst, $lhs, $rhs",
- [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>,
+ [(set rGPR:$dst, (opnode rGPR:$lhs, rGPR:$rhs))]>,
 Requires<[IsThumb2]> {
 let isCommutable = Commutable;
 let Inst{31-27} = 0b11101;
@@ -448,9 +461,9 @@ multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode,
 let Inst{5-4} = 0b00; // type
 }
 // shifted register
- def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
+ def rs : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
 opc, ".w\t$dst, $lhs, $rhs",
- [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>,
+ [(set rGPR:$dst, (opnode rGPR:$lhs, t2_so_reg:$rhs))]>,
 Requires<[IsThumb2]> {
 let Inst{31-27} = 0b11101;
 let Inst{26-25} = 0b01;
@@ -461,13 +474,14 @@ multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode,
 }
 }
 
-/// T2I_rbin_s_is - Same as T2I_rbin_is except sets 's' bit.
+/// T2I_rbin_s_is - Same as T2I_rbin_irs except it sets the 's' bit; the
+/// register version is not needed since this is only for codegen.
 let Defs = [CPSR] in {
 multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> {
 // shifted imm
- def ri : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs), IIC_iALUi,
+ def ri : T2I<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_imm:$lhs), IIC_iALUi,
 !strconcat(opc, "s"), ".w\t$dst, $rhs, $lhs",
- [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]> {
+ [(set rGPR:$dst, (opnode t2_so_imm:$lhs, rGPR:$rhs))]> {
 let Inst{31-27} = 0b11110;
 let Inst{25} = 0;
 let Inst{24-21} = opcod;
@@ -475,9 +489,9 @@ multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> {
 let Inst{15} = 0;
 }
 // shifted register
- def rs : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi,
+ def rs : T2I<(outs rGPR:$dst), (ins rGPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi,
 !strconcat(opc, "s"), "\t$dst, $rhs, $lhs",
- [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]> {
+ [(set rGPR:$dst, (opnode t2_so_reg:$lhs, rGPR:$rhs))]> {
 let Inst{31-27} = 0b11101;
 let Inst{26-25} = 0b01;
 let Inst{24-21} = opcod;
@@ -490,18 +504,18 @@ multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> {
 // rotate operation that produces a value.
 multiclass T2I_sh_ir<bits<2> opcod, string opc, PatFrag opnode> {
 // 5-bit imm
- def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
+ def ri : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
 opc, ".w\t$dst, $lhs, $rhs",
- [(set GPR:$dst, (opnode GPR:$lhs, imm1_31:$rhs))]> {
+ [(set rGPR:$dst, (opnode rGPR:$lhs, imm1_31:$rhs))]> {
 let Inst{31-27} = 0b11101;
 let Inst{26-21} = 0b010010;
 let Inst{19-16} = 0b1111; // Rn
 let Inst{5-4} = opcod;
 }
 // register
- def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iMOVsr,
+ def rr : T2sI<(outs rGPR:$dst), (ins rGPR:$lhs, rGPR:$rhs), IIC_iMOVsr,
 opc, ".w\t$dst, $lhs, $rhs",
- [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> {
+ [(set rGPR:$dst, (opnode rGPR:$lhs, rGPR:$rhs))]> {
 let Inst{31-27} = 0b11111;
 let Inst{26-23} = 0b0100;
 let Inst{22-21} = opcod;
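
Both rbin multiclasses keep the DAG operands swapped relative to the assembly operand order: the immediate or shifted operand is the left-hand side of the subtraction. A one-line scalar reminder of what RSB computes (illustrative, not compiler code):

#include <cstdint>

// rsb rd, rn, #imm computes imm - rn (operands reversed versus sub),
// which is why the PatFrag operands appear swapped in the patterns.
uint32_t rsb_ri(uint32_t rn, uint32_t imm) { return imm - rn; }
// The common negation idiom: rsb rd, rn, #0 yields 0 - rn.
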
@@ -513,7 +527,7 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, PatFrag opnode> {
 /// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
 /// patterns. Similar to T2I_bin_irs except the instruction does not produce
 /// an explicit result; it only implicitly sets CPSR.
-let Defs = [CPSR] in {
+let isCompare = 1, Defs = [CPSR] in {
 multiclass T2I_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> {
 // shifted imm
 def ri : T2I<(outs), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iCMPi,
@@ -527,9 +541,9 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> {
 let Inst{11-8} = 0b1111; // Rd
 }
 // register
- def rr : T2I<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr,
+ def rr : T2I<(outs), (ins GPR:$lhs, rGPR:$rhs), IIC_iCMPr,
 opc, ".w\t$lhs, $rhs",
- [(opnode GPR:$lhs, GPR:$rhs)]> {
+ [(opnode GPR:$lhs, rGPR:$rhs)]> {
 let Inst{31-27} = 0b11101;
 let Inst{26-25} = 0b01;
 let Inst{24-21} = opcod;
@@ -639,9 +653,9 @@ multiclass T2I_st<bits<2> opcod, string opc, PatFrag opnode> {
 /// T2I_unary_rrot - A unary operation with two forms: one whose operand is a
 /// register and one whose operand is a register rotated by 8/16/24.
 multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> {
- def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+ def r : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr,
 opc, ".w\t$dst, $src",
- [(set GPR:$dst, (opnode GPR:$src))]> {
+ [(set rGPR:$dst, (opnode rGPR:$src))]> {
 let Inst{31-27} = 0b11111;
 let Inst{26-23} = 0b0100;
 let Inst{22-20} = opcod;
@@ -650,9 +664,9 @@ multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> {
 let Inst{7} = 1;
 let Inst{5-4} = 0b00; // rotate
 }
- def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi,
+ def r_rot : T2I<(outs rGPR:$dst), (ins rGPR:$src, i32imm:$rot), IIC_iUNAsi,
 opc, ".w\t$dst, $src, ror $rot",
- [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]> {
+ [(set rGPR:$dst, (opnode (rotr rGPR:$src, rot_imm:$rot)))]> {
 let Inst{31-27} = 0b11111;
 let Inst{26-23} = 0b0100;
 let Inst{22-20} = opcod;
@@ -665,9 +679,9 @@ multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> {
 
// UXTB16 - Requires T2ExtractPack, does not need the .w qualifier.
 multiclass T2I_unary_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> {
- def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+ def r : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr,
 opc, "\t$dst, $src",
- [(set GPR:$dst, (opnode GPR:$src))]>,
+ [(set rGPR:$dst, (opnode rGPR:$src))]>,
 Requires<[HasT2ExtractPack]> {
 let Inst{31-27} = 0b11111;
 let Inst{26-23} = 0b0100;
@@ -677,9 +691,9 @@ multiclass T2I_unary_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> {
 let Inst{7} = 1;
 let Inst{5-4} = 0b00; // rotate
 }
- def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi,
+ def r_rot : T2I<(outs rGPR:$dst), (ins rGPR:$src, i32imm:$rot), IIC_iUNAsi,
 opc, "\t$dst, $src, ror $rot",
- [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>,
+ [(set rGPR:$dst, (opnode (rotr rGPR:$src, rot_imm:$rot)))]>,
 Requires<[HasT2ExtractPack]> {
 let Inst{31-27} = 0b11111;
 let Inst{26-23} = 0b0100;
@@ -694,7 +708,7 @@ multiclass T2I_unary_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> {
 
// SXTB16 - Requires T2ExtractPack, does not need the .w qualifier, no pattern
// supported yet.
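
The uxtb16 patterns above combine an optional rotate with an 0x00FF00FF mask, and the binary rrot multiclasses further down do the analogous add of a zero-extended byte; the sxtb16 variant defined next is disassembly-only with no pattern yet. A scalar sketch of the two patterned shapes (helper names are illustrative):

#include <cstdint>

static uint32_t rotr32(uint32_t V, unsigned R) {
  return R ? (V >> R) | (V << (32 - R)) : V;
}

// UXTB16{, ror #8/16/24}: the (and (rotr Src, rot), 0x00FF00FF) pattern;
// keeps bytes 0 and 2 of the rotated source.
uint32_t uxtb16(uint32_t Src, unsigned Rot) {
  return rotr32(Src, Rot) & 0x00FF00FFu;
}

// UXTAB: the (add LHS, (and (rotr RHS, rot), 0x00FF)) shape from
// T2I_bin_rrot; adds the zero-extended low byte of the rotated RHS.
uint32_t uxtab(uint32_t LHS, uint32_t RHS, unsigned Rot) {
  return LHS + (rotr32(RHS, Rot) & 0xFFu);
}
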
multiclass T2I_unary_rrot_sxtb16<bits<3> opcod, string opc> { - def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, + def r : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, opc, "\t$dst, $src", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; @@ -704,7 +718,7 @@ multiclass T2I_unary_rrot_sxtb16<bits<3> opcod, string opc> { let Inst{7} = 1; let Inst{5-4} = 0b00; // rotate } - def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi, + def r_rot : T2I<(outs rGPR:$dst), (ins rGPR:$src, i32imm:$rot), IIC_iUNAsi, opc, "\t$dst, $src, ror $rot", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; @@ -719,9 +733,9 @@ multiclass T2I_unary_rrot_sxtb16<bits<3> opcod, string opc> { /// T2I_bin_rrot - A binary operation with two forms: one whose operand is a /// register and one whose operand is a register rotated by 8/16/24. multiclass T2I_bin_rrot<bits<3> opcod, string opc, PatFrag opnode> { - def rr : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), IIC_iALUr, + def rr : T2I<(outs rGPR:$dst), (ins rGPR:$LHS, rGPR:$RHS), IIC_iALUr, opc, "\t$dst, $LHS, $RHS", - [(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>, + [(set rGPR:$dst, (opnode rGPR:$LHS, rGPR:$RHS))]>, Requires<[HasT2ExtractPack]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; @@ -730,10 +744,10 @@ multiclass T2I_bin_rrot<bits<3> opcod, string opc, PatFrag opnode> { let Inst{7} = 1; let Inst{5-4} = 0b00; // rotate } - def rr_rot : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot), + def rr_rot : T2I<(outs rGPR:$dst), (ins rGPR:$LHS, rGPR:$RHS, i32imm:$rot), IIC_iALUsr, opc, "\t$dst, $LHS, $RHS, ror $rot", - [(set GPR:$dst, (opnode GPR:$LHS, - (rotr GPR:$RHS, rot_imm:$rot)))]>, + [(set rGPR:$dst, (opnode rGPR:$LHS, + (rotr rGPR:$RHS, rot_imm:$rot)))]>, Requires<[HasT2ExtractPack]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; @@ -747,7 +761,7 @@ multiclass T2I_bin_rrot<bits<3> opcod, string opc, PatFrag opnode> { // DO variant - disassembly only, no pattern multiclass T2I_bin_rrot_DO<bits<3> opcod, string opc> { - def rr : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), IIC_iALUr, + def rr : T2I<(outs rGPR:$dst), (ins rGPR:$LHS, rGPR:$RHS), IIC_iALUr, opc, "\t$dst, $LHS, $RHS", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; @@ -756,7 +770,7 @@ multiclass T2I_bin_rrot_DO<bits<3> opcod, string opc> { let Inst{7} = 1; let Inst{5-4} = 0b00; // rotate } - def rr_rot : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot), + def rr_rot : T2I<(outs rGPR:$dst), (ins rGPR:$LHS, rGPR:$RHS, i32imm:$rot), IIC_iALUsr, opc, "\t$dst, $LHS, $RHS, ror $rot", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; @@ -779,8 +793,8 @@ multiclass T2I_bin_rrot_DO<bits<3> opcod, string opc> { // assembler. 
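
The recurring GPR to rGPR substitution in these definitions, including the t2LEApcrel forms that follow, restricts register operands away from SP and PC, since most 32-bit Thumb-2 data-processing encodings are UNPREDICTABLE with r13 or r15 in those slots. Informally, assuming the usual ARM register numbering:

// rGPR, roughly: any general-purpose register except SP (r13) and PC (r15).
bool isRestrictedGPR(unsigned RegNo) { return RegNo != 13 && RegNo != 15; }
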
let neverHasSideEffects = 1 in { let isReMaterializable = 1 in -def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, - "adr$p.w\t$dst, #$label", []> { +def t2LEApcrel : T2XI<(outs rGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, + "adr${p}.w\t$dst, #$label", []> { let Inst{31-27} = 0b11110; let Inst{25-24} = 0b10; // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE) @@ -790,9 +804,9 @@ def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, let Inst{15} = 0; } } // neverHasSideEffects -def t2LEApcrelJT : T2XI<(outs GPR:$dst), +def t2LEApcrelJT : T2XI<(outs rGPR:$dst), (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi, - "adr$p.w\t$dst, #${label}_${id}", []> { + "adr${p}.w\t$dst, #${label}_${id}", []> { let Inst{31-27} = 0b11110; let Inst{25-24} = 0b10; // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE) @@ -866,9 +880,9 @@ def t2SUBrSPs : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), } // Signed and unsigned division on v7-M -def t2SDIV : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALUi, +def t2SDIV : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iALUi, "sdiv", "\t$dst, $a, $b", - [(set GPR:$dst, (sdiv GPR:$a, GPR:$b))]>, + [(set rGPR:$dst, (sdiv rGPR:$a, rGPR:$b))]>, Requires<[HasDivide]> { let Inst{31-27} = 0b11111; let Inst{26-21} = 0b011100; @@ -877,9 +891,9 @@ def t2SDIV : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALUi, let Inst{7-4} = 0b1111; } -def t2UDIV : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALUi, +def t2UDIV : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iALUi, "udiv", "\t$dst, $a, $b", - [(set GPR:$dst, (udiv GPR:$a, GPR:$b))]>, + [(set rGPR:$dst, (udiv rGPR:$a, rGPR:$b))]>, Requires<[HasDivide]> { let Inst{31-27} = 0b11111; let Inst{26-21} = 0b011101; @@ -888,17 +902,6 @@ def t2UDIV : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALUi, let Inst{7-4} = 0b1111; } -// Pseudo instruction that will expand into a t2SUBrSPi + a copy. -let usesCustomInserter = 1 in { // Expanded after instruction selection. -def t2SUBrSPi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), - NoItinerary, "${:comment} sub.w\t$dst, $sp, $imm", []>; -def t2SUBrSPi12_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), - NoItinerary, "${:comment} subw\t$dst, $sp, $imm", []>; -def t2SUBrSPs_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs), - NoItinerary, "${:comment} sub\t$dst, $sp, $rhs", []>; -} // usesCustomInserter - - //===----------------------------------------------------------------------===// // Load / store Instructions. // @@ -917,10 +920,10 @@ defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", UnOpFrag<(sextloadi8 node:$Src)>>; let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { // Load doubleword -def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs GPR:$dst1, GPR:$dst2), +def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$dst1, rGPR:$dst2), (ins t2addrmode_imm8s4:$addr), IIC_iLoadi, "ldrd", "\t$dst1, $addr", []>; -def t2LDRDpci : T2Ii8s4<1, 0, 1, (outs GPR:$dst1, GPR:$dst2), +def t2LDRDpci : T2Ii8s4<1, 0, 1, (outs rGPR:$dst1, rGPR:$dst2), (ins i32imm:$addr), IIC_iLoadi, "ldrd", "\t$dst1, $addr", []> { let Inst{19-16} = 0b1111; // Rn @@ -967,6 +970,11 @@ def : T2Pat<(extloadi16 t2addrmode_so_reg:$addr), def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)), (t2LDRHpci tconstpool:$addr)>; +// FIXME: The destination register of the loads and stores can't be PC, but +// can be SP. We need another regclass (similar to rGPR) to represent +// that. 
Not a pressing issue since these are selected manually, +// not via pattern. + // Indexed loads let mayLoad = 1, neverHasSideEffects = 1 in { def t2LDR_PRE : T2Iidxldst<0, 0b10, 1, 1, (outs GPR:$dst, GPR:$base_wb), @@ -1286,9 +1294,9 @@ def t2MOVr : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr, // AddedComplexity to ensure isel tries t2MOVi before t2MOVi16. let isReMaterializable = 1, isAsCheapAsAMove = 1, AddedComplexity = 1 in -def t2MOVi : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi, +def t2MOVi : T2sI<(outs rGPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi, "mov", ".w\t$dst, $src", - [(set GPR:$dst, t2_so_imm:$src)]> { + [(set rGPR:$dst, t2_so_imm:$src)]> { let Inst{31-27} = 0b11110; let Inst{25} = 0; let Inst{24-21} = 0b0010; @@ -1298,9 +1306,9 @@ def t2MOVi : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi, } let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def t2MOVi16 : T2I<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi, +def t2MOVi16 : T2I<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVi, "movw", "\t$dst, $src", - [(set GPR:$dst, imm0_65535:$src)]> { + [(set rGPR:$dst, imm0_65535:$src)]> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24-21} = 0b0010; @@ -1309,10 +1317,10 @@ def t2MOVi16 : T2I<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi, } let Constraints = "$src = $dst" in -def t2MOVTi16 : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm), IIC_iMOVi, +def t2MOVTi16 : T2I<(outs rGPR:$dst), (ins rGPR:$src, i32imm:$imm), IIC_iMOVi, "movt", "\t$dst, $imm", - [(set GPR:$dst, - (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]> { + [(set rGPR:$dst, + (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24-21} = 0b0110; @@ -1320,7 +1328,7 @@ def t2MOVTi16 : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm), IIC_iMOVi, let Inst{15} = 0; } -def : T2Pat<(or GPR:$src, 0xffff0000), (t2MOVTi16 GPR:$src, 0xffff)>; +def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>; //===----------------------------------------------------------------------===// // Extend Instructions. @@ -1352,10 +1360,14 @@ defm t2UXTH : T2I_unary_rrot<0b001, "uxth", defm t2UXTB16 : T2I_unary_rrot_uxtb16<0b011, "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; -def : T2Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF), - (t2UXTB16r_rot GPR:$Src, 24)>, Requires<[HasT2ExtractPack]>; -def : T2Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF), - (t2UXTB16r_rot GPR:$Src, 8)>, Requires<[HasT2ExtractPack]>; +// FIXME: This pattern incorrectly assumes the shl operator is a rotate. +// The transformation should probably be done as a combiner action +// instead so we can include a check for masking back in the upper +// eight bits of the source into the lower eight bits of the result. 
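
A scalar demonstration of why that first pattern is commented out: uxtb16 with ror #24 also rotates the top byte of the source down into bits 7-0, which (shl x, 8) cannot produce, so the two sides disagree whenever byte 3 of the source is nonzero:

#include <cstdint>

uint32_t uxtb16_ror24(uint32_t X) {   // what the instruction computes
  uint32_t R = (X >> 24) | (X << 8);  // rotr(X, 24)
  return R & 0x00FF00FFu;
}
uint32_t shl8_mask(uint32_t X) {      // what the disabled pattern matched
  return (X << 8) & 0x00FF00FFu;
}
// uxtb16_ror24(0xAA000000) == 0x000000AA, but shl8_mask(0xAA000000) == 0.
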
+//def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF), +// (t2UXTB16r_rot rGPR:$Src, 24)>, Requires<[HasT2ExtractPack]>; +def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF), + (t2UXTB16r_rot rGPR:$Src, 8)>, Requires<[HasT2ExtractPack]>; defm t2UXTAB : T2I_bin_rrot<0b101, "uxtab", BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; @@ -1389,7 +1401,7 @@ defm t2SBCS : T2I_adde_sube_s_irs<0b1011, "sbc", BinOpFrag<(sube_live_carry node:$LHS, node:$RHS)>>; // RSB -defm t2RSB : T2I_rbin_is <0b1110, "rsb", +defm t2RSB : T2I_rbin_irs <0b1110, "rsb", BinOpFrag<(sub node:$LHS, node:$RHS)>>; defm t2RSBS : T2I_rbin_s_is <0b1110, "rsb", BinOpFrag<(subc node:$LHS, node:$RHS)>>; @@ -1409,18 +1421,18 @@ def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm), def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm), (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>; let AddedComplexity = 1 in -def : T2Pat<(addc GPR:$src, imm0_255_neg:$imm), - (t2SUBSri GPR:$src, imm0_255_neg:$imm)>; -def : T2Pat<(addc GPR:$src, t2_so_imm_neg:$imm), - (t2SUBSri GPR:$src, t2_so_imm_neg:$imm)>; +def : T2Pat<(addc rGPR:$src, imm0_255_neg:$imm), + (t2SUBSri rGPR:$src, imm0_255_neg:$imm)>; +def : T2Pat<(addc rGPR:$src, t2_so_imm_neg:$imm), + (t2SUBSri rGPR:$src, t2_so_imm_neg:$imm)>; // The with-carry-in form matches bitwise not instead of the negation. // Effectively, the inverse interpretation of the carry flag already accounts // for part of the negation. let AddedComplexity = 1 in -def : T2Pat<(adde GPR:$src, imm0_255_not:$imm), - (t2SBCSri GPR:$src, imm0_255_not:$imm)>; -def : T2Pat<(adde GPR:$src, t2_so_imm_not:$imm), - (t2SBCSri GPR:$src, t2_so_imm_not:$imm)>; +def : T2Pat<(adde rGPR:$src, imm0_255_not:$imm), + (t2SBCSri rGPR:$src, imm0_255_not:$imm)>; +def : T2Pat<(adde rGPR:$src, t2_so_imm_not:$imm), + (t2SBCSri rGPR:$src, t2_so_imm_not:$imm)>; // Select Bytes -- for disassembly only @@ -1437,9 +1449,10 @@ def t2SEL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), NoItinerary, "sel", // A6.3.13, A6.3.14, A6.3.15 Parallel addition and subtraction (signed/unsigned) // And Miscellaneous operations -- for disassembly only -class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc> - : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), NoItinerary, opc, - "\t$dst, $a, $b", [/* For disassembly only; pattern left blank */]> { +class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc, + list<dag> pat = [/* For disassembly only; pattern left blank */]> + : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), NoItinerary, opc, + "\t$dst, $a, $b", pat> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0101; let Inst{22-20} = op22_20; @@ -1449,14 +1462,16 @@ class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc> // Saturating add/subtract -- for disassembly only -def t2QADD : T2I_pam<0b000, 0b1000, "qadd">; +def t2QADD : T2I_pam<0b000, 0b1000, "qadd", + [(set rGPR:$dst, (int_arm_qadd rGPR:$a, rGPR:$b))]>; def t2QADD16 : T2I_pam<0b001, 0b0001, "qadd16">; def t2QADD8 : T2I_pam<0b000, 0b0001, "qadd8">; def t2QASX : T2I_pam<0b010, 0b0001, "qasx">; def t2QDADD : T2I_pam<0b000, 0b1001, "qdadd">; def t2QDSUB : T2I_pam<0b000, 0b1011, "qdsub">; def t2QSAX : T2I_pam<0b110, 0b0001, "qsax">; -def t2QSUB : T2I_pam<0b000, 0b1010, "qsub">; +def t2QSUB : T2I_pam<0b000, 0b1010, "qsub", + [(set rGPR:$dst, (int_arm_qsub rGPR:$a, rGPR:$b))]>; def t2QSUB16 : T2I_pam<0b101, 0b0001, "qsub16">; def t2QSUB8 : T2I_pam<0b100, 0b0001, "qsub8">; def t2UQADD16 : T2I_pam<0b001, 0b0101, "uqadd16">; @@ -1498,37 +1513,27 @@ def t2UHSUB8 : T2I_pam<0b100, 0b0110, "uhsub8">; // Unsigned Sum of Absolute 
Differences [and Accumulate] -- for disassembly only -def t2USAD8 : T2I_mac<0, 0b111, 0b0000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), +def t2USAD8 : T2I_mac<0, 0b111, 0b0000, (outs rGPR:$dst), + (ins rGPR:$a, rGPR:$b), NoItinerary, "usad8", "\t$dst, $a, $b", []> { let Inst{15-12} = 0b1111; } -def t2USADA8 : T2I_mac<0, 0b111, 0b0000, (outs GPR:$dst), - (ins GPR:$a, GPR:$b, GPR:$acc), NoItinerary, "usada8", +def t2USADA8 : T2I_mac<0, 0b111, 0b0000, (outs rGPR:$dst), + (ins rGPR:$a, rGPR:$b, rGPR:$acc), NoItinerary, "usada8", "\t$dst, $a, $b, $acc", []>; // Signed/Unsigned saturate -- for disassembly only -def t2SSATlsl : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos,GPR:$a,i32imm:$shamt), - NoItinerary, "ssat", "\t$dst, $bit_pos, $a, lsl $shamt", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11110; - let Inst{25-22} = 0b1100; - let Inst{20} = 0; - let Inst{15} = 0; - let Inst{21} = 0; // sh = '0' -} - -def t2SSATasr : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos,GPR:$a,i32imm:$shamt), - NoItinerary, "ssat", "\t$dst, $bit_pos, $a, asr $shamt", - [/* For disassembly only; pattern left blank */]> { +def t2SSAT: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a, shift_imm:$sh), + NoItinerary, "ssat", "\t$dst, $bit_pos, $a$sh", + [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; let Inst{15} = 0; - let Inst{21} = 1; // sh = '1' } -def t2SSAT16 : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), NoItinerary, +def t2SSAT16: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a), NoItinerary, "ssat16", "\t$dst, $bit_pos, $a", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; @@ -1540,27 +1545,16 @@ def t2SSAT16 : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), NoItinerary, let Inst{7-6} = 0b00; // imm2 = '00' } -def t2USATlsl : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos,GPR:$a,i32imm:$shamt), - NoItinerary, "usat", "\t$dst, $bit_pos, $a, lsl $shamt", - [/* For disassembly only; pattern left blank */]> { - let Inst{31-27} = 0b11110; - let Inst{25-22} = 0b1110; - let Inst{20} = 0; - let Inst{15} = 0; - let Inst{21} = 0; // sh = '0' -} - -def t2USATasr : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos,GPR:$a,i32imm:$shamt), - NoItinerary, "usat", "\t$dst, $bit_pos, $a, asr $shamt", - [/* For disassembly only; pattern left blank */]> { +def t2USAT: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a, shift_imm:$sh), + NoItinerary, "usat", "\t$dst, $bit_pos, $a$sh", + [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1110; let Inst{20} = 0; let Inst{15} = 0; - let Inst{21} = 1; // sh = '1' } -def t2USAT16 : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), NoItinerary, +def t2USAT16: T2I<(outs rGPR:$dst), (ins i32imm:$bit_pos, rGPR:$a), NoItinerary, "usat16", "\t$dst, $bit_pos, $a", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; @@ -1572,6 +1566,9 @@ def t2USAT16 : T2I<(outs GPR:$dst), (ins i32imm:$bit_pos, GPR:$a), NoItinerary, let Inst{7-6} = 0b00; // imm2 = '00' } +def : T2Pat<(int_arm_ssat GPR:$a, imm:$pos), (t2SSAT imm:$pos, GPR:$a, 0)>; +def : T2Pat<(int_arm_usat GPR:$a, imm:$pos), (t2USAT imm:$pos, GPR:$a, 0)>; + //===----------------------------------------------------------------------===// // Shift and rotate Instructions. 
// @@ -1582,9 +1579,9 @@ defm t2ASR : T2I_sh_ir<0b10, "asr", BinOpFrag<(sra node:$LHS, node:$RHS)>>; defm t2ROR : T2I_sh_ir<0b11, "ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>; let Uses = [CPSR] in { -def t2MOVrx : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, +def t2MOVrx : T2sI<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi, "rrx", "\t$dst, $src", - [(set GPR:$dst, (ARMrrx GPR:$src))]> { + [(set rGPR:$dst, (ARMrrx rGPR:$src))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = 0b0010; @@ -1596,9 +1593,9 @@ def t2MOVrx : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, } let Defs = [CPSR] in { -def t2MOVsrl_flag : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, +def t2MOVsrl_flag : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi, "lsrs", ".w\t$dst, $src, #1", - [(set GPR:$dst, (ARMsrl_flag GPR:$src))]> { + [(set rGPR:$dst, (ARMsrl_flag rGPR:$src))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = 0b0010; @@ -1609,9 +1606,9 @@ def t2MOVsrl_flag : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, let Inst{14-12} = 0b000; let Inst{7-6} = 0b01; } -def t2MOVsra_flag : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, +def t2MOVsra_flag : T2I<(outs rGPR:$dst), (ins rGPR:$src), IIC_iMOVsi, "asrs", ".w\t$dst, $src, #1", - [(set GPR:$dst, (ARMsra_flag GPR:$src))]> { + [(set rGPR:$dst, (ARMsra_flag rGPR:$src))]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; let Inst{24-21} = 0b0010; @@ -1638,10 +1635,13 @@ defm t2EOR : T2I_bin_w_irs<0b0100, "eor", defm t2BIC : T2I_bin_w_irs<0b0001, "bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>; +defm t2ANDS : T2I_bin_s_irs<0b0000, "and", + BinOpFrag<(ARMand node:$LHS, node:$RHS)>, 1>; + let Constraints = "$src = $dst" in -def t2BFC : T2I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm), +def t2BFC : T2I<(outs rGPR:$dst), (ins rGPR:$src, bf_inv_mask_imm:$imm), IIC_iUNAsi, "bfc", "\t$dst, $imm", - [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]> { + [(set rGPR:$dst, (and rGPR:$src, bf_inv_mask_imm:$imm))]> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24-20} = 0b10110; @@ -1649,7 +1649,7 @@ def t2BFC : T2I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm), let Inst{15} = 0; } -def t2SBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width), +def t2SBFX: T2I<(outs rGPR:$dst), (ins rGPR:$src, imm0_31:$lsb, imm0_31:$width), IIC_iALUi, "sbfx", "\t$dst, $src, $lsb, $width", []> { let Inst{31-27} = 0b11110; let Inst{25} = 1; @@ -1657,7 +1657,7 @@ def t2SBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width), let Inst{15} = 0; } -def t2UBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width), +def t2UBFX: T2I<(outs rGPR:$dst), (ins rGPR:$src, imm0_31:$lsb, imm0_31:$width), IIC_iALUi, "ubfx", "\t$dst, $src, $lsb, $width", []> { let Inst{31-27} = 0b11110; let Inst{25} = 1; @@ -1666,10 +1666,12 @@ def t2UBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width), } // A8.6.18 BFI - Bitfield insert (Encoding T1) -// Added for disassembler with the pattern field purposely left blank. -// FIXME: Utilize this instruction in codgen. 
-def t2BFI : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
- IIC_iALUi, "bfi", "\t$dst, $src, $lsb, $width", []> {
+let Constraints = "$src = $dst" in
+def t2BFI : T2I<(outs rGPR:$dst),
+ (ins rGPR:$src, rGPR:$val, bf_inv_mask_imm:$imm),
+ IIC_iALUi, "bfi", "\t$dst, $val, $imm",
+ [(set rGPR:$dst, (ARMbfi rGPR:$src, rGPR:$val,
+ bf_inv_mask_imm:$imm))]> {
 let Inst{31-27} = 0b11110;
 let Inst{25} = 1;
 let Inst{24-20} = 0b10110;
@@ -1677,19 +1679,20 @@ def t2BFI : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
 }
 
 defm t2ORN : T2I_bin_irs<0b0011, "orn", BinOpFrag<(or node:$LHS,
- (not node:$RHS))>>;
+ (not node:$RHS))>, 0, "">;
 
 // Prefer over t2EORri ra, rb, -1 because mvn has a 16-bit version
 let AddedComplexity = 1 in
 defm t2MVN : T2I_un_irs <0b0011, "mvn", UnOpFrag<(not node:$Src)>, 1, 1>;
 
-def : T2Pat<(and GPR:$src, t2_so_imm_not:$imm),
- (t2BICri GPR:$src, t2_so_imm_not:$imm)>;
+let AddedComplexity = 1 in
+def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm),
+ (t2BICri rGPR:$src, t2_so_imm_not:$imm)>;
 
 // FIXME: Disable this pattern on Darwin to work around an assembler bug.
-def : T2Pat<(or GPR:$src, t2_so_imm_not:$imm),
- (t2ORNri GPR:$src, t2_so_imm_not:$imm)>,
+def : T2Pat<(or rGPR:$src, t2_so_imm_not:$imm),
+ (t2ORNri rGPR:$src, t2_so_imm_not:$imm)>,
 Requires<[IsThumb2]>;
 
 def : T2Pat<(t2_so_imm_not:$src),
@@ -1699,9 +1702,9 @@ def : T2Pat<(t2_so_imm_not:$src),
 // Multiply Instructions.
 //
 let isCommutable = 1 in
-def t2MUL: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+def t2MUL: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32,
 "mul", "\t$dst, $a, $b",
- [(set GPR:$dst, (mul GPR:$a, GPR:$b))]> {
+ [(set rGPR:$dst, (mul rGPR:$a, rGPR:$b))]> {
 let Inst{31-27} = 0b11111;
 let Inst{26-23} = 0b0110;
 let Inst{22-20} = 0b000;
@@ -1709,9 +1712,9 @@ def t2MUL: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
 let Inst{7-4} = 0b0000; // Multiply
 }
 
-def t2MLA: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
+def t2MLA: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32,
 "mla", "\t$dst, $a, $b, $c",
- [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]> {
+ [(set rGPR:$dst, (add (mul rGPR:$a, rGPR:$b), rGPR:$c))]> {
 let Inst{31-27} = 0b11111;
 let Inst{26-23} = 0b0110;
 let Inst{22-20} = 0b000;
@@ -1719,9 +1722,9 @@ def t2MLA: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
 let Inst{7-4} = 0b0000; // Multiply
 }
 
-def t2MLS: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
+def t2MLS: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32,
 "mls", "\t$dst, $a, $b, $c",
- [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]> {
+ [(set rGPR:$dst, (sub rGPR:$c, (mul rGPR:$a, rGPR:$b)))]> {
 let Inst{31-27} = 0b11111;
 let Inst{26-23} = 0b0110;
 let Inst{22-20} = 0b000;
@@ -1732,7 +1735,8 @@ def t2MLS: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
 
 // Extra precision multiplies with low / high results
 let neverHasSideEffects = 1 in {
 let isCommutable = 1 in {
-def t2SMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64,
+def t2SMULL : T2I<(outs rGPR:$ldst, rGPR:$hdst),
+ (ins rGPR:$a, rGPR:$b), IIC_iMUL64,
 "smull", "\t$ldst, $hdst, $a, $b", []> {
 let Inst{31-27} = 0b11111;
 let Inst{26-23} = 0b0111;
@@ -1740,7 +1744,8 @@ def t2SMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64,
 let Inst{7-4} = 0b0000;
 }
 
-def t2UMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64,
+def t2UMULL : T2I<(outs rGPR:$ldst, 
rGPR:$hdst), + (ins rGPR:$a, rGPR:$b), IIC_iMUL64, "umull", "\t$ldst, $hdst, $a, $b", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0111; @@ -1750,7 +1755,8 @@ def t2UMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64, } // isCommutable // Multiply + accumulate -def t2SMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, +def t2SMLAL : T2I<(outs rGPR:$ldst, rGPR:$hdst), + (ins rGPR:$a, rGPR:$b), IIC_iMAC64, "smlal", "\t$ldst, $hdst, $a, $b", []>{ let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0111; @@ -1758,7 +1764,8 @@ def t2SMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, let Inst{7-4} = 0b0000; } -def t2UMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, +def t2UMLAL : T2I<(outs rGPR:$ldst, rGPR:$hdst), + (ins rGPR:$a, rGPR:$b), IIC_iMAC64, "umlal", "\t$ldst, $hdst, $a, $b", []>{ let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0111; @@ -1766,7 +1773,8 @@ def t2UMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, let Inst{7-4} = 0b0000; } -def t2UMAAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, +def t2UMAAL : T2I<(outs rGPR:$ldst, rGPR:$hdst), + (ins rGPR:$a, rGPR:$b), IIC_iMAC64, "umaal", "\t$ldst, $hdst, $a, $b", []>{ let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0111; @@ -1778,9 +1786,9 @@ def t2UMAAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64, // Rounding variants of the below included for disassembly only // Most significant word multiply -def t2SMMUL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, +def t2SMMUL : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, "smmul", "\t$dst, $a, $b", - [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]> { + [(set rGPR:$dst, (mulhs rGPR:$a, rGPR:$b))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -1788,7 +1796,7 @@ def t2SMMUL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) } -def t2SMMULR : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, +def t2SMMULR : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, "smmulr", "\t$dst, $a, $b", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; @@ -1797,9 +1805,9 @@ def t2SMMULR : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) } -def t2SMMLA : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, +def t2SMMLA : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32, "smmla", "\t$dst, $a, $b, $c", - [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]> { + [(set rGPR:$dst, (add (mulhs rGPR:$a, rGPR:$b), rGPR:$c))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -1807,7 +1815,7 @@ def t2SMMLA : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) } -def t2SMMLAR : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, +def t2SMMLAR: T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32, "smmlar", "\t$dst, $a, $b, $c", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; @@ -1816,9 +1824,9 @@ def t2SMMLAR : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) } -def t2SMMLS : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, +def t2SMMLS: T2I <(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32, "smmls", "\t$dst, $a, $b, $c", - [(set 
GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]> { + [(set rGPR:$dst, (sub rGPR:$c, (mulhs rGPR:$a, rGPR:$b)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b110; @@ -1826,7 +1834,7 @@ def t2SMMLS : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) } -def t2SMMLSR : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, +def t2SMMLSR:T2I <(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$c), IIC_iMAC32, "smmlsr", "\t$dst, $a, $b, $c", []> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; @@ -1836,10 +1844,10 @@ def t2SMMLSR : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, } multiclass T2I_smul<string opc, PatFrag opnode> { - def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + def BB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, !strconcat(opc, "bb"), "\t$dst, $a, $b", - [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), - (sext_inreg GPR:$b, i16)))]> { + [(set rGPR:$dst, (opnode (sext_inreg rGPR:$a, i16), + (sext_inreg rGPR:$b, i16)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1848,10 +1856,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> { let Inst{5-4} = 0b00; } - def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + def BT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, !strconcat(opc, "bt"), "\t$dst, $a, $b", - [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), - (sra GPR:$b, (i32 16))))]> { + [(set rGPR:$dst, (opnode (sext_inreg rGPR:$a, i16), + (sra rGPR:$b, (i32 16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1860,10 +1868,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> { let Inst{5-4} = 0b01; } - def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + def TB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, !strconcat(opc, "tb"), "\t$dst, $a, $b", - [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)), - (sext_inreg GPR:$b, i16)))]> { + [(set rGPR:$dst, (opnode (sra rGPR:$a, (i32 16)), + (sext_inreg rGPR:$b, i16)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1872,10 +1880,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> { let Inst{5-4} = 0b10; } - def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, + def TT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL32, !strconcat(opc, "tt"), "\t$dst, $a, $b", - [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)), - (sra GPR:$b, (i32 16))))]> { + [(set rGPR:$dst, (opnode (sra rGPR:$a, (i32 16)), + (sra rGPR:$b, (i32 16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1884,10 +1892,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> { let Inst{5-4} = 0b11; } - def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL16, + def WB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL16, !strconcat(opc, "wb"), "\t$dst, $a, $b", - [(set GPR:$dst, (sra (opnode GPR:$a, - (sext_inreg GPR:$b, i16)), (i32 16)))]> { + [(set rGPR:$dst, (sra (opnode rGPR:$a, + (sext_inreg rGPR:$b, i16)), (i32 16)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -1896,10 +1904,10 @@ multiclass T2I_smul<string opc, PatFrag opnode> { let Inst{5-4} = 0b00; } - def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL16, + def WT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iMUL16, !strconcat(opc, "wt"), "\t$dst, $a, $b", - [(set 
GPR:$dst, (sra (opnode GPR:$a, - (sra GPR:$b, (i32 16))), (i32 16)))]> { + [(set rGPR:$dst, (sra (opnode rGPR:$a, + (sra rGPR:$b, (i32 16))), (i32 16)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -1911,11 +1919,11 @@ multiclass T2I_smul<string opc, PatFrag opnode> { multiclass T2I_smla<string opc, PatFrag opnode> { - def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, + def BB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16, !strconcat(opc, "bb"), "\t$dst, $a, $b, $acc", - [(set GPR:$dst, (add GPR:$acc, - (opnode (sext_inreg GPR:$a, i16), - (sext_inreg GPR:$b, i16))))]> { + [(set rGPR:$dst, (add rGPR:$acc, + (opnode (sext_inreg rGPR:$a, i16), + (sext_inreg rGPR:$b, i16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1924,10 +1932,10 @@ multiclass T2I_smla<string opc, PatFrag opnode> { let Inst{5-4} = 0b00; } - def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, + def BT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16, !strconcat(opc, "bt"), "\t$dst, $a, $b, $acc", - [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16), - (sra GPR:$b, (i32 16)))))]> { + [(set rGPR:$dst, (add rGPR:$acc, (opnode (sext_inreg rGPR:$a, i16), + (sra rGPR:$b, (i32 16)))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1936,10 +1944,10 @@ multiclass T2I_smla<string opc, PatFrag opnode> { let Inst{5-4} = 0b01; } - def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, + def TB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16, !strconcat(opc, "tb"), "\t$dst, $a, $b, $acc", - [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)), - (sext_inreg GPR:$b, i16))))]> { + [(set rGPR:$dst, (add rGPR:$acc, (opnode (sra rGPR:$a, (i32 16)), + (sext_inreg rGPR:$b, i16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1948,10 +1956,10 @@ multiclass T2I_smla<string opc, PatFrag opnode> { let Inst{5-4} = 0b10; } - def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, + def TT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16, !strconcat(opc, "tt"), "\t$dst, $a, $b, $acc", - [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)), - (sra GPR:$b, (i32 16)))))]> { + [(set rGPR:$dst, (add rGPR:$acc, (opnode (sra rGPR:$a, (i32 16)), + (sra rGPR:$b, (i32 16)))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -1960,10 +1968,10 @@ multiclass T2I_smla<string opc, PatFrag opnode> { let Inst{5-4} = 0b11; } - def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, + def WB : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16, !strconcat(opc, "wb"), "\t$dst, $a, $b, $acc", - [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, - (sext_inreg GPR:$b, i16)), (i32 16))))]> { + [(set rGPR:$dst, (add rGPR:$acc, (sra (opnode rGPR:$a, + (sext_inreg rGPR:$b, i16)), (i32 16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -1972,10 +1980,10 @@ multiclass T2I_smla<string opc, PatFrag opnode> { let Inst{5-4} = 0b00; } - def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16, + def WT : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC16, !strconcat(opc, "wt"), "\t$dst, $a, $b, $acc", - [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, - (sra GPR:$b, (i32 16))), 
(i32 16))))]> { + [(set rGPR:$dst, (add rGPR:$acc, (sra (opnode rGPR:$a, + (sra rGPR:$b, (i32 16))), (i32 16))))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -1989,61 +1997,61 @@ defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>; defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; // Halfword multiply accumulate long: SMLAL<x><y> -- for disassembly only -def t2SMLALBB : T2I_mac<1, 0b100, 0b1000, (outs GPR:$ldst,GPR:$hdst), - (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlalbb", "\t$ldst, $hdst, $a, $b", +def t2SMLALBB : T2I_mac<1, 0b100, 0b1000, (outs rGPR:$ldst,rGPR:$hdst), + (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlalbb", "\t$ldst, $hdst, $a, $b", [/* For disassembly only; pattern left blank */]>; -def t2SMLALBT : T2I_mac<1, 0b100, 0b1001, (outs GPR:$ldst,GPR:$hdst), - (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlalbt", "\t$ldst, $hdst, $a, $b", +def t2SMLALBT : T2I_mac<1, 0b100, 0b1001, (outs rGPR:$ldst,rGPR:$hdst), + (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlalbt", "\t$ldst, $hdst, $a, $b", [/* For disassembly only; pattern left blank */]>; -def t2SMLALTB : T2I_mac<1, 0b100, 0b1010, (outs GPR:$ldst,GPR:$hdst), - (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlaltb", "\t$ldst, $hdst, $a, $b", +def t2SMLALTB : T2I_mac<1, 0b100, 0b1010, (outs rGPR:$ldst,rGPR:$hdst), + (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlaltb", "\t$ldst, $hdst, $a, $b", [/* For disassembly only; pattern left blank */]>; -def t2SMLALTT : T2I_mac<1, 0b100, 0b1011, (outs GPR:$ldst,GPR:$hdst), - (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlaltt", "\t$ldst, $hdst, $a, $b", +def t2SMLALTT : T2I_mac<1, 0b100, 0b1011, (outs rGPR:$ldst,rGPR:$hdst), + (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlaltt", "\t$ldst, $hdst, $a, $b", [/* For disassembly only; pattern left blank */]>; // Dual halfword multiply: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD // These are for disassembly only.
-def t2SMUAD : T2I_mac<0, 0b010, 0b0000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMAC32, "smuad", "\t$dst, $a, $b", []> { +def t2SMUAD: T2I_mac<0, 0b010, 0b0000, (outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), + IIC_iMAC32, "smuad", "\t$dst, $a, $b", []> { let Inst{15-12} = 0b1111; } -def t2SMUADX : T2I_mac<0, 0b010, 0b0001, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMAC32, "smuadx", "\t$dst, $a, $b", []> { +def t2SMUADX:T2I_mac<0, 0b010, 0b0001, (outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), + IIC_iMAC32, "smuadx", "\t$dst, $a, $b", []> { let Inst{15-12} = 0b1111; } -def t2SMUSD : T2I_mac<0, 0b100, 0b0000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMAC32, "smusd", "\t$dst, $a, $b", []> { +def t2SMUSD: T2I_mac<0, 0b100, 0b0000, (outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), + IIC_iMAC32, "smusd", "\t$dst, $a, $b", []> { let Inst{15-12} = 0b1111; } -def t2SMUSDX : T2I_mac<0, 0b100, 0b0001, (outs GPR:$dst), (ins GPR:$a, GPR:$b), - IIC_iMAC32, "smusdx", "\t$dst, $a, $b", []> { +def t2SMUSDX:T2I_mac<0, 0b100, 0b0001, (outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), + IIC_iMAC32, "smusdx", "\t$dst, $a, $b", []> { let Inst{15-12} = 0b1111; } -def t2SMLAD : T2I_mac<0, 0b010, 0b0000, (outs GPR:$dst), - (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC32, "smlad", +def t2SMLAD : T2I_mac<0, 0b010, 0b0000, (outs rGPR:$dst), + (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC32, "smlad", "\t$dst, $a, $b, $acc", []>; -def t2SMLADX : T2I_mac<0, 0b010, 0b0001, (outs GPR:$dst), - (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC32, "smladx", +def t2SMLADX : T2I_mac<0, 0b010, 0b0001, (outs rGPR:$dst), + (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC32, "smladx", "\t$dst, $a, $b, $acc", []>; -def t2SMLSD : T2I_mac<0, 0b100, 0b0000, (outs GPR:$dst), - (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC32, "smlsd", +def t2SMLSD : T2I_mac<0, 0b100, 0b0000, (outs rGPR:$dst), + (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC32, "smlsd", "\t$dst, $a, $b, $acc", []>; -def t2SMLSDX : T2I_mac<0, 0b100, 0b0001, (outs GPR:$dst), - (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC32, "smlsdx", +def t2SMLSDX : T2I_mac<0, 0b100, 0b0001, (outs rGPR:$dst), + (ins rGPR:$a, rGPR:$b, rGPR:$acc), IIC_iMAC32, "smlsdx", "\t$dst, $a, $b, $acc", []>; -def t2SMLALD : T2I_mac<1, 0b100, 0b1100, (outs GPR:$ldst,GPR:$hdst), - (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlald", +def t2SMLALD : T2I_mac<1, 0b100, 0b1100, (outs rGPR:$ldst,rGPR:$hdst), + (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlald", "\t$ldst, $hdst, $a, $b", []>; -def t2SMLALDX : T2I_mac<1, 0b100, 0b1101, (outs GPR:$ldst,GPR:$hdst), - (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlaldx", +def t2SMLALDX : T2I_mac<1, 0b100, 0b1101, (outs rGPR:$ldst,rGPR:$hdst), + (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlaldx", "\t$ldst, $hdst, $a, $b", []>; -def t2SMLSLD : T2I_mac<1, 0b101, 0b1100, (outs GPR:$ldst,GPR:$hdst), - (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlsld", +def t2SMLSLD : T2I_mac<1, 0b101, 0b1100, (outs rGPR:$ldst,rGPR:$hdst), + (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlsld", "\t$ldst, $hdst, $a, $b", []>; -def t2SMLSLDX : T2I_mac<1, 0b101, 0b1101, (outs GPR:$ldst,GPR:$hdst), - (ins GPR:$a,GPR:$b), IIC_iMAC64, "smlsldx", +def t2SMLSLDX : T2I_mac<1, 0b101, 0b1101, (outs rGPR:$ldst,rGPR:$hdst), + (ins rGPR:$a,rGPR:$b), IIC_iMAC64, "smlsldx", "\t$ldst, $hdst, $a, $b", []>; //===----------------------------------------------------------------------===// @@ -2061,35 +2069,35 @@ class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops, let Inst{5-4} = op2; } -def t2CLZ : T2I_misc<0b11, 0b00, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, - "clz", "\t$dst, $src", [(set 
GPR:$dst, (ctlz GPR:$src))]>; +def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, + "clz", "\t$dst, $src", [(set rGPR:$dst, (ctlz rGPR:$src))]>; -def t2RBIT : T2I_misc<0b01, 0b10, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, +def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, "rbit", "\t$dst, $src", - [(set GPR:$dst, (ARMrbit GPR:$src))]>; + [(set rGPR:$dst, (ARMrbit rGPR:$src))]>; -def t2REV : T2I_misc<0b01, 0b00, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, - "rev", ".w\t$dst, $src", [(set GPR:$dst, (bswap GPR:$src))]>; +def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, + "rev", ".w\t$dst, $src", [(set rGPR:$dst, (bswap rGPR:$src))]>; -def t2REV16 : T2I_misc<0b01, 0b01, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, +def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, "rev16", ".w\t$dst, $src", - [(set GPR:$dst, - (or (and (srl GPR:$src, (i32 8)), 0xFF), - (or (and (shl GPR:$src, (i32 8)), 0xFF00), - (or (and (srl GPR:$src, (i32 8)), 0xFF0000), - (and (shl GPR:$src, (i32 8)), 0xFF000000)))))]>; + [(set rGPR:$dst, + (or (and (srl rGPR:$src, (i32 8)), 0xFF), + (or (and (shl rGPR:$src, (i32 8)), 0xFF00), + (or (and (srl rGPR:$src, (i32 8)), 0xFF0000), + (and (shl rGPR:$src, (i32 8)), 0xFF000000)))))]>; -def t2REVSH : T2I_misc<0b01, 0b11, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, +def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$dst), (ins rGPR:$src), IIC_iUNAr, "revsh", ".w\t$dst, $src", - [(set GPR:$dst, + [(set rGPR:$dst, (sext_inreg - (or (srl (and GPR:$src, 0xFF00), (i32 8)), - (shl GPR:$src, (i32 8))), i16))]>; + (or (srl (and rGPR:$src, 0xFF00), (i32 8)), + (shl rGPR:$src, (i32 8))), i16))]>; -def t2PKHBT : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), - IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2, lsl $shamt", - [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF), - (and (shl GPR:$src2, (i32 imm:$shamt)), +def t2PKHBT : T2I<(outs rGPR:$dst), (ins rGPR:$src1, rGPR:$src2, shift_imm:$sh), + IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2$sh", + [(set rGPR:$dst, (or (and rGPR:$src1, 0xFFFF), + (and (shl rGPR:$src2, lsl_amt:$sh), 0xFFFF0000)))]>, Requires<[HasT2ExtractPack]> { let Inst{31-27} = 0b11101; @@ -2100,18 +2108,20 @@ def t2PKHBT : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), } // Alternate cases for PKHBT where identities eliminate some nodes. -def : T2Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)), - (t2PKHBT GPR:$src1, GPR:$src2, 0)>, +def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)), + (t2PKHBT rGPR:$src1, rGPR:$src2, 0)>, Requires<[HasT2ExtractPack]>; -def : T2Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)), - (t2PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>, +def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)), + (t2PKHBT rGPR:$src1, rGPR:$src2, (lsl_shift_imm imm16_31:$sh))>, Requires<[HasT2ExtractPack]>; -def t2PKHTB : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), - IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2, asr $shamt", - [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000), - (and (sra GPR:$src2, imm16_31:$shamt), - 0xFFFF)))]>, +// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and +// will match the pattern below. 
+def t2PKHTB : T2I<(outs rGPR:$dst), (ins rGPR:$src1, rGPR:$src2, shift_imm:$sh), + IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2$sh", + [(set rGPR:$dst, (or (and rGPR:$src1, 0xFFFF0000), + (and (sra rGPR:$src2, asr_amt:$sh), + 0xFFFF)))]>, Requires<[HasT2ExtractPack]> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; @@ -2122,18 +2132,17 @@ def t2PKHTB : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), // Alternate cases for PKHTB where identities eliminate some nodes. Note that // a shift amount of 0 is *not legal* here, it is PKHBT instead. -def : T2Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, (i32 16))), - (t2PKHTB GPR:$src1, GPR:$src2, 16)>, +def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16_31:$sh)), + (t2PKHTB rGPR:$src1, rGPR:$src2, (asr_shift_imm imm16_31:$sh))>, Requires<[HasT2ExtractPack]>; -def : T2Pat<(or (and GPR:$src1, 0xFFFF0000), - (and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)), - (t2PKHTB GPR:$src1, GPR:$src2, imm1_15:$shamt)>, +def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), + (and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)), + (t2PKHTB rGPR:$src1, rGPR:$src2, (asr_shift_imm imm1_15:$sh))>, Requires<[HasT2ExtractPack]>; //===----------------------------------------------------------------------===// // Comparison Instructions... // - defm t2CMP : T2I_cmp_irs<0b1101, "cmp", BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; defm t2CMPz : T2I_cmp_irs<0b1101, "cmp", @@ -2157,18 +2166,13 @@ defm t2TST : T2I_cmp_irs<0b0000, "tst", defm t2TEQ : T2I_cmp_irs<0b0100, "teq", BinOpFrag<(ARMcmpZ (xor node:$LHS, node:$RHS), 0)>>; -// A8.6.27 CBNZ, CBZ - Compare and branch on (non)zero. -// Short range conditional branch. Looks awesome for loops. Need to figure -// out how to use this one. - - // Conditional moves // FIXME: should be able to write a pattern for ARMcmov, but can't use // a two-value operand where a dag node expects two operands. :( let neverHasSideEffects = 1 in { -def t2MOVCCr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true), IIC_iCMOVr, +def t2MOVCCr : T2I<(outs rGPR:$dst), (ins rGPR:$false, rGPR:$true), IIC_iCMOVr, "mov", ".w\t$dst, $true", - [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>, + [/*(set rGPR:$dst, (ARMcmov rGPR:$false, rGPR:$true, imm:$cc, CCR:$ccr))*/]>, RegConstraint<"$false = $dst"> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b01; @@ -2179,9 +2183,9 @@ def t2MOVCCr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true), IIC_iCMOVr, let Inst{7-4} = 0b0000; } -def t2MOVCCi : T2I<(outs GPR:$dst), (ins GPR:$false, t2_so_imm:$true), +def t2MOVCCi : T2I<(outs rGPR:$dst), (ins rGPR:$false, t2_so_imm:$true), IIC_iCMOVi, "mov", ".w\t$dst, $true", -[/*(set GPR:$dst, (ARMcmov GPR:$false, t2_so_imm:$true, imm:$cc, CCR:$ccr))*/]>, +[/*(set rGPR:$dst,(ARMcmov rGPR:$false,t2_so_imm:$true, imm:$cc, CCR:$ccr))*/]>, RegConstraint<"$false = $dst"> { let Inst{31-27} = 0b11110; let Inst{25} = 0; @@ -2201,20 +2205,20 @@ class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, let Inst{19-16} = 0b1111; // Rn let Inst{5-4} = opcod; // Shift type. 
} -def t2MOVCClsl : T2I_movcc_sh<0b00, (outs GPR:$dst), - (ins GPR:$false, GPR:$true, i32imm:$rhs), +def t2MOVCClsl : T2I_movcc_sh<0b00, (outs rGPR:$dst), + (ins rGPR:$false, rGPR:$true, i32imm:$rhs), IIC_iCMOVsi, "lsl", ".w\t$dst, $true, $rhs", []>, RegConstraint<"$false = $dst">; -def t2MOVCClsr : T2I_movcc_sh<0b01, (outs GPR:$dst), - (ins GPR:$false, GPR:$true, i32imm:$rhs), +def t2MOVCClsr : T2I_movcc_sh<0b01, (outs rGPR:$dst), + (ins rGPR:$false, rGPR:$true, i32imm:$rhs), IIC_iCMOVsi, "lsr", ".w\t$dst, $true, $rhs", []>, RegConstraint<"$false = $dst">; -def t2MOVCCasr : T2I_movcc_sh<0b10, (outs GPR:$dst), - (ins GPR:$false, GPR:$true, i32imm:$rhs), +def t2MOVCCasr : T2I_movcc_sh<0b10, (outs rGPR:$dst), + (ins rGPR:$false, rGPR:$true, i32imm:$rhs), IIC_iCMOVsi, "asr", ".w\t$dst, $true, $rhs", []>, RegConstraint<"$false = $dst">; -def t2MOVCCror : T2I_movcc_sh<0b11, (outs GPR:$dst), - (ins GPR:$false, GPR:$true, i32imm:$rhs), +def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$dst), + (ins rGPR:$false, rGPR:$true, i32imm:$rhs), IIC_iCMOVsi, "ror", ".w\t$dst, $true, $rhs", []>, RegConstraint<"$false = $dst">; } // neverHasSideEffects @@ -2225,21 +2229,15 @@ def t2MOVCCror : T2I_movcc_sh<0b11, (outs GPR:$dst), // memory barriers protect the atomic sequences let hasSideEffects = 1 in { -def t2Int_MemBarrierV7 : AInoP<(outs), (ins), - ThumbFrm, NoItinerary, - "dmb", "", - [(ARMMemBarrierV7)]>, - Requires<[IsThumb2]> { +def t2DMBsy : AInoP<(outs), (ins), ThumbFrm, NoItinerary, "dmb", "", + [(ARMMemBarrier)]>, Requires<[IsThumb, HasDB]> { let Inst{31-4} = 0xF3BF8F5; // FIXME: add support for options other than a full system DMB let Inst{3-0} = 0b1111; } -def t2Int_SyncBarrierV7 : AInoP<(outs), (ins), - ThumbFrm, NoItinerary, - "dsb", "", - [(ARMSyncBarrierV7)]>, - Requires<[IsThumb2]> { +def t2DSBsy : AInoP<(outs), (ins), ThumbFrm, NoItinerary, "dsb", "", + [(ARMSyncBarrier)]>, Requires<[IsThumb, HasDB]> { let Inst{31-4} = 0xF3BF8F4; // FIXME: add support for options other than a full system DSB let Inst{3-0} = 0b1111; @@ -2329,13 +2327,13 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, } let mayLoad = 1 in { -def t2LDREXB : T2I_ldrex<0b00, (outs GPR:$dest), (ins GPR:$ptr), AddrModeNone, +def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$dest), (ins rGPR:$ptr), AddrModeNone, Size4Bytes, NoItinerary, "ldrexb", "\t$dest, [$ptr]", "", []>; -def t2LDREXH : T2I_ldrex<0b01, (outs GPR:$dest), (ins GPR:$ptr), AddrModeNone, +def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$dest), (ins rGPR:$ptr), AddrModeNone, Size4Bytes, NoItinerary, "ldrexh", "\t$dest, [$ptr]", "", []>; -def t2LDREX : Thumb2I<(outs GPR:$dest), (ins GPR:$ptr), AddrModeNone, +def t2LDREX : Thumb2I<(outs rGPR:$dest), (ins rGPR:$ptr), AddrModeNone, Size4Bytes, NoItinerary, "ldrex", "\t$dest, [$ptr]", "", []> { @@ -2344,20 +2342,20 @@ def t2LDREX : Thumb2I<(outs GPR:$dest), (ins GPR:$ptr), AddrModeNone, let Inst{11-8} = 0b1111; let Inst{7-0} = 0b00000000; // imm8 = 0 } -def t2LDREXD : T2I_ldrex<0b11, (outs GPR:$dest, GPR:$dest2), (ins GPR:$ptr), +def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$dest, rGPR:$dest2), (ins rGPR:$ptr), AddrModeNone, Size4Bytes, NoItinerary, "ldrexd", "\t$dest, $dest2, [$ptr]", "", [], {?, ?, ?, ?}>; } let mayStore = 1, Constraints = "@earlyclobber $success" in { -def t2STREXB : T2I_strex<0b00, (outs GPR:$success), (ins GPR:$src, GPR:$ptr), +def t2STREXB : T2I_strex<0b00, (outs rGPR:$success), (ins rGPR:$src, rGPR:$ptr), AddrModeNone, Size4Bytes, NoItinerary, "strexb", "\t$success, $src, [$ptr]", 
"", []>; -def t2STREXH : T2I_strex<0b01, (outs GPR:$success), (ins GPR:$src, GPR:$ptr), +def t2STREXH : T2I_strex<0b01, (outs rGPR:$success), (ins rGPR:$src, rGPR:$ptr), AddrModeNone, Size4Bytes, NoItinerary, "strexh", "\t$success, $src, [$ptr]", "", []>; -def t2STREX : Thumb2I<(outs GPR:$success), (ins GPR:$src, GPR:$ptr), +def t2STREX : Thumb2I<(outs rGPR:$success), (ins rGPR:$src, rGPR:$ptr), AddrModeNone, Size4Bytes, NoItinerary, "strex", "\t$success, $src, [$ptr]", "", []> { @@ -2365,8 +2363,8 @@ def t2STREX : Thumb2I<(outs GPR:$success), (ins GPR:$src, GPR:$ptr), let Inst{26-20} = 0b0000100; let Inst{7-0} = 0b00000000; // imm8 = 0 } -def t2STREXD : T2I_strex<0b11, (outs GPR:$success), - (ins GPR:$src, GPR:$src2, GPR:$ptr), +def t2STREXD : T2I_strex<0b11, (outs rGPR:$success), + (ins rGPR:$src, rGPR:$src2, rGPR:$ptr), AddrModeNone, Size4Bytes, NoItinerary, "strexd", "\t$success, $src, $src2, [$ptr]", "", [], {?, ?, ?, ?}>; @@ -2416,7 +2414,7 @@ let Defs = D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31 ], hasSideEffects = 1, isBarrier = 1 in { - def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins GPR:$src, tGPR:$val), + def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" "adds\t$val, #7\n\t" @@ -2425,14 +2423,14 @@ let Defs = "b\t1f\n\t" "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" "1:", "", - [(set R0, (ARMeh_sjlj_setjmp GPR:$src, tGPR:$val))]>, + [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>, Requires<[IsThumb2, HasVFP2]>; } let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], hasSideEffects = 1, isBarrier = 1 in { - def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins GPR:$src, tGPR:$val), + def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" "adds\t$val, #7\n\t" @@ -2441,7 +2439,7 @@ let Defs = "b\t1f\n\t" "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" "1:", "", - [(set R0, (ARMeh_sjlj_setjmp GPR:$src, tGPR:$val))]>, + [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>, Requires<[IsThumb2, NoVFP]>; } @@ -2482,7 +2480,7 @@ let isNotDuplicable = 1, isIndirectBranch = 1 in { def t2BR_JT : T2JTI<(outs), (ins GPR:$target, GPR:$index, jt2block_operand:$jt, i32imm:$id), - IIC_Br, "mov\tpc, $target\n$jt", + IIC_Br, "mov\tpc, $target$jt", [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0100100; @@ -2496,7 +2494,7 @@ def t2BR_JT : def t2TBB : T2JTI<(outs), (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id), - IIC_Br, "tbb\t$index\n$jt", []> { + IIC_Br, "tbb\t$index$jt", []> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0001101; let Inst{19-16} = 0b1111; // Rn = pc (table follows this instruction) @@ -2507,7 +2505,7 @@ def t2TBB : def t2TBH : T2JTI<(outs), (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id), - IIC_Br, "tbh\t$index\n$jt", []> { + IIC_Br, "tbh\t$index$jt", []> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0001101; let Inst{19-16} = 0b1111; // Rn = pc (table follows this instruction) @@ -2560,7 +2558,7 @@ def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask), // Branch and Exchange Jazelle -- for disassembly only // Rm = Inst{19-16} -def t2BXJ : T2I<(outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", +def t2BXJ : T2I<(outs), (ins 
rGPR:$func), NoItinerary, "bxj", "\t$func", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; let Inst{26} = 0; @@ -2647,25 +2645,25 @@ def t2SRSIA : T2I<(outs), (ins i32imm:$mode),NoItinerary,"srsia","\tsp, $mode", } // Return From Exception is a system instruction -- for disassembly only -def t2RFEDBW : T2I<(outs), (ins GPR:$base), NoItinerary, "rfedb", "\t$base!", +def t2RFEDBW : T2I<(outs), (ins rGPR:$base), NoItinerary, "rfedb", "\t$base!", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0000011; // W = 1 } -def t2RFEDB : T2I<(outs), (ins GPR:$base), NoItinerary, "rfeab", "\t$base", +def t2RFEDB : T2I<(outs), (ins rGPR:$base), NoItinerary, "rfedb", "\t$base", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0000001; // W = 0 } -def t2RFEIAW : T2I<(outs), (ins GPR:$base), NoItinerary, "rfeia", "\t$base!", +def t2RFEIAW : T2I<(outs), (ins rGPR:$base), NoItinerary, "rfeia", "\t$base!", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0011011; // W = 1 } -def t2RFEIA : T2I<(outs), (ins GPR:$base), NoItinerary, "rfeia", "\t$base", +def t2RFEIA : T2I<(outs), (ins rGPR:$base), NoItinerary, "rfeia", "\t$base", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0011001; // W = 0 @@ -2676,26 +2674,26 @@ def t2RFEIA : T2I<(outs), (ins GPR:$base), NoItinerary, "rfeia", "\t$base", // // Two piece so_imms. -def : T2Pat<(or GPR:$LHS, t2_so_imm2part:$RHS), - (t2ORRri (t2ORRri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)), +def : T2Pat<(or rGPR:$LHS, t2_so_imm2part:$RHS), + (t2ORRri (t2ORRri rGPR:$LHS, (t2_so_imm2part_1 imm:$RHS)), (t2_so_imm2part_2 imm:$RHS))>; -def : T2Pat<(xor GPR:$LHS, t2_so_imm2part:$RHS), - (t2EORri (t2EORri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)), +def : T2Pat<(xor rGPR:$LHS, t2_so_imm2part:$RHS), + (t2EORri (t2EORri rGPR:$LHS, (t2_so_imm2part_1 imm:$RHS)), (t2_so_imm2part_2 imm:$RHS))>; -def : T2Pat<(add GPR:$LHS, t2_so_imm2part:$RHS), - (t2ADDri (t2ADDri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)), +def : T2Pat<(add rGPR:$LHS, t2_so_imm2part:$RHS), + (t2ADDri (t2ADDri rGPR:$LHS, (t2_so_imm2part_1 imm:$RHS)), (t2_so_imm2part_2 imm:$RHS))>; -def : T2Pat<(add GPR:$LHS, t2_so_neg_imm2part:$RHS), - (t2SUBri (t2SUBri GPR:$LHS, (t2_so_neg_imm2part_1 imm:$RHS)), +def : T2Pat<(add rGPR:$LHS, t2_so_neg_imm2part:$RHS), + (t2SUBri (t2SUBri rGPR:$LHS, (t2_so_neg_imm2part_1 imm:$RHS)), (t2_so_neg_imm2part_2 imm:$RHS))>; // 32-bit immediate using movw + movt. // This is a single pseudo instruction to make it re-materializable. Remove // when we can do generalized remat.
let isReMaterializable = 1 in -def t2MOVi32imm : T2Ix2<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi, +def t2MOVi32imm : T2Ix2<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVi, "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}", - [(set GPR:$dst, (i32 imm:$src))]>; + [(set rGPR:$dst, (i32 imm:$src))]>; // ConstantPool, GlobalAddress, and JumpTable def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>, @@ -2723,7 +2721,7 @@ def t2LDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), // // Rd = Instr{11-8} -def t2MRS : T2I<(outs GPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, cpsr", +def t2MRS : T2I<(outs rGPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, cpsr", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; let Inst{26} = 0; @@ -2734,7 +2732,7 @@ def t2MRS : T2I<(outs GPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, cpsr", } // Rd = Instr{11-8} -def t2MRSsys : T2I<(outs GPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, spsr", +def t2MRSsys : T2I<(outs rGPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, spsr", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; let Inst{26} = 0; @@ -2745,7 +2743,7 @@ def t2MRSsys : T2I<(outs GPR:$dst), (ins), NoItinerary, "mrs", "\t$dst, spsr", } // Rn = Inst{19-16} -def t2MSR : T2I<(outs), (ins GPR:$src, msr_mask:$mask), NoItinerary, "msr", +def t2MSR : T2I<(outs), (ins rGPR:$src, msr_mask:$mask), NoItinerary, "msr", "\tcpsr$mask, $src", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; @@ -2757,7 +2755,7 @@ def t2MSR : T2I<(outs), (ins GPR:$src, msr_mask:$mask), NoItinerary, "msr", } // Rn = Inst{19-16} -def t2MSRsys : T2I<(outs), (ins GPR:$src, msr_mask:$mask), NoItinerary, "msr", +def t2MSRsys : T2I<(outs), (ins rGPR:$src, msr_mask:$mask), NoItinerary, "msr", "\tspsr$mask, $src", [/* For disassembly only; pattern left blank */]> { let Inst{31-27} = 0b11110; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 84c23e1a784cc..c29e09606bd48 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -77,61 +77,61 @@ def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$src, addrmode5:$addr), // let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { -def VLDMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$dsts, +def VLDMD : AXDI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), IndexModeNone, IIC_fpLoadm, - "vldm${addr:submode}${p}\t${addr:base}, $dsts", "", []> { + "vldm${addr:submode}${p}\t$addr, $dsts", "", []> { let Inst{20} = 1; } -def VLDMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$dsts, +def VLDMS : AXSI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), IndexModeNone, IIC_fpLoadm, - "vldm${addr:submode}${p}\t${addr:base}, $dsts", "", []> { + "vldm${addr:submode}${p}\t$addr, $dsts", "", []> { let Inst{20} = 1; } -def VLDMD_UPD : AXDI5<(outs GPR:$wb), (ins addrmode5:$addr, pred:$p, +def VLDMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), IndexModeUpd, IIC_fpLoadm, - "vldm${addr:submode}${p}\t${addr:base}!, $dsts", - "$addr.base = $wb", []> { + "vldm${addr:submode}${p}\t$addr!, $dsts", + "$addr.addr = $wb", []> { let Inst{20} = 1; } -def VLDMS_UPD : AXSI5<(outs GPR:$wb), (ins addrmode5:$addr, pred:$p, +def VLDMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), IndexModeUpd, IIC_fpLoadm, - 
"vldm${addr:submode}${p}\t${addr:base}!, $dsts", - "$addr.base = $wb", []> { + "vldm${addr:submode}${p}\t$addr!, $dsts", + "$addr.addr = $wb", []> { let Inst{20} = 1; } } // mayLoad, neverHasSideEffects, hasExtraDefRegAllocReq let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { -def VSTMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$srcs, +def VSTMD : AXDI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs, variable_ops), IndexModeNone, IIC_fpStorem, - "vstm${addr:submode}${p}\t${addr:base}, $srcs", "", []> { + "vstm${addr:submode}${p}\t$addr, $srcs", "", []> { let Inst{20} = 0; } -def VSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$srcs, +def VSTMS : AXSI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs, variable_ops), IndexModeNone, IIC_fpStorem, - "vstm${addr:submode}${p}\t${addr:base}, $srcs", "", []> { + "vstm${addr:submode}${p}\t$addr, $srcs", "", []> { let Inst{20} = 0; } -def VSTMD_UPD : AXDI5<(outs GPR:$wb), (ins addrmode5:$addr, pred:$p, +def VSTMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, reglist:$srcs, variable_ops), IndexModeUpd, IIC_fpStorem, - "vstm${addr:submode}${p}\t${addr:base}!, $srcs", - "$addr.base = $wb", []> { + "vstm${addr:submode}${p}\t$addr!, $srcs", + "$addr.addr = $wb", []> { let Inst{20} = 0; } -def VSTMS_UPD : AXSI5<(outs GPR:$wb), (ins addrmode5:$addr, pred:$p, +def VSTMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, reglist:$srcs, variable_ops), IndexModeUpd, IIC_fpStorem, - "vstm${addr:submode}${p}\t${addr:base}!, $srcs", - "$addr.base = $wb", []> { + "vstm${addr:submode}${p}\t$addr!, $srcs", + "$addr.addr = $wb", []> { let Inst{20} = 0; } } // mayStore, neverHasSideEffects, hasExtraSrcRegAllocReq @@ -420,34 +420,35 @@ def VTOUIZS : AVConv1In<0b11101, 0b11, 0b1100, 0b1010, // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. // For disassembly only. 
- +let Uses = [FPSCR] in { def VTOSIRD : AVConv1I<0b11101, 0b11, 0b1101, 0b1011, (outs SPR:$dst), (ins DPR:$a), IIC_fpCVTDI, "vcvtr", ".s32.f64\t$dst, $a", - [/* For disassembly only; pattern left blank */]> { + [(set SPR:$dst, (int_arm_vcvtr (f64 DPR:$a)))]> { let Inst{7} = 0; // Z bit } def VTOSIRS : AVConv1In<0b11101, 0b11, 0b1101, 0b1010, (outs SPR:$dst), (ins SPR:$a), IIC_fpCVTSI, "vcvtr", ".s32.f32\t$dst, $a", - [/* For disassembly only; pattern left blank */]> { + [(set SPR:$dst, (int_arm_vcvtr SPR:$a))]> { let Inst{7} = 0; // Z bit } def VTOUIRD : AVConv1I<0b11101, 0b11, 0b1100, 0b1011, (outs SPR:$dst), (ins DPR:$a), IIC_fpCVTDI, "vcvtr", ".u32.f64\t$dst, $a", - [/* For disassembly only; pattern left blank */]> { + [(set SPR:$dst, (int_arm_vcvtru (f64 DPR:$a)))]> { let Inst{7} = 0; // Z bit } def VTOUIRS : AVConv1In<0b11101, 0b11, 0b1100, 0b1010, (outs SPR:$dst), (ins SPR:$a), IIC_fpCVTSI, "vcvtr", ".u32.f32\t$dst, $a", - [/* For disassembly only; pattern left blank */]> { + [(set SPR:$dst, (int_arm_vcvtru SPR:$a))]> { let Inst{7} = 0; // Z bit } +} // Convert between floating-point and fixed-point // Data type for fixed-point naming convention: @@ -460,6 +461,7 @@ let Constraints = "$a = $dst" in { // FP to Fixed-Point: +let isCodeGenOnly = 1 in { def VTOSHS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", @@ -499,9 +501,11 @@ def VTOULD : AVConv1XI<0b11101, 0b11, 0b1111, 0b1011, 1, (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", [/* For disassembly only; pattern left blank */]>; +} // Fixed-Point to FP: +let isCodeGenOnly = 1 in { def VSHTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", @@ -541,6 +545,7 @@ def VULTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 1, (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", [/* For disassembly only; pattern left blank */]>; +} } // End of 'let Constraints = "$src = $dst" in' @@ -654,32 +659,27 @@ def FMSTAT : VFPAI<(outs), (ins), VFPMiscFrm, IIC_fpSTAT, "vmrs", } // FPSCR <-> GPR (for disassembly only) - -let neverHasSideEffects = 1 in { -let Uses = [FPSCR] in { -def VMRS : VFPAI<(outs GPR:$dst), (ins), VFPMiscFrm, IIC_fpSTAT, "vmrs", - "\t$dst, fpscr", - [/* For disassembly only; pattern left blank */]> { +let hasSideEffects = 1, Uses = [FPSCR] in +def VMRS : VFPAI<(outs GPR:$dst), (ins), VFPMiscFrm, IIC_fpSTAT, + "vmrs", "\t$dst, fpscr", + [(set GPR:$dst, (int_arm_get_fpscr))]> { let Inst{27-20} = 0b11101111; let Inst{19-16} = 0b0001; let Inst{11-8} = 0b1010; let Inst{7} = 0; let Inst{4} = 1; } -} -let Defs = [FPSCR] in { -def VMSR : VFPAI<(outs), (ins GPR:$src), VFPMiscFrm, IIC_fpSTAT, "vmsr", - "\tfpscr, $src", - [/* For disassembly only; pattern left blank */]> { +let Defs = [FPSCR] in +def VMSR : VFPAI<(outs), (ins GPR:$src), VFPMiscFrm, IIC_fpSTAT, + "vmsr", "\tfpscr, $src", + [(int_arm_set_fpscr GPR:$src)]> { let Inst{27-20} = 0b11101110; let Inst{19-16} = 0b0001; let Inst{11-8} = 0b1010; let Inst{7} = 0; let Inst{4} = 1; } -} -} // neverHasSideEffects // Materialize FP immediates. VFP3 only. 
let isReMaterializable = 1 in { diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index f80e316d23e85..2b7645a42119e 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -57,7 +57,7 @@ STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's"); namespace { struct ARMLoadStoreOpt : public MachineFunctionPass { static char ID; - ARMLoadStoreOpt() : MachineFunctionPass(&ID) {} + ARMLoadStoreOpt() : MachineFunctionPass(ID) {} const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; @@ -193,20 +193,17 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, return false; ARM_AM::AMSubMode Mode = ARM_AM::ia; - bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode); - if (isAM4 && Offset == 4) { - if (isThumb2) - // Thumb2 does not support ldmib / stmib. - return false; + // VFP and Thumb2 do not support IB or DA modes. + bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode); + bool haveIBAndDA = isNotVFP && !isThumb2; + if (Offset == 4 && haveIBAndDA) Mode = ARM_AM::ib; - } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) { - if (isThumb2) - // Thumb2 does not support ldmda / stmda. - return false; + else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) Mode = ARM_AM::da; - } else if (isAM4 && Offset == -4 * (int)NumRegs) { + else if (Offset == -4 * (int)NumRegs && isNotVFP) + // VLDM/VSTM do not support DB mode without also updating the base reg. Mode = ARM_AM::db; - } else if (Offset != 0) { + else if (Offset != 0) { // If starting offset isn't zero, insert an MI to materialize a new base. // But only do so if it is cost effective, i.e. merging more than two // loads / stores. @@ -246,18 +243,12 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, BaseKill = true; // New base is always killed right after its use. } - bool isDPR = (Opcode == ARM::VLDRD || Opcode == ARM::VSTRD); bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD); Opcode = getLoadStoreMultipleOpcode(Opcode); - MachineInstrBuilder MIB = (isAM4) - ? BuildMI(MBB, MBBI, dl, TII->get(Opcode)) - .addReg(Base, getKillRegState(BaseKill)) - .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg) - : BuildMI(MBB, MBBI, dl, TII->get(Opcode)) - .addReg(Base, getKillRegState(BaseKill)) - .addImm(ARM_AM::getAM5Opc(Mode, isDPR ?
NumRegs<<1 : NumRegs)) - .addImm(Pred).addReg(PredReg); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)) + .addReg(Base, getKillRegState(BaseKill)) + .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg); for (unsigned i = 0; i != NumRegs; ++i) MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef) | getKillRegState(Regs[i].second)); @@ -333,6 +324,7 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, if (KilledRegs.count(Reg)) { unsigned j = Killer[Reg]; memOps[j].MBBI->getOperand(0).setIsKill(false); + memOps[j].isKill = false; } } MBB.erase(memOps[i].MBBI); @@ -348,7 +340,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, MemOpQueue &MemOps, SmallVector<MachineBasicBlock::iterator, 4> &Merges) { - bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode); + bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode); int Offset = MemOps[SIndex].Offset; int SOffset = Offset; unsigned insertAfter = SIndex; @@ -366,12 +358,12 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Reg = MO.getReg(); unsigned RegNum = MO.isUndef() ? UINT_MAX : ARMRegisterInfo::getRegisterNumbering(Reg); - // AM4 - register numbers in ascending order. - // AM5 - consecutive register numbers in ascending order. - // Can only do up to 16 double-word registers per insn. + // Register numbers must be in ascending order. For VFP, the registers + // must also be consecutive and there is a limit of 16 double-word + // registers per instruction. if (Reg != ARM::SP && NewOffset == Offset + (int)Size && - ((isAM4 && RegNum > PRegNum) + ((isNotVFP && RegNum > PRegNum) || ((Size < 8 || Count < 16) && RegNum == PRegNum+1))) { Offset += Size; PRegNum = RegNum; @@ -409,7 +401,7 @@ static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base, return false; // Make sure the offset fits in 8 bits. - if (Bytes <= 0 || (Limit && Bytes >= Limit)) + if (Bytes == 0 || (Limit && Bytes >= Limit)) return false; unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME @@ -433,7 +425,7 @@ static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base, MI->getOpcode() != ARM::ADDri) return false; - if (Bytes <= 0 || (Limit && Bytes >= Limit)) + if (Bytes == 0 || (Limit && Bytes >= Limit)) // Make sure the offset fits in 8 bits. return false; @@ -464,12 +456,12 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { case ARM::STM: case ARM::t2LDM: case ARM::t2STM: - return (MI->getNumOperands() - 4) * 4; case ARM::VLDMS: case ARM::VSTMS: + return (MI->getNumOperands() - 4) * 4; case ARM::VLDMD: case ARM::VSTMD: - return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4; + return (MI->getNumOperands() - 4) * 8; } } @@ -512,26 +504,17 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); int Opcode = MI->getOpcode(); DebugLoc dl = MI->getDebugLoc(); - bool isAM4 = (Opcode == ARM::LDM || Opcode == ARM::t2LDM || - Opcode == ARM::STM || Opcode == ARM::t2STM); bool DoMerge = false; ARM_AM::AMSubMode Mode = ARM_AM::ia; - unsigned Offset = 0; - if (isAM4) { - // Can't use an updating ld/st if the base register is also a dest - // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined. 
- for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) { - if (MI->getOperand(i).getReg() == Base) - return false; - } - Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm()); - } else { - // VLDM{D|S}, VSTM{D|S} addressing mode 5 ops. - Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm()); - Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm()); + // Can't use an updating ld/st if the base register is also a dest + // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined. + for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) { + if (MI->getOperand(i).getReg() == Base) + return false; } + Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm()); // Try merging with the previous instruction. MachineBasicBlock::iterator BeginMBBI = MBB.begin(); @@ -539,22 +522,14 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, MachineBasicBlock::iterator PrevMBBI = prior(MBBI); while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) --PrevMBBI; - if (isAM4) { - if (Mode == ARM_AM::ia && - isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - Mode = ARM_AM::db; - } else if (isAM4 && Mode == ARM_AM::ib && - isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - Mode = ARM_AM::da; - } - } else { - if (Mode == ARM_AM::ia && - isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { - Mode = ARM_AM::db; - DoMerge = true; - } + if (Mode == ARM_AM::ia && + isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { + Mode = ARM_AM::db; + DoMerge = true; + } else if (Mode == ARM_AM::ib && + isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { + Mode = ARM_AM::da; + DoMerge = true; } if (DoMerge) MBB.erase(PrevMBBI); @@ -566,19 +541,12 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) ++NextMBBI; - if (isAM4) { - if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && - isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) && - isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - } - } else { - if (Mode == ARM_AM::ia && - isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - } + if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && + isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; + } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) && + isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; } if (DoMerge) { if (NextMBBI == I) { @@ -595,16 +563,9 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) .addReg(Base, getDefRegState(true)) // WB base register - .addReg(Base, getKillRegState(BaseKill)); - if (isAM4) { - // [t2]LDM_UPD, [t2]STM_UPD - MIB.addImm(ARM_AM::getAM4ModeImm(Mode)) - .addImm(Pred).addReg(PredReg); - } else { - // VLDM[SD}_UPD, VSTM[SD]_UPD - MIB.addImm(ARM_AM::getAM5Opc(Mode, Offset)) - .addImm(Pred).addReg(PredReg); - } + .addReg(Base, getKillRegState(BaseKill)) + .addImm(ARM_AM::getAM4ModeImm(Mode)) + .addImm(Pred).addReg(PredReg); // Transfer the rest of operands. 
for (unsigned OpNum = 4, e = MI->getNumOperands(); OpNum != e; ++OpNum) MIB.addOperand(MI->getOperand(OpNum)); @@ -736,11 +697,10 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, if (!DoMerge) return false; - bool isDPR = NewOpc == ARM::VLDMD || NewOpc == ARM::VSTMD; unsigned Offset = 0; if (isAM5) - Offset = ARM_AM::getAM5Opc(AddSub == ARM_AM::sub ? ARM_AM::db : ARM_AM::ia, - (isDPR ? 2 : 1)); + Offset = ARM_AM::getAM4ModeImm(AddSub == ARM_AM::sub ? + ARM_AM::db : ARM_AM::ia); else if (isAM2) Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); else @@ -748,6 +708,9 @@ if (isAM5) { // VLDM[SD]_UPD, VSTM[SD]_UPD + // (There are no base-updating versions of VLDR/VSTR instructions, but the + // updating load/store-multiple instructions can be used with only one + // register.) MachineOperand &MO = MI->getOperand(0); BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) .addReg(Base, getDefRegState(true)) // WB base register @@ -1268,7 +1231,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { namespace { struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{ static char ID; - ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {} + ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {} const TargetData *TD; const TargetInstrInfo *TII; diff --git a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index ab2b06b60783b..ab2b06b60783b 100644 --- a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp diff --git a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h b/lib/Target/ARM/ARMMCInstLower.h index b81a30690ce24..b81a30690ce24 100644 --- a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h +++ b/lib/Target/ARM/ARMMCInstLower.h diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index 7e57a1ca55762..514c26b4daf03 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -43,6 +43,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// processFunctionBeforeCalleeSavedScan(). bool HasStackFrame; + /// RestoreSPFromFP - True if epilogue should restore SP from FP. Set by + /// emitPrologue. + bool RestoreSPFromFP; + /// LRSpilledForFarJump - True if the LR register has been spilled to /// enable a far jump.
bool LRSpilledForFarJump; @@ -95,7 +99,7 @@ public: ARMFunctionInfo() : isThumb(false), hasThumb2(false), - VarArgsRegSaveSize(0), HasStackFrame(false), + VarArgsRegSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false), LRSpilledForFarJump(false), FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), @@ -106,7 +110,7 @@ public: explicit ARMFunctionInfo(MachineFunction &MF) : isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()), hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()), - VarArgsRegSaveSize(0), HasStackFrame(false), + VarArgsRegSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false), LRSpilledForFarJump(false), FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), @@ -125,6 +129,9 @@ public: bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } + bool shouldRestoreSPFromFP() const { return RestoreSPFromFP; } + void setShouldRestoreSPFromFP(bool s) { RestoreSPFromFP = s; } + bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; } void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; } diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index d020f3c74bdea..305b232e6a99a 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -1,4 +1,4 @@ -//===- ARMRegisterInfo.td - ARM Register defs -------------------*- C++ -*-===// +//===- ARMRegisterInfo.td - ARM Register defs --------------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -220,41 +220,11 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, iterator allocation_order_end(const MachineFunction &MF) const; }]; let MethodBodies = [{ - // FP is R11, R9 is available. - static const unsigned ARM_GPR_AO_1[] = { + static const unsigned ARM_GPR_AO[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R12,ARM::LR, ARM::R4, ARM::R5, ARM::R6, ARM::R7, - ARM::R8, ARM::R9, ARM::R10, - ARM::R11 }; - // FP is R11, R9 is not available. - static const unsigned ARM_GPR_AO_2[] = { - ARM::R0, ARM::R1, ARM::R2, ARM::R3, - ARM::R12,ARM::LR, - ARM::R4, ARM::R5, ARM::R6, ARM::R7, - ARM::R8, ARM::R10, - ARM::R11 }; - // FP is R7, R9 is available as non-callee-saved register. - // This is used by Darwin. - static const unsigned ARM_GPR_AO_3[] = { - ARM::R0, ARM::R1, ARM::R2, ARM::R3, - ARM::R9, ARM::R12,ARM::LR, - ARM::R4, ARM::R5, ARM::R6, - ARM::R8, ARM::R10,ARM::R11,ARM::R7 }; - // FP is R7, R9 is not available. - static const unsigned ARM_GPR_AO_4[] = { - ARM::R0, ARM::R1, ARM::R2, ARM::R3, - ARM::R12,ARM::LR, - ARM::R4, ARM::R5, ARM::R6, - ARM::R8, ARM::R10,ARM::R11, - ARM::R7 }; - // FP is R7, R9 is available as callee-saved register. - // This is used by non-Darwin platform in Thumb mode. - static const unsigned ARM_GPR_AO_5[] = { - ARM::R0, ARM::R1, ARM::R2, ARM::R3, - ARM::R12,ARM::LR, - ARM::R4, ARM::R5, ARM::R6, - ARM::R8, ARM::R9, ARM::R10,ARM::R11,ARM::R7 }; + ARM::R8, ARM::R9, ARM::R10, ARM::R11 }; // For Thumb1 mode, we don't want to allocate hi regs at all, as we // don't know how to spill them. 
If we make our prologue/epilogue code @@ -270,85 +240,71 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); if (Subtarget.isThumb1Only()) return THUMB_GPR_AO; - if (Subtarget.isTargetDarwin()) { - if (Subtarget.isR9Reserved()) - return ARM_GPR_AO_4; - else - return ARM_GPR_AO_3; - } else { - if (Subtarget.isR9Reserved()) - return ARM_GPR_AO_2; - else if (Subtarget.isThumb()) - return ARM_GPR_AO_5; - else - return ARM_GPR_AO_1; - } + return ARM_GPR_AO; } GPRClass::iterator GPRClass::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); - GPRClass::iterator I; - - if (Subtarget.isThumb1Only()) { - I = THUMB_GPR_AO + (sizeof(THUMB_GPR_AO)/sizeof(unsigned)); - // Mac OS X requires FP not to be clobbered for backtracing purpose. - return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I; - } - - if (Subtarget.isTargetDarwin()) { - if (Subtarget.isR9Reserved()) - I = ARM_GPR_AO_4 + (sizeof(ARM_GPR_AO_4)/sizeof(unsigned)); - else - I = ARM_GPR_AO_3 + (sizeof(ARM_GPR_AO_3)/sizeof(unsigned)); - } else { - if (Subtarget.isR9Reserved()) - I = ARM_GPR_AO_2 + (sizeof(ARM_GPR_AO_2)/sizeof(unsigned)); - else if (Subtarget.isThumb()) - I = ARM_GPR_AO_5 + (sizeof(ARM_GPR_AO_5)/sizeof(unsigned)); - else - I = ARM_GPR_AO_1 + (sizeof(ARM_GPR_AO_1)/sizeof(unsigned)); - } - - // Mac OS X requires FP not to be clobbered for backtracing purpose. - return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I; + if (Subtarget.isThumb1Only()) + return THUMB_GPR_AO + (sizeof(THUMB_GPR_AO)/sizeof(unsigned)); + return ARM_GPR_AO + (sizeof(ARM_GPR_AO)/sizeof(unsigned)); } }]; } -// Thumb registers are R0-R7 normally. Some instructions can still use -// the general GPR register class above (MOV, e.g.) -def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> { +// Restricted GPR register class. Many Thumb2 instructions allow the full +// register range for operands, but have undefined behaviours when PC +// or SP (R15 or R13) are used. The ARM ARM refers to these operands +// via the BadReg() pseudo-code description. +def rGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, + R7, R8, R9, R10, R11, R12, LR]> { let MethodProtos = [{ iterator allocation_order_begin(const MachineFunction &MF) const; iterator allocation_order_end(const MachineFunction &MF) const; }]; let MethodBodies = [{ - static const unsigned THUMB_tGPR_AO[] = { + static const unsigned ARM_rGPR_AO[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R12,ARM::LR, + ARM::R4, ARM::R5, ARM::R6, ARM::R7, + ARM::R8, ARM::R9, ARM::R10, + ARM::R11 }; + + // For Thumb1 mode, we don't want to allocate hi regs at all, as we + // don't know how to spill them. If we make our prologue/epilogue code + // smarter at some point, we can go back to using the above allocation + // orders for the Thumb1 instructions that know how to use hi regs. + static const unsigned THUMB_rGPR_AO[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R4, ARM::R5, ARM::R6, ARM::R7 }; - // FP is R7, only low registers available.
- tGPRClass::iterator - tGPRClass::allocation_order_begin(const MachineFunction &MF) const { - return THUMB_tGPR_AO; + rGPRClass::iterator + rGPRClass::allocation_order_begin(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); + if (Subtarget.isThumb1Only()) + return THUMB_rGPR_AO; + return ARM_rGPR_AO; } - tGPRClass::iterator - tGPRClass::allocation_order_end(const MachineFunction &MF) const { + rGPRClass::iterator + rGPRClass::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); - tGPRClass::iterator I = - THUMB_tGPR_AO + (sizeof(THUMB_tGPR_AO)/sizeof(unsigned)); - // Mac OS X requires FP not to be clobbered for backtracing purpose. - return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I; + + if (Subtarget.isThumb1Only()) + return THUMB_rGPR_AO + (sizeof(THUMB_rGPR_AO)/sizeof(unsigned)); + return ARM_rGPR_AO + (sizeof(ARM_rGPR_AO)/sizeof(unsigned)); } }]; } +// Thumb registers are R0-R7 normally. Some instructions can still use +// the general GPR register class above (MOV, e.g.) +def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {} + // For tail calls, we can't use callee-saved registers, as they are restored // to the saved value before the tail call, which would clobber a call address. // Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of @@ -381,36 +337,20 @@ def tcGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R9, R12]> { const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); if (Subtarget.isThumb1Only()) return THUMB_GPR_AO_TC; - if (Subtarget.isTargetDarwin()) { - if (Subtarget.isR9Reserved()) - return ARM_GPR_NOR9_TC; - else - return ARM_GPR_R9_TC; - } else - // R9 is either callee-saved or reserved; can't use it. - return ARM_GPR_NOR9_TC; + return Subtarget.isTargetDarwin() ? ARM_GPR_R9_TC : ARM_GPR_NOR9_TC; } tcGPRClass::iterator tcGPRClass::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); - GPRClass::iterator I; - - if (Subtarget.isThumb1Only()) { - I = THUMB_GPR_AO_TC + (sizeof(THUMB_GPR_AO_TC)/sizeof(unsigned)); - return I; - } - - if (Subtarget.isTargetDarwin()) { - if (Subtarget.isR9Reserved()) - I = ARM_GPR_NOR9_TC + (sizeof(ARM_GPR_NOR9_TC)/sizeof(unsigned)); - else - I = ARM_GPR_R9_TC + (sizeof(ARM_GPR_R9_TC)/sizeof(unsigned)); - } else - // R9 is either callee-saved or reserved; can't use it. - I = ARM_GPR_NOR9_TC + (sizeof(ARM_GPR_NOR9_TC)/sizeof(unsigned)); - return I; + + if (Subtarget.isThumb1Only()) + return THUMB_GPR_AO_TC + (sizeof(THUMB_GPR_AO_TC)/sizeof(unsigned)); + + return Subtarget.isTargetDarwin() ? 
+ ARM_GPR_R9_TC + (sizeof(ARM_GPR_R9_TC)/sizeof(unsigned)) : + ARM_GPR_NOR9_TC + (sizeof(ARM_GPR_NOR9_TC)/sizeof(unsigned)); } }]; } diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 10fd257055fb5..cb539f4c01ec8 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -33,14 +33,19 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, , ARMFPUType(None) , UseNEONForSinglePrecisionFP(false) , SlowVMLx(false) + , SlowFPBrcc(false) , IsThumb(isT) , ThumbMode(Thumb1) + , NoARM(false) , PostRAScheduler(false) , IsR9Reserved(ReserveR9) , UseMovt(UseMOVT) , HasFP16(false) , HasHardwareDivide(false) , HasT2ExtractPack(false) + , HasDataBarrier(false) + , Pref32BitThumb(false) + , FPOnlySP(false) , stackAlignment(4) , CPUString("generic") , TargetType(isELF) // Default to ELF unless otherwise specified. diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index e7d92ede9b984..67e58038ee779 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -26,7 +26,7 @@ class GlobalValue; class ARMSubtarget : public TargetSubtarget { protected: enum ARMArchEnum { - V4, V4T, V5T, V5TE, V6, V6T2, V7A, V7M + V4, V4T, V5T, V5TE, V6, V6M, V6T2, V7A, V7M }; enum ARMFPEnum { @@ -63,6 +63,9 @@ protected: /// ThumbMode - Indicates supported Thumb version. ThumbTypeEnum ThumbMode; + /// NoARM - True if subtarget does not support ARM mode execution. + bool NoARM; + /// PostRAScheduler - True if using post-register-allocation scheduler. bool PostRAScheduler; @@ -84,6 +87,18 @@ protected: /// instructions. bool HasT2ExtractPack; + /// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier + /// instructions. + bool HasDataBarrier; + + /// Pref32BitThumb - If true, codegen would prefer 32-bit Thumb instructions + /// over 16-bit ones. + bool Pref32BitThumb; + + /// FPOnlySP - If true, the floating point unit only supports single + /// precision. + bool FPOnlySP; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -128,6 +143,8 @@ protected: bool hasV6T2Ops() const { return ARMArchVersion >= V6T2; } bool hasV7Ops() const { return ARMArchVersion >= V7A; } + bool hasARMOps() const { return !NoARM; } + bool hasVFP2() const { return ARMFPUType >= VFPv2; } bool hasVFP3() const { return ARMFPUType >= VFPv3; } bool hasNEON() const { return ARMFPUType >= NEON; } @@ -135,8 +152,11 @@ protected: return hasNEON() && UseNEONForSinglePrecisionFP; } bool hasDivide() const { return HasHardwareDivide; } bool hasT2ExtractPack() const { return HasT2ExtractPack; } + bool hasDataBarrier() const { return HasDataBarrier; } bool useVMLx() const {return hasVFP2() && !SlowVMLx; } bool isFPBrccSlow() const { return SlowFPBrcc; } + bool isFPOnlySP() const { return FPOnlySP; } + bool prefers32BitThumb() const { return Pref32BitThumb; } bool hasFP16() const { return HasFP16; } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 09203f9304df0..30ff8276cdaac 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -31,7 +31,6 @@ static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { } } - extern "C" void LLVMInitializeARMTarget() { // Register the target. 
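// The two lines here are the standard target-registration idiom:
// LLVMInitialize<Name>Target is the entry point the target registry calls,
// and RegisterTargetMachine binds a Target descriptor to a TargetMachine
// subclass. A minimal sketch for a hypothetical out-of-tree target "Foo"
// (names illustrative, not from this commit):
//
//   extern "C" void LLVMInitializeFooTarget() {
//     RegisterTargetMachine<FooTargetMachine> X(TheFooTarget);
//   }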
RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget); @@ -66,6 +65,9 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT, "v128:64:128-v64:64:64-n32")), TLInfo(*this), TSInfo(*this) { + if (!Subtarget.hasARMOps()) + report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " + "support ARM mode execution!"); } ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT, @@ -85,9 +87,15 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT, TSInfo(*this) { } +// Pass Pipeline Configuration +bool ARMBaseTargetMachine::addPreISel(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + if (OptLevel != CodeGenOpt::None) + PM.add(createARMGlobalMergePass(getTargetLowering())); + return false; +} -// Pass Pipeline Configuration bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { PM.add(createARMISelDag(*this, OptLevel)); @@ -132,7 +140,7 @@ bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM, bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { - if (Subtarget.isThumb2()) + if (Subtarget.isThumb2() && !Subtarget.prefers32BitThumb()) PM.add(createThumb2SizeReductionPass()); PM.add(createARMConstantIslandPass()); diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index a222e57b13ff3..17e5425a9d370 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -50,6 +50,7 @@ public: } // Pass Pipeline Configuration + virtual bool addPreISel(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPreSched2(PassManagerBase &PM, CodeGenOpt::Level OptLevel); diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 4b083244b2413..75e2a739bf1f8 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "ARM.h" +#include "ARMSubtarget.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" @@ -18,8 +19,10 @@ #include "llvm/Target/TargetAsmParser.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" using namespace llvm; @@ -37,6 +40,7 @@ enum ShiftType { class ARMAsmParser : public TargetAsmParser { MCAsmParser &Parser; + TargetMachine &TM; private: MCAsmParser &getParser() const { return Parser; } @@ -76,26 +80,33 @@ private: bool ParseDirectiveSyntax(SMLoc L); - // TODO - For now hacked versions of the next two are in here in this file to - // allow some parser testing until the table gen versions are implemented. + bool MatchInstruction(SMLoc IDLoc, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCInst &Inst) { + if (!MatchInstructionImpl(Operands, Inst)) + return false; + + // FIXME: We should give nicer diagnostics about the exact failure. 
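// MatchInstructionImpl is the tblgen-generated matcher (emitted into
// ARMGenAsmMatcher.inc, which this file now includes at the bottom); like
// this wrapper it returns false on a successful match. A hedged sketch of
// the call, using the generated signature declared further down:
//
//   MCInst Inst;
//   if (!MatchInstructionImpl(Operands, Inst)) {
//     // Matched: Inst now carries the opcode and operands.
//   }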
+ Error(IDLoc, "unrecognized instruction"); + + return true; + } /// @name Auto-generated Match Functions /// { - bool MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCInst &Inst); - /// MatchRegisterName - Match the given string to a register name and return - /// its register number, or -1 if there is no match. To allow return values - /// to be used directly in register lists, arm registers have values between - /// 0 and 15. - int MatchRegisterName(StringRef Name); + unsigned ComputeAvailableFeatures(const ARMSubtarget *Subtarget) const; + + bool MatchInstructionImpl(const SmallVectorImpl<MCParsedAsmOperand*> + &Operands, + MCInst &Inst); /// } public: - ARMAsmParser(const Target &T, MCAsmParser &_Parser) - : TargetAsmParser(T), Parser(_Parser) {} + ARMAsmParser(const Target &T, MCAsmParser &_Parser, TargetMachine &_TM) + : TargetAsmParser(T), Parser(_Parser), TM(_TM) {} virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands); @@ -110,16 +121,21 @@ private: ARMOperand() {} public: enum KindTy { - Token, - Register, + CondCode, Immediate, - Memory + Memory, + Register, + Token } Kind; SMLoc StartLoc, EndLoc; union { struct { + ARMCC::CondCodes Val; + } CC; + + struct { const char *Data; unsigned Length; } Tok; @@ -151,16 +167,19 @@ public: }; - ARMOperand(KindTy K, SMLoc S, SMLoc E) - : Kind(K), StartLoc(S), EndLoc(E) {} + //ARMOperand(KindTy K, SMLoc S, SMLoc E) + // : Kind(K), StartLoc(S), EndLoc(E) {} ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() { Kind = o.Kind; StartLoc = o.StartLoc; EndLoc = o.EndLoc; switch (Kind) { + case CondCode: + CC = o.CC; + break; case Token: - Tok = o.Tok; + Tok = o.Tok; break; case Register: Reg = o.Reg; @@ -179,6 +198,11 @@ public: /// getEndLoc - Get the location of the last token of this operand. SMLoc getEndLoc() const { return EndLoc; } + ARMCC::CondCodes getCondCode() const { + assert(Kind == CondCode && "Invalid access!"); + return CC.Val; + } + StringRef getToken() const { assert(Kind == Token && "Invalid access!"); return StringRef(Tok.Data, Tok.Length); @@ -194,15 +218,50 @@ public: return Imm.Val; } - bool isToken() const {return Kind == Token; } + bool isCondCode() const { return Kind == CondCode; } + + bool isImm() const { return Kind == Immediate; } bool isReg() const { return Kind == Register; } + bool isToken() const {return Kind == Token; } + + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(Expr)); + } + + void addCondCodeOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(unsigned(getCondCode()))); + // FIXME: What belongs here? 
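// The reg0 operand below appears to be the second half of a predicate
// operand pair: a condition-code immediate followed by a flags-register
// operand, with register 0 meaning "CPSR not read". A hedged sketch of the
// flag-reading form (assuming ARM::CPSR names the status register, as
// elsewhere in the tree):
//
//   Inst.addOperand(MCOperand::CreateImm(unsigned(ARMCC::NE))); // cond code
//   Inst.addOperand(MCOperand::CreateReg(ARM::CPSR));           // reads flags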
+ Inst.addOperand(MCOperand::CreateReg(0)); + } + void addRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::CreateReg(getReg())); } + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + virtual void dump(raw_ostream &OS) const; + + static void CreateCondCode(OwningPtr<ARMOperand> &Op, ARMCC::CondCodes CC, + SMLoc S) { + Op.reset(new ARMOperand); + Op->Kind = CondCode; + Op->CC.Val = CC; + Op->StartLoc = S; + Op->EndLoc = S; + } + static void CreateToken(OwningPtr<ARMOperand> &Op, StringRef Str, SMLoc S) { Op.reset(new ARMOperand); @@ -262,6 +321,33 @@ public: } // end anonymous namespace. +void ARMOperand::dump(raw_ostream &OS) const { + switch (Kind) { + case CondCode: + OS << ARMCondCodeToString(getCondCode()); + break; + case Immediate: + getImm()->print(OS); + break; + case Memory: + OS << "<memory>"; + break; + case Register: + OS << "<register " << getReg() << ">"; + break; + case Token: + OS << "'" << getToken() << "'"; + break; + } +} + +/// @name Auto-generated Match Functions +/// { + +static unsigned MatchRegisterName(StringRef Name); + +/// } + /// Try to parse a register name. The token must be an Identifier when called, /// and if it is a register name a Reg operand is created, the token is eaten /// and false is returned. Else true is returned and no token is eaten. @@ -548,77 +634,6 @@ bool ARMAsmParser::ParseShift(ShiftType &St, return false; } -/// A hack to allow some testing, to be replaced by a real table gen version. -int ARMAsmParser::MatchRegisterName(StringRef Name) { - if (Name == "r0" || Name == "R0") - return 0; - else if (Name == "r1" || Name == "R1") - return 1; - else if (Name == "r2" || Name == "R2") - return 2; - else if (Name == "r3" || Name == "R3") - return 3; - else if (Name == "r3" || Name == "R3") - return 3; - else if (Name == "r4" || Name == "R4") - return 4; - else if (Name == "r5" || Name == "R5") - return 5; - else if (Name == "r6" || Name == "R6") - return 6; - else if (Name == "r7" || Name == "R7") - return 7; - else if (Name == "r8" || Name == "R8") - return 8; - else if (Name == "r9" || Name == "R9") - return 9; - else if (Name == "r10" || Name == "R10") - return 10; - else if (Name == "r11" || Name == "R11" || Name == "fp") - return 11; - else if (Name == "r12" || Name == "R12" || Name == "ip") - return 12; - else if (Name == "r13" || Name == "R13" || Name == "sp") - return 13; - else if (Name == "r14" || Name == "R14" || Name == "lr") - return 14; - else if (Name == "r15" || Name == "R15" || Name == "pc") - return 15; - return -1; -} - -/// A hack to allow some testing, to be replaced by a real table gen version. -bool ARMAsmParser:: -MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCInst &Inst) { - ARMOperand &Op0 = *(ARMOperand*)Operands[0]; - assert(Op0.Kind == ARMOperand::Token && "First operand not a Token"); - StringRef Mnemonic = Op0.getToken(); - if (Mnemonic == "add" || - Mnemonic == "stmfd" || - Mnemonic == "str" || - Mnemonic == "ldmfd" || - Mnemonic == "ldr" || - Mnemonic == "mov" || - Mnemonic == "sub" || - Mnemonic == "bl" || - Mnemonic == "push" || - Mnemonic == "blx" || - Mnemonic == "pop") { - // Hard-coded to a valid instruction, till we have a real matcher. 
- Inst = MCInst(); - Inst.setOpcode(ARM::MOVr); - Inst.addOperand(MCOperand::CreateReg(2)); - Inst.addOperand(MCOperand::CreateReg(2)); - Inst.addOperand(MCOperand::CreateImm(0)); - Inst.addOperand(MCOperand::CreateImm(0)); - Inst.addOperand(MCOperand::CreateReg(0)); - return false; - } - - return true; -} - /// Parse an ARM instruction operand. For now this parses the operand regardless /// of the mnemonic. bool ARMAsmParser::ParseOperand(OwningPtr<ARMOperand> &Op) { @@ -661,12 +676,56 @@ bool ARMAsmParser::ParseOperand(OwningPtr<ARMOperand> &Op) { bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands) { OwningPtr<ARMOperand> Op; - ARMOperand::CreateToken(Op, Name, NameLoc); - + + // Create the leading tokens for the mnemonic, split by '.' characters. + size_t Start = 0, Next = Name.find('.'); + StringRef Head = Name.slice(Start, Next); + + // Determine the predicate, if any. + // + // FIXME: We need a way to check whether a prefix supports predication, + // otherwise we will end up with an ambiguity for instructions that happen to + // end with a predicate name. + unsigned CC = StringSwitch<unsigned>(Head.substr(Head.size()-2)) + .Case("eq", ARMCC::EQ) + .Case("ne", ARMCC::NE) + .Case("hs", ARMCC::HS) + .Case("lo", ARMCC::LO) + .Case("mi", ARMCC::MI) + .Case("pl", ARMCC::PL) + .Case("vs", ARMCC::VS) + .Case("vc", ARMCC::VC) + .Case("hi", ARMCC::HI) + .Case("ls", ARMCC::LS) + .Case("ge", ARMCC::GE) + .Case("lt", ARMCC::LT) + .Case("gt", ARMCC::GT) + .Case("le", ARMCC::LE) + .Case("al", ARMCC::AL) + .Default(~0U); + if (CC != ~0U) { + Head = Head.slice(0, Head.size() - 2); + } else + CC = ARMCC::AL; + + ARMOperand::CreateToken(Op, Head, NameLoc); Operands.push_back(Op.take()); - if (getLexer().isNot(AsmToken::EndOfStatement)) { + ARMOperand::CreateCondCode(Op, ARMCC::CondCodes(CC), NameLoc); + Operands.push_back(Op.take()); + + // Add the remaining tokens in the mnemonic. + while (Next != StringRef::npos) { + Start = Next; + Next = Name.find('.', Start + 1); + Head = Name.slice(Start, Next); + ARMOperand::CreateToken(Op, Head, NameLoc); + Operands.push_back(Op.take()); + } + + // Read the remaining operands. + if (getLexer().isNot(AsmToken::EndOfStatement)) { // Read the first operand.
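// A worked example of the suffix handling above (a sketch, not commit code):
// for Name == "moveq", Head.substr(Head.size()-2) is "eq", the StringSwitch
// maps it to ARMCC::EQ, and Head is trimmed to the bare mnemonic:
//
//   StringRef Head = "moveq";
//   unsigned CC = StringSwitch<unsigned>(Head.substr(Head.size()-2))
//                     .Case("eq", ARMCC::EQ).Default(~0U); // -> ARMCC::EQ
//   Head = Head.slice(0, Head.size() - 2);                 // -> "mov"
//
// This is also where the FIXME above bites: "teq" ends in "eq" as well, so
// the parser still needs per-mnemonic knowledge of which suffixes are
// really predicates.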
OwningPtr<ARMOperand> Op; if (ParseOperand(Op)) return true; @@ -809,3 +868,5 @@ extern "C" void LLVMInitializeARMAsmParser() { RegisterAsmParser<ARMAsmParser> Y(TheThumbTarget); LLVMInitializeARMAsmLexer(); } + +#include "ARMGenAsmMatcher.inc" diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp index edc934549b288..8026e7718ca98 100644 --- a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp @@ -158,7 +158,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) { if ((MI->getOpcode() == ARM::VSTMS_UPD || MI->getOpcode() ==ARM::VSTMD_UPD) && MI->getOperand(0).getReg() == ARM::SP) { const MCOperand &MO1 = MI->getOperand(2); - if (ARM_AM::getAM5SubMode(MO1.getImm()) == ARM_AM::db) { + if (ARM_AM::getAM4SubMode(MO1.getImm()) == ARM_AM::db) { O << '\t' << "vpush"; printPredicateOperand(MI, 3, O); O << '\t'; @@ -171,7 +171,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) { if ((MI->getOpcode() == ARM::VLDMS_UPD || MI->getOpcode() ==ARM::VLDMD_UPD) && MI->getOperand(0).getReg() == ARM::SP) { const MCOperand &MO1 = MI->getOperand(2); - if (ARM_AM::getAM5SubMode(MO1.getImm()) == ARM_AM::ia) { + if (ARM_AM::getAM4SubMode(MO1.getImm()) == ARM_AM::ia) { O << '\t' << "vpop"; printPredicateOperand(MI, 3, O); O << '\t'; @@ -278,15 +278,13 @@ void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum, O << getRegisterName(MO1.getReg()); // Print the shift opc. - O << ", " - << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm())) - << ' '; - + ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm()); + O << ", " << ARM_AM::getShiftOpcStr(ShOpc); if (MO2.getReg()) { - O << getRegisterName(MO2.getReg()); + O << ' ' << getRegisterName(MO2.getReg()); assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0); - } else { - O << "#" << ARM_AM::getSORegOffset(MO3.getImm()); + } else if (ShOpc != ARM_AM::rrx) { + O << " #" << ARM_AM::getSORegOffset(MO3.getImm()); } } @@ -414,16 +412,6 @@ void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum, return; } - if (Modifier && strcmp(Modifier, "submode") == 0) { - ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MO2.getImm()); - O << ARM_AM::getAMSubModeStr(Mode); - return; - } else if (Modifier && strcmp(Modifier, "base") == 0) { - // Used for FSTM{D|S} and LSTM{D|S} operations. 
- O << getRegisterName(MO1.getReg()); - return; - } - O << "[" << getRegisterName(MO1.getReg()); if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) { @@ -463,9 +451,9 @@ void ARMInstPrinter::printAddrModePCOperand(const MCInst *MI, unsigned OpNum, assert(0 && "FIXME: Implement printAddrModePCOperand"); } -void ARMInstPrinter::printBitfieldInvMaskImmOperand (const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { +void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { const MCOperand &MO = MI->getOperand(OpNum); uint32_t v = ~MO.getImm(); int32_t lsb = CountTrailingZeros_32(v); @@ -474,6 +462,31 @@ void ARMInstPrinter::printBitfieldInvMaskImmOperand (const MCInst *MI, O << '#' << lsb << ", #" << width; } +void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned val = MI->getOperand(OpNum).getImm(); + O << ARM_MB::MemBOptToString(val); +} + +void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned ShiftOp = MI->getOperand(OpNum).getImm(); + ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp); + switch (Opc) { + case ARM_AM::no_shift: + return; + case ARM_AM::lsl: + O << ", lsl #"; + break; + case ARM_AM::asr: + O << ", asr #"; + break; + default: + assert(0 && "unexpected shift opcode for shift immediate operand"); + } + O << ARM_AM::getSORegOffset(ShiftOp); +} + void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum, raw_ostream &O) { O << "{"; @@ -669,12 +682,11 @@ void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum, O << getRegisterName(Reg); // Print the shift opc. - O << ", " - << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm())) - << " "; - assert(MO2.isImm() && "Not a valid t2_so_reg value!"); - O << "#" << ARM_AM::getSORegOffset(MO2.getImm()); + ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm()); + O << ", " << ARM_AM::getShiftOpcStr(ShOpc); + if (ShOpc != ARM_AM::rrx) + O << " #" << ARM_AM::getSORegOffset(MO2.getImm()); } void ARMInstPrinter::printT2AddrModeImm12Operand(const MCInst *MI, diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h index ddf5047793d29..e5ad0d07e9bab 100644 --- a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h +++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h @@ -57,6 +57,8 @@ public: void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMemBOption(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printShiftImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printThumbITMask(const MCInst *MI, unsigned OpNum, raw_ostream &O); diff --git a/lib/Target/ARM/AsmPrinter/CMakeLists.txt b/lib/Target/ARM/AsmPrinter/CMakeLists.txt index 4e299f86ecb67..18645c0864a32 100644 --- a/lib/Target/ARM/AsmPrinter/CMakeLists.txt +++ b/lib/Target/ARM/AsmPrinter/CMakeLists.txt @@ -1,8 +1,6 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. 
) add_llvm_library(LLVMARMAsmPrinter - ARMAsmPrinter.cpp ARMInstPrinter.cpp - ARMMCInstLower.cpp ) add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen) diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 0df34666b959b..6b4dee5965d25 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -7,25 +7,32 @@ tablegen(ARMGenInstrNames.inc -gen-instr-enums) tablegen(ARMGenInstrInfo.inc -gen-instr-desc) tablegen(ARMGenCodeEmitter.inc -gen-emitter) tablegen(ARMGenAsmWriter.inc -gen-asm-writer) +tablegen(ARMGenAsmMatcher.inc -gen-asm-matcher) tablegen(ARMGenDAGISel.inc -gen-dag-isel) +tablegen(ARMGenFastISel.inc -gen-fast-isel) tablegen(ARMGenCallingConv.inc -gen-callingconv) tablegen(ARMGenSubtarget.inc -gen-subtarget) tablegen(ARMGenEDInfo.inc -gen-enhanced-disassembly-info) add_llvm_target(ARMCodeGen + ARMAsmPrinter.cpp ARMBaseInstrInfo.cpp ARMBaseRegisterInfo.cpp ARMCodeEmitter.cpp ARMConstantIslandPass.cpp ARMConstantPoolValue.cpp ARMExpandPseudoInsts.cpp + ARMFastISel.cpp + ARMGlobalMerge.cpp ARMISelDAGToDAG.cpp ARMISelLowering.cpp ARMInstrInfo.cpp ARMJITInfo.cpp ARMLoadStoreOptimizer.cpp ARMMCAsmInfo.cpp + ARMMCInstLower.cpp ARMRegisterInfo.cpp + ARMSelectionDAGInfo.cpp ARMSubtarget.cpp ARMTargetMachine.cpp ARMTargetObjectFile.cpp @@ -38,7 +45,6 @@ add_llvm_target(ARMCodeGen Thumb2InstrInfo.cpp Thumb2RegisterInfo.cpp Thumb2SizeReduction.cpp - ARMSelectionDAGInfo.cpp ) -target_link_libraries (LLVMARMCodeGen LLVMSelectionDAG) +target_link_libraries (LLVMARMCodeGen LLVMARMAsmPrinter LLVMSelectionDAG) diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 4de697e8bf676..e22028985b46c 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -26,6 +26,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +//#define DEBUG(X) do { X; } while (0) + /// ARMGenDecoderTables.inc - ARMDecoderTables.inc is tblgen'ed from /// ARMDecoderEmitter.cpp TableGen backend. It contains: /// @@ -87,6 +89,11 @@ static unsigned decodeARMInstruction(uint32_t &insn) { return ARM::BFI; } + // Ditto for STRBT, which is a super-instruction for A8.6.199 Encoding A1 & A2. + // As a result, the decoder fails to decode USAT properly. + if (slice(insn, 27, 21) == 0x37 && slice(insn, 5, 4) == 1) + return ARM::USAT; + // Ditto for ADDSrs, which is a super-instruction for A8.6.7 & A8.6.8. // As a result, the decoder fails to decode UMULL properly. if (slice(insn, 27, 21) == 0x04 && slice(insn, 7, 4) == 9) { @@ -106,7 +113,7 @@ static unsigned decodeARMInstruction(uint32_t &insn) { // Ditto for STRT, which is a super-instruction for A8.6.210 Encoding A1 & A2. // As a result, the decoder fails to decode SSAT properly. if (slice(insn, 27, 21) == 0x35 && slice(insn, 5, 4) == 1) - return slice(insn, 6, 6) == 0 ? ARM::SSATlsl : ARM::SSATasr; + return ARM::SSAT; // Ditto for RSCrs, which is a super-instruction for A8.6.146 & A8.6.147. // As a result, the decoder fails to decode STRHT/LDRHT/LDRSHT/LDRSBT. @@ -291,7 +298,7 @@ static unsigned T2Morph2LoadLiteral(unsigned Opcode) { /// decodeInstruction(insn) is invoked on the original insn. /// /// Otherwise, decodeThumbInstruction is called with the original insn.
-static unsigned decodeThumbSideEffect(bool IsThumb2, uint32_t &insn) { +static unsigned decodeThumbSideEffect(bool IsThumb2, unsigned &insn) { if (IsThumb2) { uint16_t op1 = slice(insn, 28, 27); uint16_t op2 = slice(insn, 26, 20); @@ -429,7 +436,7 @@ bool ThumbDisassembler::getInstruction(MCInst &MI, // passed to decodeThumbInstruction(). For 16-bit Thumb instruction, the top // halfword of insn is 0x00 0x00; otherwise, the first halfword is moved to // the top half followed by the second halfword. - uint32_t insn = 0; + unsigned insn = 0; // Possible second halfword. uint16_t insn1 = 0; diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp index a07ff2832aa7e..9f493b9aee02a 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp @@ -20,6 +20,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +//#define DEBUG(X) do { X; } while (0) + /// ARMGenInstrInfo.inc - ARMGenInstrInfo.inc contains the static const /// TargetInstrDesc ARMInsts[] definition and the TargetOperandInfo[]'s /// describing the operand info for each ARMInsts[i]. @@ -93,6 +95,9 @@ static unsigned getRegisterEnum(BO B, unsigned RegClassID, unsigned RawRegister, RegClassID = ARM::DPRRegClassID; } + // For this purpose, we can treat rGPR as if it were GPR. + if (RegClassID == ARM::rGPRRegClassID) RegClassID = ARM::GPRRegClassID; + // See also decodeNEONRd(), decodeNEONRn(), decodeNEONRm(). unsigned RegNum = RegClassID == ARM::QPRRegClassID ? RawRegister >> 1 : RawRegister; @@ -451,12 +456,23 @@ static inline ARM_AM::ShiftOpc getShiftOpcForBits(unsigned bits) { // // A8-11: DecodeImmShift() static inline void getImmShiftSE(ARM_AM::ShiftOpc &ShOp, unsigned &ShImm) { - // If type == 0b11 and imm5 == 0, we have an rrx, instead. - if (ShOp == ARM_AM::ror && ShImm == 0) - ShOp = ARM_AM::rrx; - // If (lsr or asr) and imm5 == 0, shift amount is 32. 
- if ((ShOp == ARM_AM::lsr || ShOp == ARM_AM::asr) && ShImm == 0) + if (ShImm != 0) + return; + switch (ShOp) { + case ARM_AM::no_shift: + case ARM_AM::rrx: + break; + case ARM_AM::lsl: + ShOp = ARM_AM::no_shift; + break; + case ARM_AM::lsr: + case ARM_AM::asr: ShImm = 32; + break; + case ARM_AM::ror: + ShOp = ARM_AM::rrx; + break; + } } // getAMSubModeForBits - getAMSubModeForBits translates from the ARM encoding @@ -490,9 +506,6 @@ static inline ARM_AM::AMSubMode getAMSubModeForBits(unsigned bits) { static bool DisassemblePseudo(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO) { - if (Opcode == ARM::Int_MemBarrierV7 || Opcode == ARM::Int_SyncBarrierV7) - return true; - assert(0 && "Unexpected pseudo instruction!"); return false; } @@ -887,7 +900,6 @@ static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } - assert(0 && "Unexpected BrMiscFrm Opcode"); return false; } @@ -906,34 +918,6 @@ static inline bool getBFCInvMask(uint32_t insn, uint32_t &mask) { return true; } -static inline bool SaturateOpcode(unsigned Opcode) { - switch (Opcode) { - case ARM::SSATlsl: case ARM::SSATasr: case ARM::SSAT16: - case ARM::USATlsl: case ARM::USATasr: case ARM::USAT16: - return true; - default: - return false; - } -} - -static inline unsigned decodeSaturatePos(unsigned Opcode, uint32_t insn) { - switch (Opcode) { - case ARM::SSATlsl: - case ARM::SSATasr: - return slice(insn, 20, 16) + 1; - case ARM::SSAT16: - return slice(insn, 19, 16) + 1; - case ARM::USATlsl: - case ARM::USATasr: - return slice(insn, 20, 16); - case ARM::USAT16: - return slice(insn, 19, 16); - default: - assert(0 && "Invalid opcode passed in"); - return 0; - } -} - // A major complication is the fact that some of the saturating add/subtract // operations have Rd Rm Rn, instead of the "normal" Rd Rn Rm. // They are QADD, QDADD, QDSUB, and QSUB. @@ -959,40 +943,14 @@ static bool DisassembleDPFrm(MCInst &MI, unsigned Opcode, uint32_t insn, if (OpIdx >= NumOps) return false; - // SSAT/SSAT16/USAT/USAT16 has imm operand after Rd. - if (SaturateOpcode(Opcode)) { - MI.addOperand(MCOperand::CreateImm(decodeSaturatePos(Opcode, insn))); - - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRm(insn)))); - - if (Opcode == ARM::SSAT16 || Opcode == ARM::USAT16) { - OpIdx += 2; - return true; - } - - // For SSAT operand reg (Rm) has been disassembled above. - // Now disassemble the shift amount. - - // Inst{11-7} encodes the imm5 shift amount. - unsigned ShAmt = slice(insn, 11, 7); - - // A8.6.183. Possible ASR shift amount of 32... - if (Opcode == ARM::SSATasr && ShAmt == 0) - ShAmt = 32; - - MI.addOperand(MCOperand::CreateImm(ShAmt)); - - OpIdx += 3; - return true; - } - // Special-case handling of BFC/BFI/SBFX/UBFX. if (Opcode == ARM::BFC || Opcode == ARM::BFI) { - // TIED_TO operand skipped for BFC and Inst{3-0} (Reg) for BFI. - MI.addOperand(MCOperand::CreateReg(Opcode == ARM::BFC ? 0 - : getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(0)); + if (Opcode == ARM::BFI) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); + ++OpIdx; + } uint32_t mask = 0; if (!getBFCInvMask(insn, mask)) return false; @@ -1498,13 +1456,55 @@ static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Extract the 5-bit immediate field Inst{11-7}. 
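// Equivalently, slice(insn, 11, 7): per the comment above, ARMII::ShiftShift
// must be 7, so the next line computes (insn >> 7) & 0x1F. Sketch:
//
//   unsigned ShiftAmt = slice(insn, 11, 7); // same five bits, Inst{11-7}
//
// getImmShiftSE() then normalizes the (opcode, imm5) pair per the A8-11
// DecodeImmShift rules visible above: lsl #0 becomes no_shift, lsr/asr #0
// becomes #32, and ror #0 becomes rrx.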
unsigned ShiftAmt = (insn >> ARMII::ShiftShift) & 0x1F; - MI.addOperand(MCOperand::CreateImm(ShiftAmt)); + ARM_AM::ShiftOpc Opc = ARM_AM::no_shift; + if (Opcode == ARM::PKHBT) + Opc = ARM_AM::lsl; + else if (Opcode == ARM::PKHTB) + Opc = ARM_AM::asr; + getImmShiftSE(Opc, ShiftAmt); + MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(Opc, ShiftAmt))); ++OpIdx; } return true; } +/// DisassembleSatFrm - Disassemble saturate instructions: +/// SSAT, SSAT16, USAT, and USAT16. +static bool DisassembleSatFrm(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + + const TargetInstrDesc &TID = ARMInsts[Opcode]; + NumOpsAdded = TID.getNumOperands() - 2; // ignore predicate operands + + // Disassemble register def. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + + unsigned Pos = slice(insn, 20, 16); + if (Opcode == ARM::SSAT || Opcode == ARM::SSAT16) + Pos += 1; + MI.addOperand(MCOperand::CreateImm(Pos)); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRm(insn)))); + + if (NumOpsAdded == 4) { + ARM_AM::ShiftOpc Opc = (slice(insn, 6, 6) != 0 ? ARM_AM::asr : ARM_AM::lsl); + // Inst{11-7} encodes the imm5 shift amount. + unsigned ShAmt = slice(insn, 11, 7); + if (ShAmt == 0) { + // A8.6.183. Possible ASR shift amount of 32... + if (Opc == ARM_AM::asr) + ShAmt = 32; + else + Opc = ARM_AM::no_shift; + } + MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(Opc, ShAmt))); + } + return true; +} + // Extend instructions. // SXT* and UXT*: Rd [Rn] Rm [rot_imm]. // The 2nd operand register is Rn and the 3rd operand register is Rm for the @@ -1863,7 +1863,7 @@ static bool DisassembleVFPLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, assert(NumOps >= 3 && "VFPLdStFrm expects NumOps >= 3"); - bool isSPVFP = (Opcode == ARM::VLDRS || Opcode == ARM::VSTRS) ? true : false; + bool isSPVFP = (Opcode == ARM::VLDRS || Opcode == ARM::VSTRS); unsigned RegClassID = isSPVFP ? ARM::SPRRegClassID : ARM::DPRRegClassID; // Extract Dd/Sd for operand 0. @@ -1886,7 +1886,7 @@ static bool DisassembleVFPLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // VFP Load/Store Multiple Instructions. // This is similar to the algorithm for LDM/STM in that operand 0 (the base) and -// operand 1 (the AM5 mode imm) is followed by two predicate operands. It is +// operand 1 (the AM4 mode imm) is followed by two predicate operands. It is // followed by a reglist of either DPR(s) or SPR(s). // // VLDMD[_UPD], VLDMS[_UPD], VSTMD[_UPD], VSTMS[_UPD] @@ -1910,16 +1910,14 @@ static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(Base)); - // Next comes the AM5 Opcode. + // Next comes the AM4 Opcode. ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn)); // Must be either "ia" or "db" submode. if (SubMode != ARM_AM::ia && SubMode != ARM_AM::db) { - DEBUG(errs() << "Illegal addressing mode 5 sub-mode!\n"); + DEBUG(errs() << "Illegal addressing mode 4 sub-mode!\n"); return false; } - - unsigned char Imm8 = insn & 0xFF; - MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(SubMode, Imm8))); + MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode))); // Handling the two predicate operands before the reglist.
int64_t CondVal = insn >> ARMII::CondShift; @@ -1929,13 +1927,14 @@ static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, OpIdx += 4; bool isSPVFP = (Opcode == ARM::VLDMS || Opcode == ARM::VLDMS_UPD || - Opcode == ARM::VSTMS || Opcode == ARM::VSTMS_UPD) ? true : false; + Opcode == ARM::VSTMS || Opcode == ARM::VSTMS_UPD); unsigned RegClassID = isSPVFP ? ARM::SPRRegClassID : ARM::DPRRegClassID; // Extract Dd/Sd. unsigned RegD = decodeVFPRd(insn, isSPVFP); // Fill the variadic part of reglist. + unsigned char Imm8 = insn & 0xFF; unsigned Regs = isSPVFP ? Imm8 : Imm8/2; for (unsigned i = 0; i < Regs; ++i) { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, @@ -2244,9 +2243,10 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, // We have homogeneous NEON registers for Load/Store. unsigned RegClass = 0; + bool DRegPair = UseDRegPair(Opcode); // Double-spaced registers have increments of 2. - unsigned Inc = DblSpaced ? 2 : 1; + unsigned Inc = (DblSpaced || DRegPair) ? 2 : 1; unsigned Rn = decodeRn(insn); unsigned Rm = decodeRm(insn); @@ -2292,8 +2292,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, RegClass = OpInfo[OpIdx].RegClass; while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { MI.addOperand(MCOperand::CreateReg( - getRegisterEnum(B, RegClass, Rd, - UseDRegPair(Opcode)))); + getRegisterEnum(B, RegClass, Rd, DRegPair))); Rd += Inc; ++OpIdx; } @@ -2312,8 +2311,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { MI.addOperand(MCOperand::CreateReg( - getRegisterEnum(B, RegClass, Rd, - UseDRegPair(Opcode)))); + getRegisterEnum(B, RegClass, Rd, DRegPair))); Rd += Inc; ++OpIdx; } @@ -2351,6 +2349,11 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } } + // Accessing registers past the end of the NEON register file is not + // defined. + if (Rd > 32) + return false; + return true; } @@ -2423,10 +2426,14 @@ static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, break; case ARM::VMOVv4i16: case ARM::VMOVv8i16: + case ARM::VMVNv4i16: + case ARM::VMVNv8i16: esize = ESize16; break; case ARM::VMOVv2i32: case ARM::VMOVv4i32: + case ARM::VMVNv2i32: + case ARM::VMVNv4i32: esize = ESize32; break; case ARM::VMOVv1i64: @@ -2944,7 +2951,7 @@ static bool DisassembleNDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // A8.6.49 ISB static inline bool MemBarrierInstr(uint32_t insn) { unsigned op7_4 = slice(insn, 7, 4); - if (slice(insn, 31, 20) == 0xf57 && (op7_4 >= 4 && op7_4 <= 6)) + if (slice(insn, 31, 8) == 0xf57ff0 && (op7_4 >= 4 && op7_4 <= 6)) return true; return false; @@ -3001,8 +3008,15 @@ static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn, static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - if (MemBarrierInstr(insn)) + if (MemBarrierInstr(insn)) { + // DMBsy, DSBsy, and ISBsy instructions have zero operand and are taken care + // of within the generic ARMBasicMCBuilder::BuildIt() method. + // + // Inst{3-0} encodes the memory barrier option for the variants. 
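// slice(insn, 3, 0) is simply insn & 0xF, the 4-bit barrier-option field.
// As a worked example (option encoding per my reading of the ARM ARM, so
// treat it as illustrative): a full-system barrier, "sy", encodes option
// 0b1111, and the operand added below would be
//
//   MI.addOperand(MCOperand::CreateImm(0xF)); // printed via MemBOptToString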
+ MI.addOperand(MCOperand::CreateImm(slice(insn, 3, 0))); + NumOpsAdded = 1; return true; + } switch (Opcode) { case ARM::CLREX: @@ -3073,6 +3087,7 @@ static const DisassembleFP FuncPtrs[] = { &DisassembleLdStMulFrm, &DisassembleLdStExFrm, &DisassembleArithMiscFrm, + &DisassembleSatFrm, &DisassembleExtFrm, &DisassembleVFPUnaryFrm, &DisassembleVFPBinaryFrm, diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h index 7d21256a14f9f..9c30d332d1f20 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h +++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h @@ -23,7 +23,8 @@ #include "llvm/MC/MCInst.h" #include "llvm/Target/TargetInstrInfo.h" -#include "ARMInstrInfo.h" +#include "ARMBaseInstrInfo.h" +#include "ARMRegisterInfo.h" #include "ARMDisassembler.h" namespace llvm { @@ -53,36 +54,35 @@ public: ENTRY(ARM_FORMAT_LDSTMULFRM, 10) \ ENTRY(ARM_FORMAT_LDSTEXFRM, 11) \ ENTRY(ARM_FORMAT_ARITHMISCFRM, 12) \ - ENTRY(ARM_FORMAT_EXTFRM, 13) \ - ENTRY(ARM_FORMAT_VFPUNARYFRM, 14) \ - ENTRY(ARM_FORMAT_VFPBINARYFRM, 15) \ - ENTRY(ARM_FORMAT_VFPCONV1FRM, 16) \ - ENTRY(ARM_FORMAT_VFPCONV2FRM, 17) \ - ENTRY(ARM_FORMAT_VFPCONV3FRM, 18) \ - ENTRY(ARM_FORMAT_VFPCONV4FRM, 19) \ - ENTRY(ARM_FORMAT_VFPCONV5FRM, 20) \ - ENTRY(ARM_FORMAT_VFPLDSTFRM, 21) \ - ENTRY(ARM_FORMAT_VFPLDSTMULFRM, 22) \ - ENTRY(ARM_FORMAT_VFPMISCFRM, 23) \ - ENTRY(ARM_FORMAT_THUMBFRM, 24) \ - ENTRY(ARM_FORMAT_NEONFRM, 25) \ - ENTRY(ARM_FORMAT_NEONGETLNFRM, 26) \ - ENTRY(ARM_FORMAT_NEONSETLNFRM, 27) \ - ENTRY(ARM_FORMAT_NEONDUPFRM, 28) \ - ENTRY(ARM_FORMAT_MISCFRM, 29) \ - ENTRY(ARM_FORMAT_THUMBMISCFRM, 30) \ - ENTRY(ARM_FORMAT_NLdSt, 31) \ - ENTRY(ARM_FORMAT_N1RegModImm, 32) \ - ENTRY(ARM_FORMAT_N2Reg, 33) \ - ENTRY(ARM_FORMAT_NVCVT, 34) \ - ENTRY(ARM_FORMAT_NVecDupLn, 35) \ - ENTRY(ARM_FORMAT_N2RegVecShL, 36) \ - ENTRY(ARM_FORMAT_N2RegVecShR, 37) \ - ENTRY(ARM_FORMAT_N3Reg, 38) \ - ENTRY(ARM_FORMAT_N3RegVecSh, 39) \ - ENTRY(ARM_FORMAT_NVecExtract, 40) \ - ENTRY(ARM_FORMAT_NVecMulScalar, 41) \ - ENTRY(ARM_FORMAT_NVTBL, 42) + ENTRY(ARM_FORMAT_SATFRM, 13) \ + ENTRY(ARM_FORMAT_EXTFRM, 14) \ + ENTRY(ARM_FORMAT_VFPUNARYFRM, 15) \ + ENTRY(ARM_FORMAT_VFPBINARYFRM, 16) \ + ENTRY(ARM_FORMAT_VFPCONV1FRM, 17) \ + ENTRY(ARM_FORMAT_VFPCONV2FRM, 18) \ + ENTRY(ARM_FORMAT_VFPCONV3FRM, 19) \ + ENTRY(ARM_FORMAT_VFPCONV4FRM, 20) \ + ENTRY(ARM_FORMAT_VFPCONV5FRM, 21) \ + ENTRY(ARM_FORMAT_VFPLDSTFRM, 22) \ + ENTRY(ARM_FORMAT_VFPLDSTMULFRM, 23) \ + ENTRY(ARM_FORMAT_VFPMISCFRM, 24) \ + ENTRY(ARM_FORMAT_THUMBFRM, 25) \ + ENTRY(ARM_FORMAT_MISCFRM, 26) \ + ENTRY(ARM_FORMAT_NEONGETLNFRM, 27) \ + ENTRY(ARM_FORMAT_NEONSETLNFRM, 28) \ + ENTRY(ARM_FORMAT_NEONDUPFRM, 29) \ + ENTRY(ARM_FORMAT_NLdSt, 30) \ + ENTRY(ARM_FORMAT_N1RegModImm, 31) \ + ENTRY(ARM_FORMAT_N2Reg, 32) \ + ENTRY(ARM_FORMAT_NVCVT, 33) \ + ENTRY(ARM_FORMAT_NVecDupLn, 34) \ + ENTRY(ARM_FORMAT_N2RegVecShL, 35) \ + ENTRY(ARM_FORMAT_N2RegVecShR, 36) \ + ENTRY(ARM_FORMAT_N3Reg, 37) \ + ENTRY(ARM_FORMAT_N3RegVecSh, 38) \ + ENTRY(ARM_FORMAT_NVecExtract, 39) \ + ENTRY(ARM_FORMAT_NVecMulScalar, 40) \ + ENTRY(ARM_FORMAT_NVTBL, 41) // ARM instruction format specifies the encoding used by the instruction. #define ENTRY(n, v) n = v, @@ -126,8 +126,8 @@ static inline unsigned slice(uint32_t Bits, unsigned From, unsigned To) { } /// Utility function for setting [From, To] bits to Val for a uint32_t. 
-static inline void setSlice(uint32_t &Bits, unsigned From, unsigned To, - uint32_t Val) { +static inline void setSlice(unsigned &Bits, unsigned From, unsigned To, + unsigned Val) { assert(From < 32 && To < 32 && From >= To); uint32_t Mask = ((1 << (From - To + 1)) - 1); Bits &= ~(Mask << To); diff --git a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h index 4b7a0bf6fdb91..112817b13cf90 100644 --- a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h +++ b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h @@ -103,7 +103,7 @@ static inline unsigned getT1Cond(uint32_t insn) { } static inline bool IsGPR(unsigned RegClass) { - return RegClass == ARM::GPRRegClassID; + return RegClass == ARM::GPRRegClassID || RegClass == ARM::rGPRRegClassID; } // Utilities for 32-bit Thumb instructions. @@ -220,7 +220,7 @@ static inline unsigned decodeImmShift(unsigned bits2, unsigned imm5, switch (bits2) { default: assert(0 && "No such value"); case 0: - ShOp = ARM_AM::lsl; + ShOp = (imm5 == 0 ? ARM_AM::no_shift : ARM_AM::lsl); return imm5; case 1: ShOp = ARM_AM::lsr; @@ -1324,7 +1324,7 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, && OpInfo[1].RegClass == ARM::GPRRegClassID && OpInfo[2].RegClass < 0 && OpInfo[3].RegClass < 0 - && "Exactlt 4 operands expect and first two as reg operands"); + && "Exactly 4 operands expected and first two as reg operands"); // Only need to populate the src reg operand. MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); @@ -1338,17 +1338,20 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, OpIdx = 0; assert(NumOps >= 2 - && OpInfo[0].RegClass == ARM::GPRRegClassID - && OpInfo[1].RegClass == ARM::GPRRegClassID + && (OpInfo[0].RegClass == ARM::GPRRegClassID || + OpInfo[0].RegClass == ARM::rGPRRegClassID) + && (OpInfo[1].RegClass == ARM::GPRRegClassID || + OpInfo[1].RegClass == ARM::rGPRRegClassID) && "Expect >= 2 operands and first two as reg operands"); - bool ThreeReg = (NumOps > 2 && OpInfo[2].RegClass == ARM::GPRRegClassID); + bool ThreeReg = (NumOps > 2 && (OpInfo[2].RegClass == ARM::GPRRegClassID || + OpInfo[2].RegClass == ARM::rGPRRegClassID)); bool NoDstReg = (decodeRs(insn) == 0xF); // Build the register operands, followed by the constant shift specifier. MI.addOperand(MCOperand::CreateReg( - getRegisterEnum(B, ARM::GPRRegClassID, + getRegisterEnum(B, OpInfo[0].RegClass, NoDstReg ?
decodeRn(insn) : decodeRs(insn)))); ++OpIdx; @@ -1359,7 +1362,7 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MI.getOperand(Idx)); ++OpIdx; } else if (!NoDstReg) { - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[1].RegClass, decodeRn(insn)))); ++OpIdx; } else { @@ -1368,7 +1371,7 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, } } - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, decodeRm(insn)))); ++OpIdx; @@ -1386,14 +1389,7 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned imm5 = getShiftAmtBits(insn); ARM_AM::ShiftOpc ShOp = ARM_AM::no_shift; unsigned ShAmt = decodeImmShift(bits2, imm5, ShOp); - - // PKHBT/PKHTB are special in that we need the decodeImmShift() call to - // decode the shift amount from raw imm5 and bits2, but we DO NOT need - // to encode the ShOp, as it's in the asm string already. - if (Opcode == ARM::t2PKHBT || Opcode == ARM::t2PKHTB) - MI.addOperand(MCOperand::CreateImm(ShAmt)); - else - MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(ShOp, ShAmt))); + MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(ShOp, ShAmt))); } ++OpIdx; } @@ -1416,16 +1412,20 @@ static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, OpIdx = 0; - assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::GPRRegClassID + unsigned RdRegClassID = OpInfo[0].RegClass; + assert(NumOps >= 2 && (RdRegClassID == ARM::GPRRegClassID || + RdRegClassID == ARM::rGPRRegClassID) && "Expect >= 2 operands and first one as reg operand"); - bool TwoReg = (OpInfo[1].RegClass == ARM::GPRRegClassID); + unsigned RnRegClassID = OpInfo[1].RegClass; + bool TwoReg = (RnRegClassID == ARM::GPRRegClassID + || RnRegClassID == ARM::rGPRRegClassID); bool NoDstReg = (decodeRs(insn) == 0xF); // Build the register operands, followed by the modified immediate. MI.addOperand(MCOperand::CreateReg( - getRegisterEnum(B, ARM::GPRRegClassID, + getRegisterEnum(B, RdRegClassID, NoDstReg ? 
decodeRn(insn) : decodeRs(insn)))); ++OpIdx; @@ -1434,7 +1434,7 @@ static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, DEBUG(errs()<<"Thumb2 encoding error: d==15 for DPModImm 2-reg instr.\n"); return false; } - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID, decodeRn(insn)))); ++OpIdx; } @@ -1455,30 +1455,48 @@ static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, static inline bool Thumb2SaturateOpcode(unsigned Opcode) { switch (Opcode) { - case ARM::t2SSATlsl: case ARM::t2SSATasr: case ARM::t2SSAT16: - case ARM::t2USATlsl: case ARM::t2USATasr: case ARM::t2USAT16: + case ARM::t2SSAT: case ARM::t2SSAT16: + case ARM::t2USAT: case ARM::t2USAT16: return true; default: return false; } } -static inline unsigned decodeThumb2SaturatePos(unsigned Opcode, uint32_t insn) { - switch (Opcode) { - case ARM::t2SSATlsl: - case ARM::t2SSATasr: - return slice(insn, 4, 0) + 1; - case ARM::t2SSAT16: - return slice(insn, 3, 0) + 1; - case ARM::t2USATlsl: - case ARM::t2USATasr: - return slice(insn, 4, 0); - case ARM::t2USAT16: - return slice(insn, 3, 0); - default: - assert(0 && "Unexpected opcode"); - return 0; +/// DisassembleThumb2Sat - Disassemble Thumb2 saturate instructions: +/// o t2SSAT, t2USAT: Rs sat_pos Rn shamt +/// o t2SSAT16, t2USAT16: Rs sat_pos Rn +static bool DisassembleThumb2Sat(MCInst &MI, unsigned Opcode, uint32_t insn, + unsigned &NumOpsAdded, BO B) { + const TargetInstrDesc &TID = ARMInsts[Opcode]; + NumOpsAdded = TID.getNumOperands() - 2; // ignore predicate operands + + // Disassemble the register def. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, + decodeRs(insn)))); + + unsigned Pos = slice(insn, 4, 0); + if (Opcode == ARM::t2SSAT || Opcode == ARM::t2SSAT16) + Pos += 1; + MI.addOperand(MCOperand::CreateImm(Pos)); + + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, + decodeRn(insn)))); + + if (NumOpsAdded == 4) { + ARM_AM::ShiftOpc Opc = (slice(insn, 21, 21) != 0 ? + ARM_AM::asr : ARM_AM::lsl); + // Inst{14-12:7-6} encodes the imm5 shift amount. 
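// Worked example of reassembling the split field: with imm3 = Inst{14-12} =
// 0b101 and imm2 = Inst{7-6} = 0b10, the next line computes
//
//   (0b101 << 2) | 0b10 == 0b10110 == 22
//
// i.e. a shift amount of 22.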
+ unsigned ShAmt = slice(insn, 14, 12) << 2 | slice(insn, 7, 6); + if (ShAmt == 0) { + if (Opc == ARM_AM::asr) + ShAmt = 32; + else + Opc = ARM_AM::no_shift; + } + MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(Opc, ShAmt))); } + return true; } // A6.3.3 Data-processing (plain binary immediate) @@ -1492,11 +1510,6 @@ static inline unsigned decodeThumb2SaturatePos(unsigned Opcode, uint32_t insn) { // o t2SBFX (SBFX): Rs Rn lsb width // o t2UBFX (UBFX): Rs Rn lsb width // o t2BFI (BFI): Rs Rn lsb width -// -// [Signed|Unsigned] Saturate [16] -// -// o t2SSAT[lsl|asr], t2USAT[lsl|asr]: Rs sat_pos Rn shamt -// o t2SSAT16, t2USAT16: Rs sat_pos Rn static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -1506,41 +1519,21 @@ static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode, OpIdx = 0; - assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::GPRRegClassID + unsigned RdRegClassID = OpInfo[0].RegClass; + assert(NumOps >= 2 && (RdRegClassID == ARM::GPRRegClassID || + RdRegClassID == ARM::rGPRRegClassID) && "Expect >= 2 operands and first one as reg operand"); - bool TwoReg = (OpInfo[1].RegClass == ARM::GPRRegClassID); + unsigned RnRegClassID = OpInfo[1].RegClass; + bool TwoReg = (RnRegClassID == ARM::GPRRegClassID + || RnRegClassID == ARM::rGPRRegClassID); // Build the register operand(s), followed by the immediate(s). - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RdRegClassID, decodeRs(insn)))); ++OpIdx; - // t2SSAT/t2SSAT16/t2USAT/t2USAT16 has imm operand after Rd. - if (Thumb2SaturateOpcode(Opcode)) { - MI.addOperand(MCOperand::CreateImm(decodeThumb2SaturatePos(Opcode, insn))); - - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRn(insn)))); - - if (Opcode == ARM::t2SSAT16 || Opcode == ARM::t2USAT16) { - OpIdx += 2; - return true; - } - - // For SSAT operand reg (Rn) has been disassembled above. - // Now disassemble the shift amount. - - // Inst{14-12:7-6} encodes the imm5 shift amount. - unsigned ShAmt = slice(insn, 14, 12) << 2 | slice(insn, 7, 6); - - MI.addOperand(MCOperand::CreateImm(ShAmt)); - - OpIdx += 3; - return true; - } - if (TwoReg) { assert(NumOps >= 3 && "Expect >= 3 operands"); int Idx; @@ -1549,12 +1542,19 @@ static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode, MI.addOperand(MI.getOperand(Idx)); } else { // Add src reg operand. - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID, decodeRn(insn)))); } ++OpIdx; } + if (Opcode == ARM::t2BFI) { + // Add val reg operand. 
+ MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID, + decodeRn(insn)))); + ++OpIdx; + } + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); @@ -1567,7 +1567,7 @@ static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode, MI.addOperand(MCOperand::CreateImm(getIImm3Imm8(insn))); else if (Opcode == ARM::t2MOVi16 || Opcode == ARM::t2MOVTi16) MI.addOperand(MCOperand::CreateImm(getImm16(insn))); - else if (Opcode == ARM::t2BFC) { + else if (Opcode == ARM::t2BFC || Opcode == ARM::t2BFI) { uint32_t mask = 0; if (getBitfieldInvMask(insn, mask)) MI.addOperand(MCOperand::CreateImm(mask)); @@ -1575,17 +1575,10 @@ else { // Handle the case of: lsb width - assert((Opcode == ARM::t2SBFX || Opcode == ARM::t2UBFX || - Opcode == ARM::t2BFI) && "Unexpected opcode"); + assert((Opcode == ARM::t2SBFX || Opcode == ARM::t2UBFX) + && "Unexpected opcode"); MI.addOperand(MCOperand::CreateImm(getLsb(insn))); - if (Opcode == ARM::t2BFI) { - if (getMsb(insn) < getLsb(insn)) { - DEBUG(errs() << "Encoding error: msb < lsb\n"); - return false; - } - MI.addOperand(MCOperand::CreateImm(getMsb(insn) - getLsb(insn) + 1)); - } else - MI.addOperand(MCOperand::CreateImm(getWidthMinus1(insn) + 1)); + MI.addOperand(MCOperand::CreateImm(getWidthMinus1(insn) + 1)); ++OpIdx; } @@ -1618,8 +1611,8 @@ static inline bool t2MiscCtrlInstr(uint32_t insn) { // A8.6.26 // t2BXJ -> Rn // -// Miscellaneous control: t2Int_MemBarrierV7 (and its t2DMB variants), -// t2Int_SyncBarrierV7 (and its t2DSB varianst), t2ISBsy, t2CLREX +// Miscellaneous control: t2DMBsy (and its t2DMB variants), +// t2DSBsy (and its t2DSB variants), t2ISBsy, t2CLREX // -> no operand (except pred-imm pred-ccr for CLREX, memory barrier variants) // // Hint: t2NOP, t2YIELD, t2WFE, t2WFI, t2SEV @@ -1959,25 +1952,25 @@ static bool DisassembleThumb2DPReg(MCInst &MI, unsigned Opcode, uint32_t insn, OpIdx = 0; assert(NumOps >= 2 && - OpInfo[0].RegClass == ARM::GPRRegClassID && - OpInfo[1].RegClass == ARM::GPRRegClassID && + OpInfo[0].RegClass == ARM::rGPRRegClassID && + OpInfo[1].RegClass == ARM::rGPRRegClassID && "Expect >= 2 operands and first two as reg operands"); // Build the register operands, followed by the optional rotation amount.
- bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::GPRRegClassID; + bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::rGPRRegClassID; - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, decodeRs(insn)))); ++OpIdx; if (ThreeReg) { - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, decodeRn(insn)))); ++OpIdx; } - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, decodeRm(insn)))); ++OpIdx; @@ -2009,26 +2002,26 @@ static bool DisassembleThumb2Mul(MCInst &MI, unsigned Opcode, uint32_t insn, const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; assert(NumOps >= 3 && - OpInfo[0].RegClass == ARM::GPRRegClassID && - OpInfo[1].RegClass == ARM::GPRRegClassID && - OpInfo[2].RegClass == ARM::GPRRegClassID && + OpInfo[0].RegClass == ARM::rGPRRegClassID && + OpInfo[1].RegClass == ARM::rGPRRegClassID && + OpInfo[2].RegClass == ARM::rGPRRegClassID && "Expect >= 3 operands and first three as reg operands"); // Build the register operands. - bool FourReg = NumOps > 3 && OpInfo[3].RegClass == ARM::GPRRegClassID; + bool FourReg = NumOps > 3 && OpInfo[3].RegClass == ARM::rGPRRegClassID; - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, decodeRs(insn)))); - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, decodeRn(insn)))); - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, decodeRm(insn)))); if (FourReg) - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, decodeRd(insn)))); NumOpsAdded = FourReg ? 4 : 3; @@ -2054,26 +2047,26 @@ static bool DisassembleThumb2LongMul(MCInst &MI, unsigned Opcode, uint32_t insn, const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; assert(NumOps >= 3 && - OpInfo[0].RegClass == ARM::GPRRegClassID && - OpInfo[1].RegClass == ARM::GPRRegClassID && - OpInfo[2].RegClass == ARM::GPRRegClassID && + OpInfo[0].RegClass == ARM::rGPRRegClassID && + OpInfo[1].RegClass == ARM::rGPRRegClassID && + OpInfo[2].RegClass == ARM::rGPRRegClassID && "Expect >= 3 operands and first three as reg operands"); - bool FourReg = NumOps > 3 && OpInfo[3].RegClass == ARM::GPRRegClassID; + bool FourReg = NumOps > 3 && OpInfo[3].RegClass == ARM::rGPRRegClassID; // Build the register operands. 
if (FourReg) - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, decodeRd(insn)))); - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, decodeRs(insn)))); - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, decodeRn(insn)))); - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, decodeRm(insn)))); if (FourReg) @@ -2152,22 +2145,20 @@ static bool DisassembleThumb2(uint16_t op1, uint16_t op2, uint16_t op, break; case 2: if (op == 0) { - if (slice(op2, 5, 5) == 0) { + if (slice(op2, 5, 5) == 0) // Data-processing (modified immediate) return DisassembleThumb2DPModImm(MI, Opcode, insn, NumOps, NumOpsAdded, B); - } else { - // Data-processing (plain binary immediate) - return DisassembleThumb2DPBinImm(MI, Opcode, insn, NumOps, NumOpsAdded, - B); - } - } else { - // Branches and miscellaneous control on page A6-20. - return DisassembleThumb2BrMiscCtrl(MI, Opcode, insn, NumOps, NumOpsAdded, - B); - } + if (Thumb2SaturateOpcode(Opcode)) + return DisassembleThumb2Sat(MI, Opcode, insn, NumOpsAdded, B); - break; + // Data-processing (plain binary immediate) + return DisassembleThumb2DPBinImm(MI, Opcode, insn, NumOps, NumOpsAdded, + B); + } + // Branches and miscellaneous control on page A6-20. + return DisassembleThumb2BrMiscCtrl(MI, Opcode, insn, NumOps, NumOpsAdded, + B); case 3: switch (slice(op2, 6, 5)) { case 0: diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile index 9e3ff29e07c42..b3fcfaf6bda7d 100644 --- a/lib/Target/ARM/Makefile +++ b/lib/Target/ARM/Makefile @@ -14,10 +14,11 @@ TARGET = ARM # Make sure that tblgen is run, first thing. 
BUILT_SOURCES = ARMGenRegisterInfo.h.inc ARMGenRegisterNames.inc \ ARMGenRegisterInfo.inc ARMGenInstrNames.inc \ - ARMGenInstrInfo.inc ARMGenAsmWriter.inc \ + ARMGenInstrInfo.inc ARMGenAsmWriter.inc ARMGenAsmMatcher.inc \ ARMGenDAGISel.inc ARMGenSubtarget.inc \ ARMGenCodeEmitter.inc ARMGenCallingConv.inc \ - ARMGenDecoderTables.inc ARMGenEDInfo.inc + ARMGenDecoderTables.inc ARMGenEDInfo.inc \ + ARMGenFastISel.inc DIRS = AsmPrinter AsmParser Disassembler TargetInfo diff --git a/lib/Target/ARM/NEONMoveFix.cpp b/lib/Target/ARM/NEONMoveFix.cpp index bbdd3c7f7c3e0..97e54bfaed9e7 100644 --- a/lib/Target/ARM/NEONMoveFix.cpp +++ b/lib/Target/ARM/NEONMoveFix.cpp @@ -24,7 +24,7 @@ STATISTIC(NumVMovs, "Number of reg-reg moves converted"); namespace { struct NEONMoveFixPass : public MachineFunctionPass { static char ID; - NEONMoveFixPass() : MachineFunctionPass(&ID) {} + NEONMoveFixPass() : MachineFunctionPass(ID) {} virtual bool runOnMachineFunction(MachineFunction &Fn); diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp index f67717cdd56f5..3407ac6fe08ec 100644 --- a/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -23,7 +23,7 @@ namespace { public: static char ID; - NEONPreAllocPass() : MachineFunctionPass(&ID) {} + NEONPreAllocPass() : MachineFunctionPass(ID) {} virtual bool runOnMachineFunction(MachineFunction &MF); @@ -51,13 +51,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, default: break; - case ARM::VLD1q8: - case ARM::VLD1q16: - case ARM::VLD1q32: - case ARM::VLD1q64: - case ARM::VLD2d8: - case ARM::VLD2d16: - case ARM::VLD2d32: case ARM::VLD2LNd8: case ARM::VLD2LNd16: case ARM::VLD2LNd32: @@ -65,13 +58,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, NumRegs = 2; return true; - case ARM::VLD2q8: - case ARM::VLD2q16: - case ARM::VLD2q32: - FirstOpnd = 0; - NumRegs = 4; - return true; - case ARM::VLD2LNq16: case ARM::VLD2LNq32: FirstOpnd = 0; @@ -88,10 +74,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, Stride = 2; return true; - case ARM::VLD3d8: - case ARM::VLD3d16: - case ARM::VLD3d32: - case ARM::VLD1d64T: case ARM::VLD3LNd8: case ARM::VLD3LNd16: case ARM::VLD3LNd32: @@ -99,24 +81,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, NumRegs = 3; return true; - case ARM::VLD3q8_UPD: - case ARM::VLD3q16_UPD: - case ARM::VLD3q32_UPD: - FirstOpnd = 0; - NumRegs = 3; - Offset = 0; - Stride = 2; - return true; - - case ARM::VLD3q8odd_UPD: - case ARM::VLD3q16odd_UPD: - case ARM::VLD3q32odd_UPD: - FirstOpnd = 0; - NumRegs = 3; - Offset = 1; - Stride = 2; - return true; - case ARM::VLD3LNq16: case ARM::VLD3LNq32: FirstOpnd = 0; @@ -133,10 +97,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, Stride = 2; return true; - case ARM::VLD4d8: - case ARM::VLD4d16: - case ARM::VLD4d32: - case ARM::VLD1d64Q: case ARM::VLD4LNd8: case ARM::VLD4LNd16: case ARM::VLD4LNd32: @@ -144,24 +104,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, NumRegs = 4; return true; - case ARM::VLD4q8_UPD: - case ARM::VLD4q16_UPD: - case ARM::VLD4q32_UPD: - FirstOpnd = 0; - NumRegs = 4; - Offset = 0; - Stride = 2; - return true; - - case ARM::VLD4q8odd_UPD: - case ARM::VLD4q16odd_UPD: - case ARM::VLD4q32odd_UPD: - FirstOpnd = 0; - NumRegs = 4; - Offset = 1; - Stride = 2; - return true; - case ARM::VLD4LNq16: case ARM::VLD4LNq32: FirstOpnd = 0; @@ 
-178,13 +120,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, Stride = 2; return true; - case ARM::VST1q8: - case ARM::VST1q16: - case ARM::VST1q32: - case ARM::VST1q64: - case ARM::VST2d8: - case ARM::VST2d16: - case ARM::VST2d32: case ARM::VST2LNd8: case ARM::VST2LNd16: case ARM::VST2LNd32: @@ -192,13 +127,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, NumRegs = 2; return true; - case ARM::VST2q8: - case ARM::VST2q16: - case ARM::VST2q32: - FirstOpnd = 2; - NumRegs = 4; - return true; - case ARM::VST2LNq16: case ARM::VST2LNq32: FirstOpnd = 2; @@ -215,10 +143,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, Stride = 2; return true; - case ARM::VST3d8: - case ARM::VST3d16: - case ARM::VST3d32: - case ARM::VST1d64T: case ARM::VST3LNd8: case ARM::VST3LNd16: case ARM::VST3LNd32: @@ -226,24 +150,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, NumRegs = 3; return true; - case ARM::VST3q8_UPD: - case ARM::VST3q16_UPD: - case ARM::VST3q32_UPD: - FirstOpnd = 4; - NumRegs = 3; - Offset = 0; - Stride = 2; - return true; - - case ARM::VST3q8odd_UPD: - case ARM::VST3q16odd_UPD: - case ARM::VST3q32odd_UPD: - FirstOpnd = 4; - NumRegs = 3; - Offset = 1; - Stride = 2; - return true; - case ARM::VST3LNq16: case ARM::VST3LNq32: FirstOpnd = 2; @@ -260,10 +166,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, Stride = 2; return true; - case ARM::VST4d8: - case ARM::VST4d16: - case ARM::VST4d32: - case ARM::VST1d64Q: case ARM::VST4LNd8: case ARM::VST4LNd16: case ARM::VST4LNd32: @@ -271,24 +173,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, NumRegs = 4; return true; - case ARM::VST4q8_UPD: - case ARM::VST4q16_UPD: - case ARM::VST4q32_UPD: - FirstOpnd = 4; - NumRegs = 4; - Offset = 0; - Stride = 2; - return true; - - case ARM::VST4q8odd_UPD: - case ARM::VST4q16odd_UPD: - case ARM::VST4q32odd_UPD: - FirstOpnd = 4; - NumRegs = 4; - Offset = 1; - Stride = 2; - return true; - case ARM::VST4LNq16: case ARM::VST4LNq32: FirstOpnd = 2; @@ -468,7 +352,34 @@ bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) { continue; if (FormsRegSequence(MI, FirstOpnd, NumRegs, Offset, Stride)) continue; - llvm_unreachable("expected a REG_SEQUENCE"); + + MachineBasicBlock::iterator NextI = llvm::next(MBBI); + for (unsigned R = 0; R < NumRegs; ++R) { + MachineOperand &MO = MI->getOperand(FirstOpnd + R); + assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand"); + unsigned VirtReg = MO.getReg(); + assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && + "expected a virtual register"); + + // For now, just assign a fixed set of adjacent registers. + // This leaves plenty of room for future improvements. + static const unsigned NEONDRegs[] = { + ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7 + }; + MO.setReg(NEONDRegs[Offset + R * Stride]); + + if (MO.isUse()) { + // Insert a copy from VirtReg. + BuildMI(MBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY),MO.getReg()) + .addReg(VirtReg, getKillRegState(MO.isKill())); + MO.setIsKill(); + } else if (MO.isDef() && !MO.isDead()) { + // Add a copy to VirtReg. 
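Before the def-copy that follows, a quick worked example of the fixed-window assignment above; the operand parameters are invented, only the indexing formula comes from the code.

// NEONDRegs[Offset + R * Stride] picks the physical D register for the
// R-th operand. With NumRegs = 3, Offset = 1, Stride = 2 the operands map
// to NEONDRegs[1], NEONDRegs[3], NEONDRegs[5], i.e. D1, D3, D5; with the
// common contiguous case (Offset = 0, Stride = 1) they map to D0, D1, D2.
static inline unsigned fixedDRegIndex(unsigned R, unsigned Offset,
                                      unsigned Stride) {
  return Offset + R * Stride;          // index into the NEONDRegs window
}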
+ BuildMI(MBB, NextI, DebugLoc(), TII->get(TargetOpcode::COPY), VirtReg) + .addReg(MO.getReg()); + } + } } return Modified; diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 0cb8ff01181d7..9fc3fb92cb2c1 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -611,27 +611,6 @@ constant which was already loaded). Not sure what's necessary to do that. //===---------------------------------------------------------------------===// -Given the following on ARMv7: -int test1(int A, int B) { - return (A&-8388481)|(B&8388480); -} - -We currently generate: - bfc r0, #7, #16 - movw r2, #:lower16:8388480 - movt r2, #:upper16:8388480 - and r1, r1, r2 - orr r0, r1, r0 - bx lr - -The following is much shorter: - lsr r1, r1, #7 - bfi r0, r1, #7, #16 - bx lr - - -//===---------------------------------------------------------------------===// - The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal: int a(int x) { return __builtin_bswap32(x); } @@ -657,3 +636,24 @@ A custom Thumb version would also be a slight improvement over the generic version. //===---------------------------------------------------------------------===// + +Consider the following simple C code: + +void foo(unsigned char *a, unsigned char *b, int *c) { + if ((*a | *b) == 0) *c = 0; +} + +currently llvm-gcc generates something like this (nice branchless code I'd say): + + ldrb r0, [r0] + ldrb r1, [r1] + orr r0, r1, r0 + tst r0, #255 + moveq r0, #0 + streq r0, [r2] + bx lr + +Note that both "tst" and "moveq" are redundant. + +//===---------------------------------------------------------------------===// + diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index 39b70b43b23f5..a21a3da10bdad 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -68,7 +68,7 @@ void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg); } -bool Thumb1RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const { +bool Thumb1RegisterInfo::hasReservedCallFrame(const MachineFunction &MF) const { const MachineFrameInfo *FFI = MF.getFrameInfo(); unsigned CFSize = FFI->getMaxCallFrameSize(); // It's not always a good idea to include the call frame as part of the @@ -363,107 +363,19 @@ static void removeOperands(MachineInstr &MI, unsigned i) { MI.RemoveOperand(Op); } -int Thumb1RegisterInfo:: -rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int Offset, - unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const -{ - // if/when eliminateFrameIndex() conforms with ARMBaseRegisterInfo - // version then can pull out Thumb1 specific parts here - return 0; -} - -/// saveScavengerRegister - Spill the register so it can be used by the -/// register scavenger. Return true. -bool -Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator &UseMI, - const TargetRegisterClass *RC, - unsigned Reg) const { - // Thumb1 can't use the emergency spill slot on the stack because - // ldr/str immediate offsets must be positive, and if we're referencing - // off the frame pointer (if, for example, there are alloca() calls in - // the function, the offset will be negative. Use R12 instead since that's - // a call clobbered register that we know won't be used in Thumb1 mode. - DebugLoc DL; - BuildMI(MBB, I, DL, TII.get(ARM::tMOVtgpr2gpr)). 
- addReg(ARM::R12, RegState::Define).addReg(Reg, RegState::Kill); - - // The UseMI is where we would like to restore the register. If there's - // interference with R12 before then, however, we'll need to restore it - // before that instead and adjust the UseMI. - bool done = false; - for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) { - if (II->isDebugValue()) - continue; - // If this instruction affects R12, adjust our restore point. - for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = II->getOperand(i); - if (!MO.isReg() || MO.isUndef() || !MO.getReg() || - TargetRegisterInfo::isVirtualRegister(MO.getReg())) - continue; - if (MO.getReg() == ARM::R12) { - UseMI = II; - done = true; - break; - } - } - } - // Restore the register from R12 - BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVgpr2tgpr)). - addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill); - - return true; -} - -unsigned -Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value, - RegScavenger *RS) const{ - unsigned VReg = 0; - unsigned i = 0; +bool Thumb1RegisterInfo:: +rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) const { MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction &MF = *MBB.getParent(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); DebugLoc dl = MI.getDebugLoc(); - - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - - unsigned FrameReg = ARM::SP; - int FrameIndex = MI.getOperand(i).getIndex(); - int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + - MF.getFrameInfo()->getStackSize() + SPAdj; - - if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex)) - Offset -= AFI->getGPRCalleeSavedArea1Offset(); - else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex)) - Offset -= AFI->getGPRCalleeSavedArea2Offset(); - else if (MF.getFrameInfo()->hasVarSizedObjects()) { - assert(SPAdj == 0 && hasFP(MF) && "Unexpected"); - // There are alloca()'s in this function, must reference off the frame - // pointer instead. - FrameReg = getFrameRegister(MF); - Offset -= AFI->getFramePtrSpillOffset(); - } - - // Special handling of dbg_value instructions. - if (MI.isDebugValue()) { - MI.getOperand(i). ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(i+1).ChangeToImmediate(Offset); - return 0; - } - unsigned Opcode = MI.getOpcode(); const TargetInstrDesc &Desc = MI.getDesc(); unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); if (Opcode == ARM::tADDrSPi) { - Offset += MI.getOperand(i+1).getImm(); + Offset += MI.getOperand(FrameRegIdx+1).getImm(); // Can't use tADDrSPi if it's based off the frame pointer. unsigned NumBits = 0; @@ -483,12 +395,13 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) { // Turn it into a move. MI.setDesc(TII.get(ARM::tMOVgpr2tgpr)); - MI.getOperand(i).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); // Remove offset and remaining explicit predicate operands. 
- do MI.RemoveOperand(i+1); - while (MI.getNumOperands() > i+1 && - (!MI.getOperand(i+1).isReg() || !MI.getOperand(i+1).isImm())); - return 0; + do MI.RemoveOperand(FrameRegIdx+1); + while (MI.getNumOperands() > FrameRegIdx+1 && + (!MI.getOperand(FrameRegIdx+1).isReg() || + !MI.getOperand(FrameRegIdx+1).isImm())); + return true; } // Common case: small offset, fits into instruction. @@ -496,15 +409,15 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (((Offset / Scale) & ~Mask) == 0) { // Replace the FrameIndex with sp / fp if (Opcode == ARM::tADDi3) { - removeOperands(MI, i); + removeOperands(MI, FrameRegIdx); MachineInstrBuilder MIB(&MI); AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg) .addImm(Offset / Scale)); } else { - MI.getOperand(i).ChangeToRegister(FrameReg, false); - MI.getOperand(i+1).ChangeToImmediate(Offset / Scale); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset / Scale); } - return 0; + return true; } unsigned DestReg = MI.getOperand(0).getReg(); @@ -516,7 +429,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII, *this, dl); MBB.erase(II); - return 0; + return true; } if (Offset > 0) { @@ -524,12 +437,12 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // r0 = add sp, 255*4 // r0 = add r0, (imm - 255*4) if (Opcode == ARM::tADDi3) { - removeOperands(MI, i); + removeOperands(MI, FrameRegIdx); MachineInstrBuilder MIB(&MI); AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg).addImm(Mask)); } else { - MI.getOperand(i).ChangeToRegister(FrameReg, false); - MI.getOperand(i+1).ChangeToImmediate(Mask); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Mask); } Offset = (Offset - Mask * Scale); MachineBasicBlock::iterator NII = llvm::next(II); @@ -542,14 +455,14 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl); MI.setDesc(TII.get(ARM::tADDhirr)); - MI.getOperand(i).ChangeToRegister(DestReg, false, false, true); - MI.getOperand(i+1).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx).ChangeToRegister(DestReg, false, false, true); + MI.getOperand(FrameRegIdx+1).ChangeToRegister(FrameReg, false); if (Opcode == ARM::tADDi3) { MachineInstrBuilder MIB(&MI); AddDefaultPred(MIB); } } - return 0; + return true; } else { unsigned ImmIdx = 0; int InstrOffs = 0; @@ -557,7 +470,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned Scale = 1; switch (AddrMode) { case ARMII::AddrModeT1_s: { - ImmIdx = i+1; + ImmIdx = FrameRegIdx+1; InstrOffs = MI.getOperand(ImmIdx).getImm(); NumBits = (FrameReg == ARM::SP) ? 
8 : 5; Scale = 4; @@ -577,9 +490,9 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned Mask = (1 << NumBits) - 1; if ((unsigned)Offset <= Mask * Scale) { // Replace the FrameIndex with sp - MI.getOperand(i).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); ImmOp.ChangeToImmediate(ImmedOffset); - return 0; + return true; } bool isThumSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill; @@ -600,12 +513,126 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Offset &= ~(Mask*Scale); } } + return Offset == 0; +} + +void +Thumb1RegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I, + unsigned BaseReg, int64_t Offset) const { + MachineInstr &MI = *I; + int Off = Offset; // ARM doesn't need the general 64-bit offsets + unsigned i = 0; + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + bool Done = false; + Done = rewriteFrameIndex(MI, i, BaseReg, Off, TII); + assert (Done && "Unable to resolve frame index!"); +} + +/// saveScavengerRegister - Spill the register so it can be used by the +/// register scavenger. Return true. +bool +Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator &UseMI, + const TargetRegisterClass *RC, + unsigned Reg) const { + // Thumb1 can't use the emergency spill slot on the stack because + // ldr/str immediate offsets must be positive, and if we're referencing + // off the frame pointer (if, for example, there are alloca() calls in + // the function), the offset will be negative. Use R12 instead since that's + // a call clobbered register that we know won't be used in Thumb1 mode. + DebugLoc DL; + BuildMI(MBB, I, DL, TII.get(ARM::tMOVtgpr2gpr)). + addReg(ARM::R12, RegState::Define).addReg(Reg, RegState::Kill); + + // The UseMI is where we would like to restore the register. If there's + // interference with R12 before then, however, we'll need to restore it + // before that instead and adjust the UseMI. + bool done = false; + for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) { + if (II->isDebugValue()) + continue; + // If this instruction affects R12, adjust our restore point. + for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = II->getOperand(i); + if (!MO.isReg() || MO.isUndef() || !MO.getReg() || + TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + if (MO.getReg() == ARM::R12) { + UseMI = II; + done = true; + break; + } + } + } + // Restore the register from R12 + BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVgpr2tgpr)).
+ addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill); + + return true; +} + +void +Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS) const { + unsigned VReg = 0; + unsigned i = 0; + MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + DebugLoc dl = MI.getDebugLoc(); + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + + unsigned FrameReg = ARM::SP; + int FrameIndex = MI.getOperand(i).getIndex(); + int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + + MF.getFrameInfo()->getStackSize() + SPAdj; + + if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex)) + Offset -= AFI->getGPRCalleeSavedArea1Offset(); + else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex)) + Offset -= AFI->getGPRCalleeSavedArea2Offset(); + else if (MF.getFrameInfo()->hasVarSizedObjects()) { + assert(SPAdj == 0 && hasFP(MF) && "Unexpected"); + // There are alloca()'s in this function, must reference off the frame + // pointer or base pointer instead. + if (!hasBasePointer(MF)) { + FrameReg = getFrameRegister(MF); + Offset -= AFI->getFramePtrSpillOffset(); + } else + FrameReg = BasePtr; + } + + // Special handling of dbg_value instructions. + if (MI.isDebugValue()) { + MI.getOperand(i). ChangeToRegister(FrameReg, false /*isDef*/); + MI.getOperand(i+1).ChangeToImmediate(Offset); + return; + } + + // Modify MI as necessary to handle as much of 'Offset' as possible + assert(AFI->isThumbFunction() && + "This eliminateFrameIndex only supports Thumb1!"); + if (rewriteFrameIndex(MI, i, FrameReg, Offset, TII)) + return; // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above, handle the rest, providing a register that is // SP+LargeImm. assert(Offset && "This code isn't needed if offset already handled!"); + unsigned Opcode = MI.getOpcode(); + const TargetInstrDesc &Desc = MI.getDesc(); + // Remove predicate first. int PIdx = MI.findFirstPredOperandIdx(); if (PIdx != -1) @@ -637,11 +664,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.addOperand(MachineOperand::CreateReg(0, false)); } else if (Desc.mayStore()) { VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); - assert (Value && "Frame index virtual allocated, but Value arg is NULL!"); bool UseRR = false; - bool TrackVReg = true; - Value->first = FrameReg; // use the frame register as a kind indicator - Value->second = Offset; if (Opcode == ARM::tSpill) { if (FrameReg == ARM::SP) @@ -650,7 +673,6 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, else { emitLoadConstPool(MBB, II, dl, VReg, 0, Offset); UseRR = true; - TrackVReg = false; } } else emitThumbRegPlusImmediate(MBB, II, VReg, FrameReg, Offset, TII, @@ -661,8 +683,6 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.addOperand(MachineOperand::CreateReg(FrameReg, false)); else // tSTR has an extra register operand. 
MI.addOperand(MachineOperand::CreateReg(0, false)); - if (!ReuseFrameIndexVals || !TrackVReg) - VReg = 0; } else assert(false && "Unexpected opcode!"); @@ -671,7 +691,6 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstrBuilder MIB(&MI); AddDefaultPred(MIB); } - return VReg; } void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const { @@ -742,11 +761,11 @@ void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const { dl = MBBI->getDebugLoc(); } - // Darwin ABI requires FP to point to the stack slot that contains the - // previous FP. - if (STI.isTargetDarwin() || hasFP(MF)) { + // Adjust FP so it points to the stack slot that contains the previous FP. + if (hasFP(MF)) { BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) .addFrameIndex(FramePtrSpillFI).addImm(0); + AFI->setShouldRestoreSPFromFP(true); } // Determine starting offsets of spill areas. @@ -764,14 +783,20 @@ void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const { emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes); } - if (STI.isTargetELF() && hasFP(MF)) { + if (STI.isTargetELF() && hasFP(MF)) MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - AFI->getFramePtrSpillOffset()); - } AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); AFI->setDPRCalleeSavedAreaSize(DPRCSSize); + + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + if (hasBasePointer(MF)) + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), BasePtr).addReg(ARM::SP); } static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { @@ -828,7 +853,7 @@ void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF, AFI->getGPRCalleeSavedArea2Size() + AFI->getDPRCalleeSavedAreaSize()); - if (hasFP(MF)) { + if (AFI->shouldRestoreSPFromFP()) { NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; // Reset SP based on frame pointer only if the stack frame extends beyond // frame pointer stack slot or target is ELF and the function has FP. diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h index 9a0308afa20c2..c578054a5d71f 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/lib/Target/ARM/Thumb1RegisterInfo.h @@ -38,27 +38,27 @@ public: unsigned PredReg = 0) const; /// Code Generation virtual methods... - bool hasReservedCallFrame(MachineFunction &MF) const; + bool hasReservedCallFrame(const MachineFunction &MF) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; - // rewrite MI to access 'Offset' bytes from the FP. Return the offset that - // could not be handled directly in MI. - int rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int Offset, - unsigned MOVOpc, unsigned ADDriOpc, - unsigned SUBriOpc) const; - + // rewrite MI to access 'Offset' bytes from the FP. Update Offset to be + // however much remains to be handled. Return 'true' if no further + // work is required.
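A self-contained analogue of that contract, assuming a hypothetical 8-bit immediate scaled by 4 (the sp-relative Thumb1 case); the helper name and constants are illustrative, not the real API.

#include <algorithm>
#include <cstdio>

// Fold as much of a non-negative Offset into the immediate as fits;
// shrink Offset to the residue and report whether the caller is done.
static bool foldOffset(int &Offset, unsigned NumBits, unsigned Scale) {
  int MaxImm = int(((1u << NumBits) - 1) * Scale);   // largest encodable part
  int Chunk = std::min(Offset - Offset % int(Scale), MaxImm);
  Offset -= Chunk;
  return Offset == 0;              // true -> no further work is required
}

int main() {
  int Off = 1028;                  // 255*4 = 1020 fits; 8 bytes remain
  bool Done = foldOffset(Off, 8, 4);
  std::printf("done=%d residue=%d\n", Done, Off);    // done=0 residue=8
  return 0;
}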
+ bool rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) const; + void resolveFrameIndex(MachineBasicBlock::iterator I, + unsigned BaseReg, int64_t Offset) const; bool saveScavengerRegister(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock::iterator &UseMI, const TargetRegisterClass *RC, unsigned Reg) const; - unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; void emitPrologue(MachineFunction &MF) const; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index cd15bbed9f23e..45e693744b806 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -27,7 +27,7 @@ namespace { public: static char ID; - Thumb2ITBlockPass() : MachineFunctionPass(&ID) {} + Thumb2ITBlockPass() : MachineFunctionPass(ID) {} const Thumb2InstrInfo *TII; const TargetRegisterInfo *TRI; @@ -91,35 +91,53 @@ static void TrackDefUses(MachineInstr *MI, } } +static bool isCopy(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + return false; + case ARM::MOVr: + case ARM::MOVr_TC: + case ARM::tMOVr: + case ARM::tMOVgpr2tgpr: + case ARM::tMOVtgpr2gpr: + case ARM::tMOVgpr2gpr: + case ARM::t2MOVr: + return true; + } +} + bool Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI, ARMCC::CondCodes CC, ARMCC::CondCodes OCC, SmallSet<unsigned, 4> &Defs, SmallSet<unsigned, 4> &Uses) { - unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (TII->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { - assert(SrcSubIdx == 0 && DstSubIdx == 0 && - "Sub-register indices still around?"); - // llvm models select's as two-address instructions. That means a copy - // is inserted before a t2MOVccr, etc. If the copy is scheduled in - // between selects we would end up creating multiple IT blocks. - - // First check if it's safe to move it. - if (Uses.count(DstReg) || Defs.count(SrcReg)) - return false; - - // Then peek at the next instruction to see if it's predicated on CC or OCC. - // If not, then there is nothing to be gained by moving the copy. - MachineBasicBlock::iterator I = MI; ++I; - MachineBasicBlock::iterator E = MI->getParent()->end(); - while (I != E && I->isDebugValue()) - ++I; - if (I != E) { - unsigned NPredReg = 0; - ARMCC::CondCodes NCC = llvm::getITInstrPredicate(I, NPredReg); - if (NCC == CC || NCC == OCC) - return true; - } + if (!isCopy(MI)) + return false; + // llvm models selects as two-address instructions. That means a copy + // is inserted before a t2MOVccr, etc. If the copy is scheduled in + // between selects we would end up creating multiple IT blocks. + assert(MI->getOperand(0).getSubReg() == 0 && + MI->getOperand(1).getSubReg() == 0 && + "Sub-register indices still around?"); + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + + // First check if it's safe to move it. + if (Uses.count(DstReg) || Defs.count(SrcReg)) + return false; + + // Then peek at the next instruction to see if it's predicated on CC or OCC. + // If not, then there is nothing to be gained by moving the copy.
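The safety half of that test can be stated in isolation; a minimal sketch (the helper function is invented, the SmallSet usage mirrors the pass).

#include "llvm/ADT/SmallSet.h"

// Hoisting the copy above the predicated instructions already scanned is
// legal only if none of them reads its destination or writes its source.
static bool safeToHoistCopy(unsigned DstReg, unsigned SrcReg,
                            const llvm::SmallSet<unsigned, 4> &Defs,
                            const llvm::SmallSet<unsigned, 4> &Uses) {
  return !Uses.count(DstReg) && !Defs.count(SrcReg);
}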
+ MachineBasicBlock::iterator I = MI; ++I; + MachineBasicBlock::iterator E = MI->getParent()->end(); + while (I != E && I->isDebugValue()) + ++I; + if (I != E) { + unsigned NPredReg = 0; + ARMCC::CondCodes NCC = llvm::getITInstrPredicate(I, NPredReg); + if (NCC == CC || NCC == OCC) + return true; } return false; } diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index ee517279c9d79..442f41da8a2d9 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -147,8 +147,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || - RC == ARM::tcGPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || + RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -173,8 +173,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || - RC == ARM::tcGPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || + RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index ba392f36d9464..0c3962dd123d8 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -173,7 +173,7 @@ namespace { char Thumb2SizeReduce::ID = 0; } -Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(&ID) { +Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(ID) { for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) { unsigned FromOpc = ReduceTable[i].WideOpc; if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second) @@ -315,6 +315,18 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm()); if (!isARMLowRegister(BaseReg) || Mode != ARM_AM::ia) return false; + // For the non-writeback version (this one), the base register must be + // one of the registers being loaded. 
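The reason for that restriction: 16-bit tLDMIA always writes the base register back unless the base also appears in the destination list, in which case the reload supersedes the writeback. So a 32-bit non-writeback t2LDMIA can only be narrowed when the base is itself reloaded. An illustrative refactoring of the operand scan that follows (the operand-index assumption comes from the code below):

#include "llvm/CodeGen/MachineInstr.h"

// Operands from index 4 onward are the loaded registers in this encoding.
static bool baseRegIsReloaded(const llvm::MachineInstr *MI, unsigned BaseReg) {
  for (unsigned i = 4, e = MI->getNumOperands(); i != e; ++i)
    if (MI->getOperand(i).getReg() == BaseReg)
      return true;                 // forced writeback is unobservable
  return false;
}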
+ bool isOK = false; + for (unsigned i = 4; i < MI->getNumOperands(); ++i) { + if (MI->getOperand(i).getReg() == BaseReg) { + isOK = true; + break; + } + } + if (!isOK) + return false; + OpNum = 0; isLdStMul = true; break; diff --git a/lib/Target/Alpha/AlphaBranchSelector.cpp b/lib/Target/Alpha/AlphaBranchSelector.cpp index 001656e0121a7..3768117095369 100644 --- a/lib/Target/Alpha/AlphaBranchSelector.cpp +++ b/lib/Target/Alpha/AlphaBranchSelector.cpp @@ -22,7 +22,7 @@ using namespace llvm; namespace { struct AlphaBSel : public MachineFunctionPass { static char ID; - AlphaBSel() : MachineFunctionPass(&ID) {} + AlphaBSel() : MachineFunctionPass(ID) {} virtual bool runOnMachineFunction(MachineFunction &Fn); diff --git a/lib/Target/Alpha/AlphaCodeEmitter.cpp b/lib/Target/Alpha/AlphaCodeEmitter.cpp index a6c6f52704f6b..3aec07035d74c 100644 --- a/lib/Target/Alpha/AlphaCodeEmitter.cpp +++ b/lib/Target/Alpha/AlphaCodeEmitter.cpp @@ -34,7 +34,7 @@ namespace { public: static char ID; - AlphaCodeEmitter(JITCodeEmitter &mce) : MachineFunctionPass(&ID), + AlphaCodeEmitter(JITCodeEmitter &mce) : MachineFunctionPass(ID), MCE(mce) {} /// getBinaryCodeForInstr - This function, generated by the diff --git a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp index d526dc0827b26..d197bd15ef9c5 100644 --- a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp +++ b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp @@ -113,8 +113,8 @@ namespace { static uint64_t getNearPower2(uint64_t x) { if (!x) return 0; unsigned at = CountLeadingZeros_64(x); - uint64_t complow = 1 << (63 - at); - uint64_t comphigh = 1 << (64 - at); + uint64_t complow = 1ULL << (63 - at); + uint64_t comphigh = 1ULL << (64 - at); //cerr << x << ":" << complow << ":" << comphigh << "\n"; if (abs64(complow - x) <= abs64(comphigh - x)) return complow; diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp index ad625a2694172..5a2f5610fdb49 100644 --- a/lib/Target/Alpha/AlphaInstrInfo.cpp +++ b/lib/Target/Alpha/AlphaInstrInfo.cpp @@ -27,32 +27,6 @@ AlphaInstrInfo::AlphaInstrInfo() RI(*this) { } -bool AlphaInstrInfo::isMoveInstr(const MachineInstr& MI, - unsigned& sourceReg, unsigned& destReg, - unsigned& SrcSR, unsigned& DstSR) const { - unsigned oc = MI.getOpcode(); - if (oc == Alpha::BISr || - oc == Alpha::CPYSS || - oc == Alpha::CPYST || - oc == Alpha::CPYSSt || - oc == Alpha::CPYSTs) { - // or r1, r2, r2 - // cpys(s|t) r1 r2 r2 - assert(MI.getNumOperands() >= 3 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - MI.getOperand(2).isReg() && - "invalid Alpha BIS instruction!"); - if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { - sourceReg = MI.getOperand(1).getReg(); - destReg = MI.getOperand(0).getReg(); - SrcSR = DstSR = 0; - return true; - } - } - return false; -} - unsigned AlphaInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { diff --git a/lib/Target/Alpha/AlphaInstrInfo.h b/lib/Target/Alpha/AlphaInstrInfo.h index e20e8323b64e6..ee6077a4a01a1 100644 --- a/lib/Target/Alpha/AlphaInstrInfo.h +++ b/lib/Target/Alpha/AlphaInstrInfo.h @@ -30,12 +30,6 @@ public: /// virtual const AlphaRegisterInfo &getRegisterInfo() const { return RI; } - /// Return true if the instruction is a register to register move and return - /// the source and dest operands and their sub-register indices by reference. 
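Returning to the getNearPower2 fix above: a standalone illustration of why the 1ULL promotion matters (the sample value is invented).

#include <stdio.h>
#include <stdint.h>

int main() {
  unsigned at = 20;                   // e.g. a CountLeadingZeros_64 result
  // uint64_t bad = 1 << (63 - at);   // undefined: 32-bit int shifted by 43
  uint64_t good = 1ULL << (63 - at);  // well-defined 64-bit shift
  printf("%llu\n", (unsigned long long)good);   // 2^43 = 8796093022208
  return 0;
}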
- virtual bool isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; virtual unsigned isStoreToStackSlot(const MachineInstr *MI, diff --git a/lib/Target/Alpha/AlphaLLRP.cpp b/lib/Target/Alpha/AlphaLLRP.cpp index 34be470f03e30..85fbfd1affe21 100644 --- a/lib/Target/Alpha/AlphaLLRP.cpp +++ b/lib/Target/Alpha/AlphaLLRP.cpp @@ -39,7 +39,7 @@ namespace { static char ID; AlphaLLRPPass(AlphaTargetMachine &tm) - : MachineFunctionPass(&ID), TM(tm) { } + : MachineFunctionPass(ID), TM(tm) { } virtual const char *getPassName() const { return "Alpha NOP inserter"; diff --git a/lib/Target/Alpha/AlphaRegisterInfo.cpp b/lib/Target/Alpha/AlphaRegisterInfo.cpp index dc9d935ec047b..327ddb4d9a720 100644 --- a/lib/Target/Alpha/AlphaRegisterInfo.cpp +++ b/lib/Target/Alpha/AlphaRegisterInfo.cpp @@ -137,10 +137,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, //variable locals //<- SP -unsigned +void AlphaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value, - RegScavenger *RS) const { + int SPAdj, RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); unsigned i = 0; @@ -185,7 +184,6 @@ AlphaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } else { MI.getOperand(i).ChangeToImmediate(Offset); } - return 0; } diff --git a/lib/Target/Alpha/AlphaRegisterInfo.h b/lib/Target/Alpha/AlphaRegisterInfo.h index f9fd87a637374..b164979a63119 100644 --- a/lib/Target/Alpha/AlphaRegisterInfo.h +++ b/lib/Target/Alpha/AlphaRegisterInfo.h @@ -38,9 +38,8 @@ struct AlphaRegisterInfo : public AlphaGenRegisterInfo { MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; - unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; //void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; diff --git a/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp b/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp index 9f4aff6fd5f53..5428cb96173b2 100644 --- a/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp +++ b/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp @@ -53,8 +53,6 @@ namespace { void printOp(const MachineOperand &MO, raw_ostream &O); void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O); - void printBaseOffsetPair(const MachineInstr *MI, int i, raw_ostream &O, - bool brackets=true); virtual void EmitFunctionBodyStart(); virtual void EmitFunctionBodyEnd(); void EmitStartOfAsmFile(Module &M); diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.cpp b/lib/Target/Blackfin/BlackfinInstrInfo.cpp index a74d42d595496..e50d57a31b6ef 100644 --- a/lib/Target/Blackfin/BlackfinInstrInfo.cpp +++ b/lib/Target/Blackfin/BlackfinInstrInfo.cpp @@ -28,34 +28,6 @@ BlackfinInstrInfo::BlackfinInstrInfo(BlackfinSubtarget &ST) RI(ST, *this), Subtarget(ST) {} -/// Return true if the instruction is a register to register move and -/// leave the source and dest operands in the passed parameters. -bool BlackfinInstrInfo::isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, - unsigned &DstReg, - unsigned &SrcSR, - unsigned &DstSR) const { - SrcSR = DstSR = 0; // No sub-registers. 
- switch (MI.getOpcode()) { - case BF::MOVE: - case BF::MOVE_ncccc: - case BF::MOVE_ccncc: - case BF::MOVECC_zext: - case BF::MOVECC_nz: - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - case BF::SLL16i: - if (MI.getOperand(2).getImm()!=0) - return false; - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - default: - return false; - } -} - /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.h b/lib/Target/Blackfin/BlackfinInstrInfo.h index 6c3591707269f..fdc1029da5883 100644 --- a/lib/Target/Blackfin/BlackfinInstrInfo.h +++ b/lib/Target/Blackfin/BlackfinInstrInfo.h @@ -30,10 +30,6 @@ namespace llvm { /// always be able to get register info as well (through this method). virtual const BlackfinRegisterInfo &getRegisterInfo() const { return RI; } - virtual bool isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp index 06e95de1587f0..a51831263e909 100644 --- a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp +++ b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp @@ -190,10 +190,9 @@ static unsigned findScratchRegister(MachineBasicBlock::iterator II, return Reg; } -unsigned +void BlackfinRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value, - RegScavenger *RS) const { + int SPAdj, RegScavenger *RS) const { MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); @@ -230,20 +229,20 @@ BlackfinRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.setDesc(TII.get(isStore ? BF::STORE32p_uimm6m4 : BF::LOAD32p_uimm6m4)); - return 0; + return; } if (BaseReg == BF::FP && isUInt<7>(-Offset)) { MI.setDesc(TII.get(isStore ? BF::STORE32fp_nimm7m4 : BF::LOAD32fp_nimm7m4)); MI.getOperand(FIPos+1).setImm(-Offset); - return 0; + return; } if (isInt<18>(Offset)) { MI.setDesc(TII.get(isStore ? BF::STORE32p_imm18m4 : BF::LOAD32p_imm18m4)); - return 0; + return; } // Use RegScavenger to calculate proper offset... MI.dump(); @@ -328,7 +327,6 @@ BlackfinRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, llvm_unreachable("Cannot eliminate frame index"); break; } - return 0; } void BlackfinRegisterInfo:: @@ -344,10 +342,6 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } } -void BlackfinRegisterInfo:: -processFunctionBeforeFrameFinalized(MachineFunction &MF) const { -} - // Emit a prologue that sets up a stack frame. // On function entry, R0-R2 and P0 may hold arguments. 
// R3, P1, and P2 may be used as scratch registers diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.h b/lib/Target/Blackfin/BlackfinRegisterInfo.h index ead0b4a73c832..bb83c34f80032 100644 --- a/lib/Target/Blackfin/BlackfinRegisterInfo.h +++ b/lib/Target/Blackfin/BlackfinRegisterInfo.h @@ -51,15 +51,12 @@ namespace llvm { MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; - unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const; - void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; - void emitPrologue(MachineFunction &MF) const; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp index e8d8474b5be86..270fff6064adc 100644 --- a/lib/Target/CBackend/CBackend.cpp +++ b/lib/Target/CBackend/CBackend.cpp @@ -73,7 +73,7 @@ namespace { public: static char ID; CBackendNameAllUsedStructsAndMergeFunctions() - : ModulePass(&ID) {} + : ModulePass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<FindUsedTypes>(); } @@ -110,7 +110,7 @@ namespace { public: static char ID; explicit CWriter(formatted_raw_ostream &o) - : FunctionPass(&ID), Out(o), IL(0), Mang(0), LI(0), + : FunctionPass(ID), Out(o), IL(0), Mang(0), LI(0), TheModule(0), TAsm(0), TCtx(0), TD(0), OpaqueCounter(0), NextAnonValueNumber(0) { FPCounter = 0; @@ -199,7 +199,6 @@ namespace { void lowerIntrinsics(Function &F); - void printModule(Module *M); void printModuleTypes(const TypeSymbolTable &ST); void printContainedStructs(const Type *Ty, std::set<const Type *> &); void printFloatingPointConstants(Function &F); @@ -1300,6 +1299,13 @@ void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) { } std::string CWriter::GetValueName(const Value *Operand) { + + // Resolve potential alias. + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(Operand)) { + if (const Value *V = GA->resolveAliasedGlobal(false)) + Operand = V; + } + // Mangle globals with the standard mangler interface for LLC compatibility. 
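The alias hop added just above can be read as a small helper; a hedged sketch using the same calls as the patch (the wrapper name is invented). As used here, resolveAliasedGlobal(false) chases the alias chain and yields null when it cannot resolve it, in which case the original operand is kept:

#include "llvm/GlobalAlias.h"
#include "llvm/Support/Casting.h"

static const llvm::Value *resolveForNaming(const llvm::Value *Operand) {
  if (const llvm::GlobalAlias *GA = llvm::dyn_cast<llvm::GlobalAlias>(Operand))
    if (const llvm::Value *V = GA->resolveAliasedGlobal(false))
      return V;                     // name the aliasee, not the alias
  return Operand;
}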
if (const GlobalValue *GV = dyn_cast<GlobalValue>(Operand)) { SmallString<128> Str; diff --git a/lib/Target/CellSPU/SPUCallingConv.td b/lib/Target/CellSPU/SPUCallingConv.td index ec2f663908f68..04fa2ae866d69 100644 --- a/lib/Target/CellSPU/SPUCallingConv.td +++ b/lib/Target/CellSPU/SPUCallingConv.td @@ -1,4 +1,4 @@ -//===- SPUCallingConv.td - Calling Conventions for CellSPU ------*- C++ -*-===// +//===- SPUCallingConv.td - Calling Conventions for CellSPU -*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -19,16 +19,17 @@ class CCIfSubtarget<string F, CCAction A> // Return Value Calling Convention //===----------------------------------------------------------------------===// -// Return-value convention for Cell SPU: Everything can be passed back via $3: +// Return-value convention for Cell SPU: return value to be passed in reg 3-74 def RetCC_SPU : CallingConv<[ - CCIfType<[i8], CCAssignToReg<[R3]>>, - CCIfType<[i16], CCAssignToReg<[R3]>>, - CCIfType<[i32], CCAssignToReg<[R3]>>, - CCIfType<[i64], CCAssignToReg<[R3]>>, - CCIfType<[i128], CCAssignToReg<[R3]>>, - CCIfType<[f32, f64], CCAssignToReg<[R3]>>, - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[R3]>>, - CCIfType<[v2i32], CCAssignToReg<[R3]>> + CCIfType<[i8,i16,i32,i64,i128,f32,f64,v16i8,v8i16,v4i32,v2i64,v4f32,v2f64], + CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, + R12, R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, R29, + R30, R31, R32, R33, R34, R35, R36, R37, R38, + R39, R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, R56, + R57, R58, R59, R60, R61, R62, R63, R64, R65, + R66, R67, R68, R69, R70, R71, R72, R73, R74]>> ]>; @@ -45,8 +46,7 @@ def CCC_SPU : CallingConv<[ R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, - R66, R67, R68, R69, R70, R71, R72, R73, R74, - R75, R76, R77, R78, R79]>>, + R66, R67, R68, R69, R70, R71, R72, R73, R74]>>, // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index 9b8c2ddd06359..2f1598441f5ae 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -41,13 +41,6 @@ using namespace llvm; namespace { //! ConstantSDNode predicate for i32 sign-extended, 10-bit immediates bool - isI64IntS10Immediate(ConstantSDNode *CN) - { - return isInt<10>(CN->getSExtValue()); - } - - //! ConstantSDNode predicate for i32 sign-extended, 10-bit immediates - bool isI32IntS10Immediate(ConstantSDNode *CN) { return isInt<10>(CN->getSExtValue()); @@ -67,14 +60,6 @@ namespace { return isInt<10>(CN->getSExtValue()); } - //! SDNode predicate for i16 sign-extended, 10-bit immediate values - bool - isI16IntS10Immediate(SDNode *N) - { - ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N); - return (CN != 0 && isI16IntS10Immediate(CN)); - } - //! ConstantSDNode predicate for i16 unsigned 10-bit immediate values bool isI16IntU10Immediate(ConstantSDNode *CN) @@ -82,14 +67,6 @@ namespace { return isUInt<10>((short) CN->getZExtValue()); } - //! SDNode predicate for i16 sign-extended, 10-bit immediate values - bool - isI16IntU10Immediate(SDNode *N) - { - return (N->getOpcode() == ISD::Constant - && isI16IntU10Immediate(cast<ConstantSDNode>(N))); - } - //! 
ConstantSDNode predicate for signed 16-bit values /*! \arg CN The constant SelectionDAG node holding the value @@ -119,14 +96,6 @@ namespace { return false; } - //! SDNode predicate for signed 16-bit values. - bool - isIntS16Immediate(SDNode *N, short &Imm) - { - return (N->getOpcode() == ISD::Constant - && isIntS16Immediate(cast<ConstantSDNode>(N), Imm)); - } - //! ConstantFPSDNode predicate for representing floats as 16-bit sign ext. static bool isFPS16Immediate(ConstantFPSDNode *FPN, short &Imm) @@ -142,16 +111,6 @@ namespace { return false; } - bool - isHighLow(const SDValue &Op) - { - return (Op.getOpcode() == SPUISD::IndirectAddr - && ((Op.getOperand(0).getOpcode() == SPUISD::Hi - && Op.getOperand(1).getOpcode() == SPUISD::Lo) - || (Op.getOperand(0).getOpcode() == SPUISD::Lo - && Op.getOperand(1).getOpcode() == SPUISD::Hi))); - } - //===------------------------------------------------------------------===// //! EVT to "useful stuff" mapping structure: @@ -607,7 +566,8 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Base, return true; } else if (Opc == ISD::Register ||Opc == ISD::CopyFromReg - ||Opc == ISD::UNDEF) { + ||Opc == ISD::UNDEF + ||Opc == ISD::Constant) { unsigned OpOpc = Op->getOpcode(); if (OpOpc == ISD::STORE || OpOpc == ISD::LOAD) { diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index ece19b9b89f6d..46f31899be0c4 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -426,9 +426,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass); addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass); - // "Odd size" vector classes that we're willing to support: - addRegisterClass(MVT::v2i32, SPU::VECREGRegisterClass); - for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { MVT::SimpleValueType VT = (MVT::SimpleValueType)i; @@ -751,7 +748,6 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { if (alignment == 16) { ConstantSDNode *CN; - // Special cases for a known aligned load to simplify the base pointer // and insertion byte: if (basePtr.getOpcode() == ISD::ADD @@ -775,6 +771,9 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, DAG.getConstant(0, PtrVT)); + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); } } else { // Unaligned load: must be more pessimistic about addressing modes: @@ -811,8 +810,8 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { DAG.getConstant(0, PtrVT)); } - // Re-emit as a v16i8 vector load - alignLoadVec = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr, + // Load the memory to which to store. 
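That comment is the heart of the CellSPU store lowering: the SPU only has 16-byte quadword memory operations, so a scalar store becomes read-modify-write, and the quadword load follows below. A worked sketch of the byte-lane arithmetic involved (addresses invented; the DAG nodes above do the real work):

#include <stdio.h>

int main() {
  unsigned Addr = 0x1006;               // hypothetical 2-byte store target
  unsigned QuadBase = Addr & ~15u;      // containing quadword: 0x1000
  unsigned Lane = Addr & 15u;           // byte lane within it: 6
  // Lower as: load the quadword at QuadBase, SHUFB the new value into
  // bytes [Lane, Lane+1], then store the whole quadword back.
  printf("base=0x%x lane=%u\n", QuadBase, Lane);
  return 0;
}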
+ alignLoadVec = DAG.getLoad(vecVT, dl, the_chain, basePtr, SN->getSrcValue(), SN->getSrcValueOffset(), SN->isVolatile(), SN->isNonTemporal(), 16); @@ -843,10 +842,10 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { } #endif - SDValue insertEltOp = - DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT, insertEltOffs); - SDValue vectorizeOp = - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, theValue); + SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT, + insertEltOffs); + SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, + theValue); result = DAG.getNode(SPUISD::SHUFB, dl, vecVT, vectorizeOp, alignLoadVec, @@ -1325,41 +1324,23 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (Ins.empty()) return Chain; + // Now handle the return value(s) + SmallVector<CCValAssign, 16> RVLocs; + CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(), + RVLocs, *DAG.getContext()); + CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU); + + // If the call has results, copy the values out of the ret val registers. - switch (Ins[0].VT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Unexpected ret value!"); - case MVT::Other: break; - case MVT::i32: - if (Ins.size() > 1 && Ins[1].VT == MVT::i32) { - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R4, - MVT::i32, InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32, - Chain.getValue(2)).getValue(1); - InVals.push_back(Chain.getValue(0)); - } else { - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32, - InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - } - break; - case MVT::i8: - case MVT::i16: - case MVT::i64: - case MVT::i128: - case MVT::f32: - case MVT::f64: - case MVT::v2f64: - case MVT::v2i64: - case MVT::v4f32: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v16i8: - Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, Ins[0].VT, - InFlag).getValue(1); - InVals.push_back(Chain.getValue(0)); - break; - } + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + + SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), + InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + InVals.push_back(Val); + } return Chain; } @@ -1621,10 +1602,6 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType()); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T); } - case MVT::v2i32: { - SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType()); - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T); - } case MVT::v2i64: { return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl); } @@ -1748,11 +1725,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // If we have a single element being moved from V1 to V2, this can be handled // using the C*[DX] compute mask instructions, but the vector elements have - // to be monotonically increasing with one exception element. + // to be monotonically increasing with one exception element, and the source + // slot of the element to move must be the same as the destination. 
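Illustrative numbers for the byte-offset computation that follows; the element values are invented, the formula is the one in the patch (the old code hard-wired a 4-byte element size via << 2):

#include <stdio.h>

int main() {
  unsigned MaxElts = 8, EltSizeInBits = 16;   // a hypothetical v8i16 shuffle
  unsigned V2EltIdx0 = MaxElts;               // first mask index naming V2
  unsigned SrcElt = 13;                       // result slot 5 takes V2's elt 5
  unsigned V2EltOffset = (SrcElt - V2EltIdx0) * (EltSizeInBits / 8);
  printf("V2EltOffset = %u bytes\n", V2EltOffset);   // 10
  return 0;
}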
EVT VecVT = V1.getValueType(); EVT EltVT = VecVT.getVectorElementType(); unsigned EltsFromV2 = 0; - unsigned V2Elt = 0; + unsigned V2EltOffset = 0; unsigned V2EltIdx0 = 0; unsigned CurrElt = 0; unsigned MaxElts = VecVT.getVectorNumElements(); @@ -1785,9 +1763,13 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if (monotonic) { if (SrcElt >= V2EltIdx0) { - if (1 >= (++EltsFromV2)) { - V2Elt = (V2EltIdx0 - SrcElt) << 2; - } + // TODO: optimize for the monotonic case when several consecutive + // elements are taken from V2. Do we ever get such a case? + if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0)) + V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8); + else + monotonic = false; + ++EltsFromV2; } else if (CurrElt != SrcElt) { monotonic = false; } @@ -1823,7 +1805,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // R1 ($sp) is used here only as it is guaranteed to have last bits zero SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, DAG.getRegister(SPU::R1, PtrVT), - DAG.getConstant(V2Elt, MVT::i32)); + DAG.getConstant(V2EltOffset, MVT::i32)); SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer); @@ -1847,7 +1829,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { for (unsigned j = 0; j < BytesPerElement; ++j) ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8)); } - SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &ResultMask[0], ResultMask.size()); return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask); @@ -1997,7 +1978,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { // Variable index: Rotate the requested element into slot 0, then replicate // slot 0 across the vector EVT VecVT = N.getValueType(); - if (!VecVT.isSimple() || !VecVT.isVector() || !VecVT.is128BitVector()) { + if (!VecVT.isSimple() || !VecVT.isVector()) { report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit" "vector type!"); } @@ -2072,21 +2053,25 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { SDValue IdxOp = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); + EVT eltVT = ValOp.getValueType(); // use 0 when the lane to insert to is 'undef' - int64_t Idx=0; + int64_t Offset=0; if (IdxOp.getOpcode() != ISD::UNDEF) { ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp); assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!"); - Idx = (CN->getSExtValue()); + Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8; } EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Use $sp ($1) because it's always 16-byte aligned and it's available: SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, DAG.getRegister(SPU::R1, PtrVT), - DAG.getConstant(Idx, PtrVT)); - SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, VT, Pointer); + DAG.getConstant(Offset, PtrVT)); + // widen the mask when dealing with half vectors + EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(), + 128/ VT.getVectorElementType().getSizeInBits()); + SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer); SDValue result = DAG.getNode(SPUISD::SHUFB, dl, VT, diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp index 69aa0887bd77f..26d6b4f25ef12 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.cpp +++ b/lib/Target/CellSPU/SPUInstrInfo.cpp @@ -54,148 +54,6 @@
SPUInstrInfo::SPUInstrInfo(SPUTargetMachine &tm) RI(*TM.getSubtargetImpl(), *this) { /* NOP */ } -bool -SPUInstrInfo::isMoveInstr(const MachineInstr& MI, - unsigned& sourceReg, - unsigned& destReg, - unsigned& SrcSR, unsigned& DstSR) const { - SrcSR = DstSR = 0; // No sub-registers. - - switch (MI.getOpcode()) { - default: - break; - case SPU::ORIv4i32: - case SPU::ORIr32: - case SPU::ORHIv8i16: - case SPU::ORHIr16: - case SPU::ORHIi8i16: - case SPU::ORBIv16i8: - case SPU::ORBIr8: - case SPU::ORIi16i32: - case SPU::ORIi8i32: - case SPU::AHIvec: - case SPU::AHIr16: - case SPU::AIv4i32: - assert(MI.getNumOperands() == 3 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - MI.getOperand(2).isImm() && - "invalid SPU ORI/ORHI/ORBI/AHI/AI/SFI/SFHI instruction!"); - if (MI.getOperand(2).getImm() == 0) { - sourceReg = MI.getOperand(1).getReg(); - destReg = MI.getOperand(0).getReg(); - return true; - } - break; - case SPU::AIr32: - assert(MI.getNumOperands() == 3 && - "wrong number of operands to AIr32"); - if (MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - (MI.getOperand(2).isImm() && - MI.getOperand(2).getImm() == 0)) { - sourceReg = MI.getOperand(1).getReg(); - destReg = MI.getOperand(0).getReg(); - return true; - } - break; - case SPU::LRr8: - case SPU::LRr16: - case SPU::LRr32: - case SPU::LRf32: - case SPU::LRr64: - case SPU::LRf64: - case SPU::LRr128: - case SPU::LRv16i8: - case SPU::LRv8i16: - case SPU::LRv4i32: - case SPU::LRv4f32: - case SPU::LRv2i64: - case SPU::LRv2f64: - case SPU::ORv16i8_i8: - case SPU::ORv8i16_i16: - case SPU::ORv4i32_i32: - case SPU::ORv2i64_i64: - case SPU::ORv4f32_f32: - case SPU::ORv2f64_f64: - case SPU::ORi8_v16i8: - case SPU::ORi16_v8i16: - case SPU::ORi32_v4i32: - case SPU::ORi64_v2i64: - case SPU::ORf32_v4f32: - case SPU::ORf64_v2f64: -/* - case SPU::ORi128_r64: - case SPU::ORi128_f64: - case SPU::ORi128_r32: - case SPU::ORi128_f32: - case SPU::ORi128_r16: - case SPU::ORi128_r8: -*/ - case SPU::ORi128_vec: -/* - case SPU::ORr64_i128: - case SPU::ORf64_i128: - case SPU::ORr32_i128: - case SPU::ORf32_i128: - case SPU::ORr16_i128: - case SPU::ORr8_i128: -*/ - case SPU::ORvec_i128: -/* - case SPU::ORr16_r32: - case SPU::ORr8_r32: - case SPU::ORf32_r32: - case SPU::ORr32_f32: - case SPU::ORr32_r16: - case SPU::ORr32_r8: - case SPU::ORr16_r64: - case SPU::ORr8_r64: - case SPU::ORr64_r16: - case SPU::ORr64_r8: -*/ - case SPU::ORr64_r32: - case SPU::ORr32_r64: - case SPU::ORf32_r32: - case SPU::ORr32_f32: - case SPU::ORf64_r64: - case SPU::ORr64_f64: { - assert(MI.getNumOperands() == 2 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - "invalid SPU OR<type>_<vec> or LR instruction!"); - sourceReg = MI.getOperand(1).getReg(); - destReg = MI.getOperand(0).getReg(); - return true; - break; - } - case SPU::ORv16i8: - case SPU::ORv8i16: - case SPU::ORv4i32: - case SPU::ORv2i64: - case SPU::ORr8: - case SPU::ORr16: - case SPU::ORr32: - case SPU::ORr64: - case SPU::ORr128: - case SPU::ORf32: - case SPU::ORf64: - assert(MI.getNumOperands() == 3 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - MI.getOperand(2).isReg() && - "invalid SPU OR(vec|r32|r64|gprc) instruction!"); - if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { - sourceReg = MI.getOperand(1).getReg(); - destReg = MI.getOperand(0).getReg(); - return true; - } - break; - } - - return false; -} - unsigned SPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { diff --git a/lib/Target/CellSPU/SPUInstrInfo.h 
b/lib/Target/CellSPU/SPUInstrInfo.h index fbb1733181486..191e55d0ca612 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.h +++ b/lib/Target/CellSPU/SPUInstrInfo.h @@ -32,12 +32,6 @@ namespace llvm { /// virtual const SPURegisterInfo &getRegisterInfo() const { return RI; } - /// Return true if the instruction is a register to register move and return - /// the source and dest operands and their sub-register indices by reference. - virtual bool isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; unsigned isStoreToStackSlot(const MachineInstr *MI, diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index a7fb14c26a76c..ca0fe00e37f89 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -62,8 +62,6 @@ let canFoldAsLoad = 1 in { def v4f32: LoadDFormVec<v4f32>; def v2f64: LoadDFormVec<v2f64>; - def v2i32: LoadDFormVec<v2i32>; - def r128: LoadDForm<GPRC>; def r64: LoadDForm<R64C>; def r32: LoadDForm<R32C>; @@ -96,8 +94,6 @@ let canFoldAsLoad = 1 in { def v4f32: LoadAFormVec<v4f32>; def v2f64: LoadAFormVec<v2f64>; - def v2i32: LoadAFormVec<v2i32>; - def r128: LoadAForm<GPRC>; def r64: LoadAForm<R64C>; def r32: LoadAForm<R32C>; @@ -130,8 +126,6 @@ let canFoldAsLoad = 1 in { def v4f32: LoadXFormVec<v4f32>; def v2f64: LoadXFormVec<v2f64>; - def v2i32: LoadXFormVec<v2i32>; - def r128: LoadXForm<GPRC>; def r64: LoadXForm<R64C>; def r32: LoadXForm<R32C>; @@ -180,8 +174,6 @@ multiclass StoreDForms def v4f32: StoreDFormVec<v4f32>; def v2f64: StoreDFormVec<v2f64>; - def v2i32: StoreDFormVec<v2i32>; - def r128: StoreDForm<GPRC>; def r64: StoreDForm<R64C>; def r32: StoreDForm<R32C>; @@ -212,8 +204,6 @@ multiclass StoreAForms def v4f32: StoreAFormVec<v4f32>; def v2f64: StoreAFormVec<v2f64>; - def v2i32: StoreAFormVec<v2i32>; - def r128: StoreAForm<GPRC>; def r64: StoreAForm<R64C>; def r32: StoreAForm<R32C>; @@ -246,8 +236,6 @@ multiclass StoreXForms def v4f32: StoreXFormVec<v4f32>; def v2f64: StoreXFormVec<v2f64>; - def v2i32: StoreXFormVec<v2i32>; - def r128: StoreXForm<GPRC>; def r64: StoreXForm<R64C>; def r32: StoreXForm<R32C>; @@ -607,7 +595,6 @@ class ARegInst<RegisterClass rclass>: multiclass AddInstruction { def v4i32: AVecInst<v4i32>; def v16i8: AVecInst<v16i8>; - def r32: ARegInst<R32C>; } @@ -672,6 +659,7 @@ def SFvec : RRForm<0b00000010000, (outs VECREG:$rT), "sf\t$rT, $rA, $rB", IntegerOp, [(set (v4i32 VECREG:$rT), (sub (v4i32 VECREG:$rB), (v4i32 VECREG:$rA)))]>; + def SFr32 : RRForm<0b00000010000, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB), "sf\t$rT, $rA, $rB", IntegerOp, [(set R32C:$rT, (sub R32C:$rB, R32C:$rA))]>; @@ -1448,6 +1436,9 @@ class ORCvtGPRCVec: class ORCvtVecGPRC: ORCvtForm<(outs GPRC:$rT), (ins VECREG:$rA)>; +class ORCvtVecVec: + ORCvtForm<(outs VECREG:$rT), (ins VECREG:$rA)>; + multiclass BitwiseOr { def v16i8: ORVecInst<v16i8>; @@ -3894,6 +3885,79 @@ multiclass SFPSub defm FS : SFPSub; +class FMInst<dag OOL, dag IOL, list<dag> pattern>: + RRForm<0b01100011010, OOL, IOL, + "fm\t$rT, $rA, $rB", SPrecFP, + pattern>; + +class FMVecInst<ValueType type>: + FMInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + [(set (type VECREG:$rT), + (fmul (type VECREG:$rA), (type VECREG:$rB)))]>; + +multiclass SFPMul +{ + def v4f32: FMVecInst<v4f32>; + def f32: FMInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB), + [(set R32FP:$rT, (fmul R32FP:$rA, R32FP:$rB))]>; +} + +defm FM : 
SFPMul; + +// Floating point multiply and add +// e.g. d = c + (a * b) +def FMAv4f32: + RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "fma\t$rT, $rA, $rB, $rC", SPrecFP, + [(set (v4f32 VECREG:$rT), + (fadd (v4f32 VECREG:$rC), + (fmul (v4f32 VECREG:$rA), (v4f32 VECREG:$rB))))]>; + +def FMAf32: + RRRForm<0b0111, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC), + "fma\t$rT, $rA, $rB, $rC", SPrecFP, + [(set R32FP:$rT, (fadd R32FP:$rC, (fmul R32FP:$rA, R32FP:$rB)))]>; + +// FP multiply and subtract +// Subtracts value in rC from product +// res = a * b - c +def FMSv4f32 : + RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "fms\t$rT, $rA, $rB, $rC", SPrecFP, + [(set (v4f32 VECREG:$rT), + (fsub (fmul (v4f32 VECREG:$rA), (v4f32 VECREG:$rB)), + (v4f32 VECREG:$rC)))]>; + +def FMSf32 : + RRRForm<0b0111, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC), + "fms\t$rT, $rA, $rB, $rC", SPrecFP, + [(set R32FP:$rT, + (fsub (fmul R32FP:$rA, R32FP:$rB), R32FP:$rC))]>; + +// Floating Negative Multiply and Subtract +// Subtracts product from value in rC +// res = fneg(fms a b c) +// = - (a * b - c) +// = c - a * b +// NOTE: subtraction order +// fsub a b = a - b +// fs a b = b - a? +def FNMSf32 : + RRRForm<0b1101, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC), + "fnms\t$rT, $rA, $rB, $rC", SPrecFP, + [(set R32FP:$rT, (fsub R32FP:$rC, (fmul R32FP:$rA, R32FP:$rB)))]>; + +def FNMSv4f32 : + RRRForm<0b1101, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + "fnms\t$rT, $rA, $rB, $rC", SPrecFP, + [(set (v4f32 VECREG:$rT), + (fsub (v4f32 VECREG:$rC), + (fmul (v4f32 VECREG:$rA), + (v4f32 VECREG:$rB))))]>; + + + + // Floating point reciprocal estimate class FRESTInst<dag OOL, dag IOL>: @@ -4019,72 +4083,6 @@ def FSCRRf32 : // status and control register read //-------------------------------------- -// Floating point multiply instructions //-------------------------------------- - -def FMv4f32: - RRForm<0b00100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), - "fm\t$rT, $rA, $rB", SPrecFP, - [(set (v4f32 VECREG:$rT), (fmul (v4f32 VECREG:$rA), - (v4f32 VECREG:$rB)))]>; - -def FMf32 : - RRForm<0b01100011010, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB), - "fm\t$rT, $rA, $rB", SPrecFP, - [(set R32FP:$rT, (fmul R32FP:$rA, R32FP:$rB))]>; - -// Floating point multiply and add -// e.g.
d = c + (a * b) -def FMAv4f32: - RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), - "fma\t$rT, $rA, $rB, $rC", SPrecFP, - [(set (v4f32 VECREG:$rT), - (fadd (v4f32 VECREG:$rC), - (fmul (v4f32 VECREG:$rA), (v4f32 VECREG:$rB))))]>; - -def FMAf32: - RRRForm<0b0111, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC), - "fma\t$rT, $rA, $rB, $rC", SPrecFP, - [(set R32FP:$rT, (fadd R32FP:$rC, (fmul R32FP:$rA, R32FP:$rB)))]>; - -// FP multiply and subtract -// Subtracts value in rC from product -// res = a * b - c -def FMSv4f32 : - RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), - "fms\t$rT, $rA, $rB, $rC", SPrecFP, - [(set (v4f32 VECREG:$rT), - (fsub (fmul (v4f32 VECREG:$rA), (v4f32 VECREG:$rB)), - (v4f32 VECREG:$rC)))]>; - -def FMSf32 : - RRRForm<0b0111, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC), - "fms\t$rT, $rA, $rB, $rC", SPrecFP, - [(set R32FP:$rT, - (fsub (fmul R32FP:$rA, R32FP:$rB), R32FP:$rC))]>; - -// Floating Negative Mulitply and Subtract -// Subtracts product from value in rC -// res = fneg(fms a b c) -// = - (a * b - c) -// = c - a * b -// NOTE: subtraction order -// fsub a b = a - b -// fs a b = b - a? -def FNMSf32 : - RRRForm<0b1101, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC), - "fnms\t$rT, $rA, $rB, $rC", SPrecFP, - [(set R32FP:$rT, (fsub R32FP:$rC, (fmul R32FP:$rA, R32FP:$rB)))]>; - -def FNMSv4f32 : - RRRForm<0b1101, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), - "fnms\t$rT, $rA, $rB, $rC", SPrecFP, - [(set (v4f32 VECREG:$rT), - (fsub (v4f32 VECREG:$rC), - (fmul (v4f32 VECREG:$rA), - (v4f32 VECREG:$rB))))]>; - -//-------------------------------------- // Floating Point Conversions // Signed conversions: def CSiFv4f32: diff --git a/lib/Target/CellSPU/SPUOperands.td b/lib/Target/CellSPU/SPUOperands.td index 6216651e48a40..e1a0358abc46f 100644 --- a/lib/Target/CellSPU/SPUOperands.td +++ b/lib/Target/CellSPU/SPUOperands.td @@ -98,12 +98,6 @@ def immU8 : PatLeaf<(imm), [{ return (N->getZExtValue() <= 0xff); }]>; -// i64ImmSExt10 predicate - True if the i64 immediate fits in a 10-bit sign -// extended field. Used by RI10Form instructions like 'ldq'. -def i64ImmSExt10 : PatLeaf<(imm), [{ - return isI64IntS10Immediate(N); -}]>; - // i32ImmSExt10 predicate - True if the i32 immediate fits in a 10-bit sign // extended field. Used by RI10Form instructions like 'ldq'. def i32ImmSExt10 : PatLeaf<(imm), [{ diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp index f7cfa42f2a95a..cf718917a5616 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -270,9 +270,8 @@ SPURegisterInfo::eliminateCallFramePseudoInstr(MachineFunction &MF, MBB.erase(I); } -unsigned +void SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, - FrameIndexValue *Value, RegScavenger *RS) const { unsigned i = 0; @@ -328,7 +327,6 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, } else { MO.ChangeToImmediate(Offset); } - return 0; } /// determineFrameLayout - Determine the size of the frame and maximum call @@ -417,7 +415,7 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const if (hasDebugInfo) { // Mark effective beginning of when frame pointer becomes valid. 
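The FMA/FMS/FNMS patterns above encode three scalar identities: fma = c + a*b, fms = a*b - c, and fnms = c - a*b, i.e. the negation of fms. A tiny self-contained check of that arithmetic (illustration only; it does not model the SPU instructions or their rounding behavior):

#include <cassert>

int main() {
  float a = 2.0f, b = 3.0f, c = 10.0f;
  float fma  = c + a * b;  // FMA pattern: 16
  float fms  = a * b - c;  // FMS pattern: -4
  float fnms = c - a * b;  // FNMS pattern: 4
  assert(fma == 16.0f && fms == -4.0f && fnms == 4.0f);
  assert(fnms == -fms);    // matches "res = fneg(fms a b c)" in the comment
  return 0;
}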
FrameLabel = MMI.getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, dl, TII.get(SPU::DBG_LABEL)).addSym(FrameLabel); + BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)).addSym(FrameLabel); } // Adjust stack pointer, spilling $lr -> 16($sp) and $sp -> -FrameSize($sp) @@ -476,7 +474,7 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const // Mark effective beginning of when frame pointer is ready. MCSymbol *ReadyLabel = MMI.getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, dl, TII.get(SPU::DBG_LABEL)).addSym(ReadyLabel); + BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)).addSym(ReadyLabel); MachineLocation FPDst(SPU::R1); MachineLocation FPSrc(MachineLocation::VirtualFP); @@ -491,7 +489,7 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const dl = MBBI->getDebugLoc(); // Insert terminator label - BuildMI(MBB, MBBI, dl, TII.get(SPU::DBG_LABEL)) + BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)) .addSym(MMI.getContext().CreateTempSymbol()); } } @@ -587,6 +585,7 @@ SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const case SPU::LQDr32: return SPU::LQXr32; case SPU::LQDr128: return SPU::LQXr128; case SPU::LQDv16i8: return SPU::LQXv16i8; + case SPU::LQDv4i32: return SPU::LQXv4i32; case SPU::LQDv4f32: return SPU::LQXv4f32; case SPU::STQDr32: return SPU::STQXr32; case SPU::STQDr128: return SPU::STQXr128; diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h index 7a6ae6d43c7ed..aedb769cb4fc8 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.h +++ b/lib/Target/CellSPU/SPURegisterInfo.h @@ -63,9 +63,8 @@ namespace llvm { MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; //! Convert frame indices into machine operands - unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, - FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, + RegScavenger *RS = NULL) const; //!
Determine the frame's layout void determineFrameLayout(MachineFunction &MF) const; diff --git a/lib/Target/CellSPU/SPURegisterInfo.td b/lib/Target/CellSPU/SPURegisterInfo.td index bb88f2bf9a298..3e8f0979256af 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.td +++ b/lib/Target/CellSPU/SPURegisterInfo.td @@ -394,7 +394,7 @@ def R8C : RegisterClass<"SPU", [i8], 128, // The SPU's registers as vector registers: def VECREG : RegisterClass<"SPU", - [v16i8,v8i16,v2i32,v4i32,v4f32,v2i64,v2f64], + [v16i8,v8i16,v4i32,v4f32,v2i64,v2f64], 128, [ /* volatile register */ diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index 145568adcd4af..f08559f6e9f2e 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -104,7 +104,7 @@ namespace { public: static char ID; explicit CppWriter(formatted_raw_ostream &o) : - ModulePass(&ID), Out(o), uniqueNum(0), is_inline(false), indent_level(0){} + ModulePass(ID), Out(o), uniqueNum(0), is_inline(false), indent_level(0){} virtual const char *getPassName() const { return "C++ backend"; } @@ -288,6 +288,8 @@ void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) { Out << "GlobalValue::LinkerPrivateLinkage"; break; case GlobalValue::LinkerPrivateWeakLinkage: Out << "GlobalValue::LinkerPrivateWeakLinkage"; break; + case GlobalValue::LinkerPrivateWeakDefAutoLinkage: + Out << "GlobalValue::LinkerPrivateWeakDefAutoLinkage"; break; case GlobalValue::AvailableExternallyLinkage: Out << "GlobalValue::AvailableExternallyLinkage "; break; case GlobalValue::LinkOnceAnyLinkage: @@ -471,14 +473,22 @@ void CppWriter::printAttributes(const AttrListPtr &PAL, HANDLE_ATTR(Nest); HANDLE_ATTR(ReadNone); HANDLE_ATTR(ReadOnly); - HANDLE_ATTR(InlineHint); HANDLE_ATTR(NoInline); HANDLE_ATTR(AlwaysInline); HANDLE_ATTR(OptimizeForSize); HANDLE_ATTR(StackProtect); HANDLE_ATTR(StackProtectReq); HANDLE_ATTR(NoCapture); + HANDLE_ATTR(NoRedZone); + HANDLE_ATTR(NoImplicitFloat); + HANDLE_ATTR(Naked); + HANDLE_ATTR(InlineHint); #undef HANDLE_ATTR + if (attrs & Attribute::StackAlignment) + Out << " | Attribute::constructStackAlignmentFromInt(" + << Attribute::getStackAlignmentFromAttrs(attrs) + << ")"; + attrs &= ~Attribute::StackAlignment; assert(attrs == 0 && "Unhandled attribute!"); Out << ";"; nl(Out); @@ -1404,7 +1414,8 @@ void CppWriter::printInstruction(const Instruction *I, nl(Out); } Out << "CallInst* " << iName << " = CallInst::Create(" - << opNames[call->getNumArgOperands()] << ", " << iName << "_params.begin(), " + << opNames[call->getNumArgOperands()] << ", " + << iName << "_params.begin(), " << iName << "_params.end(), \""; } else if (call->getNumArgOperands() == 1) { Out << "CallInst* " << iName << " = CallInst::Create(" diff --git a/lib/Target/MBlaze/AsmPrinter/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/AsmPrinter/MBlazeAsmPrinter.cpp index b6e4d654f4aa7..f4b30ad271f16 100644 --- a/lib/Target/MBlaze/AsmPrinter/MBlazeAsmPrinter.cpp +++ b/lib/Target/MBlaze/AsmPrinter/MBlazeAsmPrinter.cpp @@ -65,11 +65,8 @@ namespace { void printFSLImm(const MachineInstr *MI, int opNum, raw_ostream &O); void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier = 0); - void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, - const char *Modifier = 0); void printSavedRegsBitmask(raw_ostream &OS); - const char *emitCurrentABIString(); void emitFrameDirective(); void printInstruction(const MachineInstr *MI, raw_ostream &O); @@ -292,13 +289,6 @@ printMemOperand(const MachineInstr
*MI, int opNum, raw_ostream &O, printOperand(MI, opNum, O); } -void MBlazeAsmPrinter:: -printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, - const char *Modifier) { - const MachineOperand& MO = MI->getOperand(opNum); - O << MBlaze::MBlazeFCCToString((MBlaze::CondCode)MO.getImm()); -} - // Force static initialization. extern "C" void LLVMInitializeMBlazeAsmPrinter() { RegisterAsmPrinter<MBlazeAsmPrinter> X(TheMBlazeTarget); diff --git a/lib/Target/MBlaze/MBlaze.td b/lib/Target/MBlaze/MBlaze.td index 482ddd3963fba..3815b6d0a398d 100644 --- a/lib/Target/MBlaze/MBlaze.td +++ b/lib/Target/MBlaze/MBlaze.td @@ -1,4 +1,4 @@ -//===- MBlaze.td - Describe the MBlaze Target Machine -----------*- C++ -*-===// +//===- MBlaze.td - Describe the MBlaze Target Machine ------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/MBlaze/MBlazeCallingConv.td b/lib/Target/MBlaze/MBlazeCallingConv.td index ddd49980e0a29..8622e0d74bcd5 100644 --- a/lib/Target/MBlaze/MBlazeCallingConv.td +++ b/lib/Target/MBlaze/MBlazeCallingConv.td @@ -1,4 +1,4 @@ -//===- MBlazeCallingConv.td - Calling Conventions for MBlaze ----*- C++ -*-===// +//===- MBlazeCallingConv.td - Calling Conventions for MBlaze -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp index 42fea25073327..b551b79b291ed 100644 --- a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp +++ b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp @@ -32,7 +32,7 @@ namespace { static char ID; Filler(TargetMachine &tm) - : MachineFunctionPass(&ID), TM(tm), TII(tm.getInstrInfo()) { } + : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { } virtual const char *getPassName() const { return "MBlaze Delay Slot Filler"; diff --git a/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp b/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp index c7cd5f4e44a98..e64dd0e3e2c3b 100644 --- a/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp +++ b/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp @@ -219,7 +219,7 @@ SelectAddr(SDNode *Op, SDValue Addr, SDValue &Offset, SDValue &Base) { // Operand is a result from an ADD. 
if (Addr.getOpcode() == ISD::ADD) { if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { - if (Predicate_immSExt16(CN)) { + if (isUInt<16>(CN->getZExtValue())) { // If the first operand is a FI, get the TargetFI Node if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode> diff --git a/lib/Target/MBlaze/MBlazeInstrFPU.td b/lib/Target/MBlaze/MBlazeInstrFPU.td index a48a8c9723537..657b1d4940a70 100644 --- a/lib/Target/MBlaze/MBlazeInstrFPU.td +++ b/lib/Target/MBlaze/MBlazeInstrFPU.td @@ -1,4 +1,4 @@ -//===- MBlazeInstrFPU.td - MBlaze FPU Instruction defs ----------*- C++ -*-===// +//===- MBlazeInstrFPU.td - MBlaze FPU Instruction defs -----*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/MBlaze/MBlazeInstrFSL.td b/lib/Target/MBlaze/MBlazeInstrFSL.td index b59999e76ae58..51584111e6661 100644 --- a/lib/Target/MBlaze/MBlazeInstrFSL.td +++ b/lib/Target/MBlaze/MBlazeInstrFSL.td @@ -1,4 +1,4 @@ -//===- MBlazeInstrFSL.td - MBlaze FSL Instruction defs ----------*- C++ -*-===// +//===- MBlazeInstrFSL.td - MBlaze FSL Instruction defs -----*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/MBlaze/MBlazeInstrFormats.td b/lib/Target/MBlaze/MBlazeInstrFormats.td index 7d655433d4e4d..28e8e4402225b 100644 --- a/lib/Target/MBlaze/MBlazeInstrFormats.td +++ b/lib/Target/MBlaze/MBlazeInstrFormats.td @@ -1,4 +1,4 @@ -//===- MBlazeInstrFormats.td - MB Instruction defs --------------*- C++ -*-===// +//===- MBlazeInstrFormats.td - MB Instruction defs ---------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp index 6ff5825a26b6e..b590c090e095a 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.cpp +++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp @@ -30,41 +30,6 @@ static bool isZeroImm(const MachineOperand &op) { return op.isImm() && op.getImm() == 0; } -/// Return true if the instruction is a register to register move and -/// leave the source and dest operands in the passed parameters. -bool MBlazeInstrInfo:: -isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const { - SrcSubIdx = DstSubIdx = 0; // No sub-registers. - - // add $dst, $src, $zero || addu $dst, $zero, $src - // or $dst, $src, $zero || or $dst, $zero, $src - if ((MI.getOpcode() == MBlaze::ADD) || (MI.getOpcode() == MBlaze::OR)) { - if (MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == MBlaze::R0) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(2).getReg(); - return true; - } else if (MI.getOperand(2).isReg() && - MI.getOperand(2).getReg() == MBlaze::R0) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - } - } - - // addi $dst, $src, 0 - // ori $dst, $src, 0 - if ((MI.getOpcode() == MBlaze::ADDI) || (MI.getOpcode() == MBlaze::ORI)) { - if ((MI.getOperand(1).isReg()) && (isZeroImm(MI.getOperand(2)))) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - } - } - - return false; -} - /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. 
If diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.h b/lib/Target/MBlaze/MBlazeInstrInfo.h index f0743705f010e..b3dba0ec768c0 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.h +++ b/lib/Target/MBlaze/MBlazeInstrInfo.h @@ -173,12 +173,6 @@ public: /// virtual const MBlazeRegisterInfo &getRegisterInfo() const { return RI; } - /// Return true if the instruction is a register to register move and return - /// the source and dest operands and their sub-register indices by reference. - virtual bool isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td index 3c406dda0591b..e5d153474a7e5 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.td +++ b/lib/Target/MBlaze/MBlazeInstrInfo.td @@ -1,4 +1,4 @@ -//===- MBlazeInstrInfo.td - MBlaze Instruction defs -------------*- C++ -*-===// +//===- MBlazeInstrInfo.td - MBlaze Instruction defs --------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/MBlaze/MBlazeIntrinsics.td b/lib/Target/MBlaze/MBlazeIntrinsics.td index 82552fa4b343a..a27cb5ba0dc4f 100644 --- a/lib/Target/MBlaze/MBlazeIntrinsics.td +++ b/lib/Target/MBlaze/MBlazeIntrinsics.td @@ -17,17 +17,11 @@ // MBlaze intrinsic classes. let TargetPrefix = "mblaze", isTarget = 1 in { - class MBFSL_Get_Intrinsic : Intrinsic<[llvm_i32_ty], - [llvm_i32_ty], - [IntrWriteMem]>; + class MBFSL_Get_Intrinsic : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>; - class MBFSL_Put_Intrinsic : Intrinsic<[], - [llvm_i32_ty, llvm_i32_ty], - [IntrWriteMem]>; + class MBFSL_Put_Intrinsic : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], []>; - class MBFSL_PutT_Intrinsic : Intrinsic<[], - [llvm_i32_ty], - [IntrWriteMem]>; + class MBFSL_PutT_Intrinsic : Intrinsic<[], [llvm_i32_ty], []>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp index 8cafa8c519c62..22b6a30470d17 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp @@ -242,9 +242,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // FrameIndex represents objects inside an abstract stack. // We must replace FrameIndex with a stack/frame pointer // direct reference.
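To make the comment above concrete: once frame layout is final, an abstract frame index plus any in-instruction offset collapses into a fixed displacement off the stack or frame pointer. A minimal sketch with invented types and names (the real eliminateFrameIndex rewrites the MachineOperand in place via ChangeToRegister/ChangeToImmediate, as the hunk below shows):

#include <cassert>

struct Frame {
  int ObjectOffset[4]; // byte offset of each frame object from the frame register
};

// Fold "frame-index FI + InstrOffset" into one frame-register displacement.
static int rewriteFrameIndex(const Frame &F, int FI, int InstrOffset) {
  return F.ObjectOffset[FI] + InstrOffset;
}

int main() {
  Frame F = {{0, 8, 16, 24}};
  assert(rewriteFrameIndex(F, 2, 4) == 20); // "FI#2 + 4" becomes "FP + 20"
  return 0;
}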
-unsigned MBlazeRegisterInfo:: +void MBlazeRegisterInfo:: eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, - FrameIndexValue *Value, RegScavenger *RS) const { + RegScavenger *RS) const { MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); @@ -277,7 +277,6 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, MI.getOperand(oi).ChangeToImmediate(Offset); MI.getOperand(i).ChangeToRegister(getFrameRegister(MF), false); - return 0; } void MBlazeRegisterInfo:: diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h index af97b0e2d79ec..1e1fde14ab7b8 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.h +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h @@ -63,9 +63,8 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo { MachineBasicBlock::iterator I) const; /// Stack Frame Processing Methods - unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.td b/lib/Target/MBlaze/MBlazeRegisterInfo.td index d0a1e7556c439..5e935103389e6 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.td +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.td @@ -1,4 +1,4 @@ -//===- MBlazeRegisterInfo.td - MBlaze Register defs -------------*- C++ -*-===// +//===- MBlazeRegisterInfo.td - MBlaze Register defs --------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/MBlaze/MBlazeSchedule.td b/lib/Target/MBlaze/MBlazeSchedule.td index 1fec9e694005c..4a65542a447c6 100644 --- a/lib/Target/MBlaze/MBlazeSchedule.td +++ b/lib/Target/MBlaze/MBlazeSchedule.td @@ -1,4 +1,4 @@ -//===- MBlazeSchedule.td - MBlaze Scheduling Definitions --------*- C++ -*-===// +//===- MBlazeSchedule.td - MBlaze Scheduling Definitions ---*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/MSIL/CMakeLists.txt b/lib/Target/MSIL/CMakeLists.txt deleted file mode 100644 index b1d47ef05ec5e..0000000000000 --- a/lib/Target/MSIL/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_target(MSIL - MSILWriter.cpp - ) diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp deleted file mode 100644 index cc350e8a4f892..0000000000000 --- a/lib/Target/MSIL/MSILWriter.cpp +++ /dev/null @@ -1,1706 +0,0 @@ -//===-- MSILWriter.cpp - Library for converting LLVM code to MSIL ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This library converts LLVM code to MSIL code. 
-// -//===----------------------------------------------------------------------===// - -#include "MSILWriter.h" -#include "llvm/CallingConv.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Intrinsics.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/TypeSymbolTable.h" -#include "llvm/Analysis/ConstantsScanner.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/InstVisitor.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetRegistry.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/CodeGen/Passes.h" -using namespace llvm; - -namespace llvm { - // TargetMachine for the MSIL - struct MSILTarget : public TargetMachine { - MSILTarget(const Target &T, const std::string &TT, const std::string &FS) - : TargetMachine(T) {} - - virtual bool addPassesToEmitFile(PassManagerBase &PM, - formatted_raw_ostream &Out, - CodeGenFileType FileType, - CodeGenOpt::Level OptLevel, - bool DisableVerify); - - virtual const TargetData *getTargetData() const { return 0; } - }; -} - -extern "C" void LLVMInitializeMSILTarget() { - // Register the target. - RegisterTargetMachine<MSILTarget> X(TheMSILTarget); -} - -bool MSILModule::runOnModule(Module &M) { - ModulePtr = &M; - TD = &getAnalysis<TargetData>(); - bool Changed = false; - // Find named types. - TypeSymbolTable& Table = M.getTypeSymbolTable(); - std::set<const Type *> Types = getAnalysis<FindUsedTypes>().getTypes(); - for (TypeSymbolTable::iterator I = Table.begin(), E = Table.end(); I!=E; ) { - if (!I->second->isStructTy() && !I->second->isOpaqueTy()) - Table.remove(I++); - else { - std::set<const Type *>::iterator T = Types.find(I->second); - if (T==Types.end()) - Table.remove(I++); - else { - Types.erase(T); - ++I; - } - } - } - // Find unnamed types. - unsigned RenameCounter = 0; - for (std::set<const Type *>::const_iterator I = Types.begin(), - E = Types.end(); I!=E; ++I) - if (const StructType *STy = dyn_cast<StructType>(*I)) { - while (ModulePtr->addTypeName("unnamed$"+utostr(RenameCounter), STy)) - ++RenameCounter; - Changed = true; - } - // Pointer for FunctionPass. - UsedTypes = &getAnalysis<FindUsedTypes>().getTypes(); - return Changed; -} - -char MSILModule::ID = 0; -char MSILWriter::ID = 0; - -bool MSILWriter::runOnFunction(Function &F) { - if (F.isDeclaration()) return false; - - // Do not codegen any 'available_externally' functions at all, they have - // definitions outside the translation unit. 
- if (F.hasAvailableExternallyLinkage()) - return false; - - LInfo = &getAnalysis<LoopInfo>(); - printFunction(F); - return false; -} - - -bool MSILWriter::doInitialization(Module &M) { - ModulePtr = &M; - Out << ".assembly extern mscorlib {}\n"; - Out << ".assembly MSIL {}\n\n"; - Out << "// External\n"; - printExternals(); - Out << "// Declarations\n"; - printDeclarations(M.getTypeSymbolTable()); - Out << "// Definitions\n"; - printGlobalVariables(); - Out << "// Startup code\n"; - printModuleStartup(); - return false; -} - - -bool MSILWriter::doFinalization(Module &M) { - return false; -} - - -void MSILWriter::printModuleStartup() { - Out << - ".method static public int32 $MSIL_Startup() {\n" - "\t.entrypoint\n" - "\t.locals (native int i)\n" - "\t.locals (native int argc)\n" - "\t.locals (native int ptr)\n" - "\t.locals (void* argv)\n" - "\t.locals (string[] args)\n" - "\tcall\tstring[] [mscorlib]System.Environment::GetCommandLineArgs()\n" - "\tdup\n" - "\tstloc\targs\n" - "\tldlen\n" - "\tconv.i4\n" - "\tdup\n" - "\tstloc\targc\n"; - printPtrLoad(TD->getPointerSize()); - Out << - "\tmul\n" - "\tlocalloc\n" - "\tstloc\targv\n" - "\tldc.i4.0\n" - "\tstloc\ti\n" - "L_01:\n" - "\tldloc\ti\n" - "\tldloc\targc\n" - "\tceq\n" - "\tbrtrue\tL_02\n" - "\tldloc\targs\n" - "\tldloc\ti\n" - "\tldelem.ref\n" - "\tcall\tnative int [mscorlib]System.Runtime.InteropServices.Marshal::" - "StringToHGlobalAnsi(string)\n" - "\tstloc\tptr\n" - "\tldloc\targv\n" - "\tldloc\ti\n"; - printPtrLoad(TD->getPointerSize()); - Out << - "\tmul\n" - "\tadd\n" - "\tldloc\tptr\n" - "\tstind.i\n" - "\tldloc\ti\n" - "\tldc.i4.1\n" - "\tadd\n" - "\tstloc\ti\n" - "\tbr\tL_01\n" - "L_02:\n" - "\tcall void $MSIL_Init()\n"; - - // Call user 'main' function. - const Function* F = ModulePtr->getFunction("main"); - if (!F || F->isDeclaration()) { - Out << "\tldc.i4.0\n\tret\n}\n"; - return; - } - bool BadSig = true; - std::string Args(""); - Function::const_arg_iterator Arg1,Arg2; - - switch (F->arg_size()) { - case 0: - BadSig = false; - break; - case 1: - Arg1 = F->arg_begin(); - if (Arg1->getType()->isIntegerTy()) { - Out << "\tldloc\targc\n"; - Args = getTypeName(Arg1->getType()); - BadSig = false; - } - break; - case 2: - Arg1 = Arg2 = F->arg_begin(); ++Arg2; - if (Arg1->getType()->isIntegerTy() && - Arg2->getType()->getTypeID() == Type::PointerTyID) { - Out << "\tldloc\targc\n\tldloc\targv\n"; - Args = getTypeName(Arg1->getType())+","+getTypeName(Arg2->getType()); - BadSig = false; - } - break; - default: - BadSig = true; - } - - bool RetVoid = (F->getReturnType()->getTypeID() == Type::VoidTyID); - if (BadSig || (!F->getReturnType()->isIntegerTy() && !RetVoid)) { - Out << "\tldc.i4.0\n"; - } else { - Out << "\tcall\t" << getTypeName(F->getReturnType()) << - getConvModopt(F->getCallingConv()) << "main(" << Args << ")\n"; - if (RetVoid) - Out << "\tldc.i4.0\n"; - else - Out << "\tconv.i4\n"; - } - Out << "\tret\n}\n"; -} - -bool MSILWriter::isZeroValue(const Value* V) { - if (const Constant *C = dyn_cast<Constant>(V)) - return C->isNullValue(); - return false; -} - - -std::string MSILWriter::getValueName(const Value* V) { - std::string Name; - if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) - Name = GV->getName(); - else { - unsigned &No = AnonValueNumbers[V]; - if (No == 0) No = ++NextAnonValueNumber; - Name = "tmp" + utostr(No); - } - - // Name into the quotes allow control and space characters. 
- return "'"+Name+"'"; -} - - -std::string MSILWriter::getLabelName(const std::string& Name) { - if (Name.find('.')!=std::string::npos) { - std::string Tmp(Name); - // Replace unaccepable characters in the label name. - for (std::string::iterator I = Tmp.begin(), E = Tmp.end(); I!=E; ++I) - if (*I=='.') *I = '@'; - return Tmp; - } - return Name; -} - - -std::string MSILWriter::getLabelName(const Value* V) { - std::string Name; - if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) - Name = GV->getName(); - else { - unsigned &No = AnonValueNumbers[V]; - if (No == 0) No = ++NextAnonValueNumber; - Name = "tmp" + utostr(No); - } - - return getLabelName(Name); -} - - -std::string MSILWriter::getConvModopt(CallingConv::ID CallingConvID) { - switch (CallingConvID) { - case CallingConv::C: - case CallingConv::Cold: - case CallingConv::Fast: - return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvCdecl) "; - case CallingConv::X86_FastCall: - return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvFastcall) "; - case CallingConv::X86_StdCall: - return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvStdcall) "; - case CallingConv::X86_ThisCall: - return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvThiscall) "; - default: - errs() << "CallingConvID = " << CallingConvID << '\n'; - llvm_unreachable("Unsupported calling convention"); - } - return ""; // Not reached -} - - -std::string MSILWriter::getArrayTypeName(Type::TypeID TyID, const Type* Ty) { - std::string Tmp = ""; - const Type* ElemTy = Ty; - assert(Ty->getTypeID()==TyID && "Invalid type passed"); - // Walk trought array element types. - for (;;) { - // Multidimensional array. - if (ElemTy->getTypeID()==TyID) { - if (const ArrayType* ATy = dyn_cast<ArrayType>(ElemTy)) - Tmp += utostr(ATy->getNumElements()); - else if (const VectorType* VTy = dyn_cast<VectorType>(ElemTy)) - Tmp += utostr(VTy->getNumElements()); - ElemTy = cast<SequentialType>(ElemTy)->getElementType(); - } - // Base element type found. 
- if (ElemTy->getTypeID()!=TyID) break; - Tmp += ","; - } - return getTypeName(ElemTy, false, true)+"["+Tmp+"]"; -} - - -std::string MSILWriter::getPrimitiveTypeName(const Type* Ty, bool isSigned) { - unsigned NumBits = 0; - switch (Ty->getTypeID()) { - case Type::VoidTyID: - return "void "; - case Type::IntegerTyID: - NumBits = getBitWidth(Ty); - if(NumBits==1) - return "bool "; - if (!isSigned) - return "unsigned int"+utostr(NumBits)+" "; - return "int"+utostr(NumBits)+" "; - case Type::FloatTyID: - return "float32 "; - case Type::DoubleTyID: - return "float64 "; - default: - errs() << "Type = " << *Ty << '\n'; - llvm_unreachable("Invalid primitive type"); - } - return ""; // Not reached -} - - -std::string MSILWriter::getTypeName(const Type* Ty, bool isSigned, - bool isNested) { - if (Ty->isPrimitiveType() || Ty->isIntegerTy()) - return getPrimitiveTypeName(Ty,isSigned); - // FIXME: "OpaqueType" support - switch (Ty->getTypeID()) { - case Type::PointerTyID: - return "void* "; - case Type::StructTyID: - if (isNested) - return ModulePtr->getTypeName(Ty); - return "valuetype '"+ModulePtr->getTypeName(Ty)+"' "; - case Type::ArrayTyID: - if (isNested) - return getArrayTypeName(Ty->getTypeID(),Ty); - return "valuetype '"+getArrayTypeName(Ty->getTypeID(),Ty)+"' "; - case Type::VectorTyID: - if (isNested) - return getArrayTypeName(Ty->getTypeID(),Ty); - return "valuetype '"+getArrayTypeName(Ty->getTypeID(),Ty)+"' "; - default: - errs() << "Type = " << *Ty << '\n'; - llvm_unreachable("Invalid type in getTypeName()"); - } - return ""; // Not reached -} - - -MSILWriter::ValueType MSILWriter::getValueLocation(const Value* V) { - // Function argument - if (isa<Argument>(V)) - return ArgumentVT; - // Function - else if (const Function* F = dyn_cast<Function>(V)) - return F->hasLocalLinkage() ? InternalVT : GlobalVT; - // Variable - else if (const GlobalVariable* G = dyn_cast<GlobalVariable>(V)) - return G->hasLocalLinkage() ? InternalVT : GlobalVT; - // Constant - else if (isa<Constant>(V)) - return isa<ConstantExpr>(V) ? ConstExprVT : ConstVT; - // Local variable - return LocalVT; -} - - -std::string MSILWriter::getTypePostfix(const Type* Ty, bool Expand, - bool isSigned) { - unsigned NumBits = 0; - switch (Ty->getTypeID()) { - // Integer constant, expanding for stack operations. - case Type::IntegerTyID: - NumBits = getBitWidth(Ty); - // Expand integer value to "int32" or "int64". - if (Expand) return (NumBits<=32 ? "i4" : "i8"); - if (NumBits==1) return "i1"; - return (isSigned ? "i" : "u")+utostr(NumBits/8); - // Float constant. - case Type::FloatTyID: - return "r4"; - case Type::DoubleTyID: - return "r8"; - case Type::PointerTyID: - return "i"+utostr(TD->getTypeAllocSize(Ty)); - default: - errs() << "TypeID = " << Ty->getTypeID() << '\n'; - llvm_unreachable("Invalid type in TypeToPostfix()"); - } - return ""; // Not reached -} - - -void MSILWriter::printConvToPtr() { - switch (ModulePtr->getPointerSize()) { - case Module::Pointer32: - printSimpleInstruction("conv.u4"); - break; - case Module::Pointer64: - printSimpleInstruction("conv.u8"); - break; - default: - llvm_unreachable("Module use not supporting pointer size"); - } -} - - -void MSILWriter::printPtrLoad(uint64_t N) { - switch (ModulePtr->getPointerSize()) { - case Module::Pointer32: - printSimpleInstruction("ldc.i4",utostr(N).c_str()); - // FIXME: Need overflow test? 
- if (!isUInt<32>(N)) { - errs() << "Value = " << utostr(N) << '\n'; - llvm_unreachable("32-bit pointer overflowed"); - } - break; - case Module::Pointer64: - printSimpleInstruction("ldc.i8",utostr(N).c_str()); - break; - default: - llvm_unreachable("Module use not supporting pointer size"); - } -} - - -void MSILWriter::printValuePtrLoad(const Value* V) { - printValueLoad(V); - printConvToPtr(); -} - - -void MSILWriter::printConstLoad(const Constant* C) { - if (const ConstantInt* CInt = dyn_cast<ConstantInt>(C)) { - // Integer constant - Out << "\tldc." << getTypePostfix(C->getType(),true) << '\t'; - if (CInt->isMinValue(true)) - Out << CInt->getSExtValue(); - else - Out << CInt->getZExtValue(); - } else if (const ConstantFP* FP = dyn_cast<ConstantFP>(C)) { - // Float constant - uint64_t X; - unsigned Size; - if (FP->getType()->getTypeID()==Type::FloatTyID) { - X = (uint32_t)FP->getValueAPF().bitcastToAPInt().getZExtValue(); - Size = 4; - } else { - X = FP->getValueAPF().bitcastToAPInt().getZExtValue(); - Size = 8; - } - Out << "\tldc.r" << Size << "\t( " << utohexstr(X) << ')'; - } else if (isa<UndefValue>(C)) { - // Undefined constant value = NULL. - printPtrLoad(0); - } else { - errs() << "Constant = " << *C << '\n'; - llvm_unreachable("Invalid constant value"); - } - Out << '\n'; -} - - -void MSILWriter::printValueLoad(const Value* V) { - MSILWriter::ValueType Location = getValueLocation(V); - switch (Location) { - // Global variable or function address. - case GlobalVT: - case InternalVT: - if (const Function* F = dyn_cast<Function>(V)) { - std::string Name = getConvModopt(F->getCallingConv())+getValueName(F); - printSimpleInstruction("ldftn", - getCallSignature(F->getFunctionType(),NULL,Name).c_str()); - } else { - std::string Tmp; - const Type* ElemTy = cast<PointerType>(V->getType())->getElementType(); - if (Location==GlobalVT && cast<GlobalVariable>(V)->hasDLLImportLinkage()) { - Tmp = "void* "+getValueName(V); - printSimpleInstruction("ldsfld",Tmp.c_str()); - } else { - Tmp = getTypeName(ElemTy)+getValueName(V); - printSimpleInstruction("ldsflda",Tmp.c_str()); - } - } - break; - // Function argument. - case ArgumentVT: - printSimpleInstruction("ldarg",getValueName(V).c_str()); - break; - // Local function variable. - case LocalVT: - printSimpleInstruction("ldloc",getValueName(V).c_str()); - break; - // Constant value. - case ConstVT: - if (isa<ConstantPointerNull>(V)) - printPtrLoad(0); - else - printConstLoad(cast<Constant>(V)); - break; - // Constant expression. 
- case ConstExprVT: - printConstantExpr(cast<ConstantExpr>(V)); - break; - default: - errs() << "Value = " << *V << '\n'; - llvm_unreachable("Invalid value location"); - } -} - - -void MSILWriter::printValueSave(const Value* V) { - switch (getValueLocation(V)) { - case ArgumentVT: - printSimpleInstruction("starg",getValueName(V).c_str()); - break; - case LocalVT: - printSimpleInstruction("stloc",getValueName(V).c_str()); - break; - default: - errs() << "Value = " << *V << '\n'; - llvm_unreachable("Invalid value location"); - } -} - - -void MSILWriter::printBinaryInstruction(const char* Name, const Value* Left, - const Value* Right) { - printValueLoad(Left); - printValueLoad(Right); - Out << '\t' << Name << '\n'; -} - - -void MSILWriter::printSimpleInstruction(const char* Inst, const char* Operand) { - if(Operand) - Out << '\t' << Inst << '\t' << Operand << '\n'; - else - Out << '\t' << Inst << '\n'; -} - - -void MSILWriter::printPHICopy(const BasicBlock* Src, const BasicBlock* Dst) { - for (BasicBlock::const_iterator I = Dst->begin(); isa<PHINode>(I); ++I) { - const PHINode* Phi = cast<PHINode>(I); - const Value* Val = Phi->getIncomingValueForBlock(Src); - if (isa<UndefValue>(Val)) continue; - printValueLoad(Val); - printValueSave(Phi); - } -} - - -void MSILWriter::printBranchToBlock(const BasicBlock* CurrBB, - const BasicBlock* TrueBB, - const BasicBlock* FalseBB) { - if (TrueBB==FalseBB) { - // "TrueBB" and "FalseBB" destination equals - printPHICopy(CurrBB,TrueBB); - printSimpleInstruction("pop"); - printSimpleInstruction("br",getLabelName(TrueBB).c_str()); - } else if (FalseBB==NULL) { - // If "FalseBB" not used the jump have condition - printPHICopy(CurrBB,TrueBB); - printSimpleInstruction("brtrue",getLabelName(TrueBB).c_str()); - } else if (TrueBB==NULL) { - // If "TrueBB" not used the jump is unconditional - printPHICopy(CurrBB,FalseBB); - printSimpleInstruction("br",getLabelName(FalseBB).c_str()); - } else { - // Copy PHI instructions for each block - std::string TmpLabel; - // Print PHI instructions for "TrueBB" - if (isa<PHINode>(TrueBB->begin())) { - TmpLabel = getLabelName(TrueBB)+"$phi_"+utostr(getUniqID()); - printSimpleInstruction("brtrue",TmpLabel.c_str()); - } else { - printSimpleInstruction("brtrue",getLabelName(TrueBB).c_str()); - } - // Print PHI instructions for "FalseBB" - if (isa<PHINode>(FalseBB->begin())) { - printPHICopy(CurrBB,FalseBB); - printSimpleInstruction("br",getLabelName(FalseBB).c_str()); - } else { - printSimpleInstruction("br",getLabelName(FalseBB).c_str()); - } - if (isa<PHINode>(TrueBB->begin())) { - // Handle "TrueBB" PHI Copy - Out << TmpLabel << ":\n"; - printPHICopy(CurrBB,TrueBB); - printSimpleInstruction("br",getLabelName(TrueBB).c_str()); - } - } -} - - -void MSILWriter::printBranchInstruction(const BranchInst* Inst) { - if (Inst->isUnconditional()) { - printBranchToBlock(Inst->getParent(),NULL,Inst->getSuccessor(0)); - } else { - printValueLoad(Inst->getCondition()); - printBranchToBlock(Inst->getParent(),Inst->getSuccessor(0), - Inst->getSuccessor(1)); - } -} - - -void MSILWriter::printSelectInstruction(const Value* Cond, const Value* VTrue, - const Value* VFalse) { - std::string TmpLabel = std::string("select$true_")+utostr(getUniqID()); - printValueLoad(VTrue); - printValueLoad(Cond); - printSimpleInstruction("brtrue",TmpLabel.c_str()); - printSimpleInstruction("pop"); - printValueLoad(VFalse); - Out << TmpLabel << ":\n"; -} - - -void MSILWriter::printIndirectLoad(const Value* V) { - const Type* Ty = V->getType(); - printValueLoad(V); - if 
(const PointerType* P = dyn_cast<PointerType>(Ty)) - Ty = P->getElementType(); - std::string Tmp = "ldind."+getTypePostfix(Ty, false); - printSimpleInstruction(Tmp.c_str()); -} - - -void MSILWriter::printIndirectSave(const Value* Ptr, const Value* Val) { - printValueLoad(Ptr); - printValueLoad(Val); - printIndirectSave(Val->getType()); -} - - -void MSILWriter::printIndirectSave(const Type* Ty) { - // Instruction need signed postfix for any type. - std::string postfix = getTypePostfix(Ty, false); - if (*postfix.begin()=='u') *postfix.begin() = 'i'; - postfix = "stind."+postfix; - printSimpleInstruction(postfix.c_str()); -} - - -void MSILWriter::printCastInstruction(unsigned int Op, const Value* V, - const Type* Ty, const Type* SrcTy) { - std::string Tmp(""); - printValueLoad(V); - switch (Op) { - // Signed - case Instruction::SExt: - // If sign extending int, convert first from unsigned to signed - // with the same bit size - because otherwise we will loose the sign. - if (SrcTy) { - Tmp = "conv."+getTypePostfix(SrcTy,false,true); - printSimpleInstruction(Tmp.c_str()); - } - // FALLTHROUGH - case Instruction::SIToFP: - case Instruction::FPToSI: - Tmp = "conv."+getTypePostfix(Ty,false,true); - printSimpleInstruction(Tmp.c_str()); - break; - // Unsigned - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::FPToUI: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - Tmp = "conv."+getTypePostfix(Ty,false); - printSimpleInstruction(Tmp.c_str()); - break; - // Do nothing - case Instruction::BitCast: - // FIXME: meaning that ld*/st* instruction do not change data format. - break; - default: - errs() << "Opcode = " << Op << '\n'; - llvm_unreachable("Invalid conversion instruction"); - } -} - - -void MSILWriter::printGepInstruction(const Value* V, gep_type_iterator I, - gep_type_iterator E) { - unsigned Size; - // Load address - printValuePtrLoad(V); - // Calculate element offset. - for (; I!=E; ++I){ - Size = 0; - const Value* IndexValue = I.getOperand(); - if (const StructType* StrucTy = dyn_cast<StructType>(*I)) { - uint64_t FieldIndex = cast<ConstantInt>(IndexValue)->getZExtValue(); - // Offset is the sum of all previous structure fields. - for (uint64_t F = 0; F<FieldIndex; ++F) - Size += TD->getTypeAllocSize(StrucTy->getContainedType((unsigned)F)); - printPtrLoad(Size); - printSimpleInstruction("add"); - continue; - } else if (const SequentialType* SeqTy = dyn_cast<SequentialType>(*I)) { - Size = TD->getTypeAllocSize(SeqTy->getElementType()); - } else { - Size = TD->getTypeAllocSize(*I); - } - // Add offset of current element to stack top. - if (!isZeroValue(IndexValue)) { - // Constant optimization. - if (const ConstantInt* C = dyn_cast<ConstantInt>(IndexValue)) { - if (C->getValue().isNegative()) { - printPtrLoad(C->getValue().abs().getZExtValue()*Size); - printSimpleInstruction("sub"); - continue; - } else - printPtrLoad(C->getZExtValue()*Size); - } else { - printPtrLoad(Size); - printValuePtrLoad(IndexValue); - printSimpleInstruction("mul"); - } - printSimpleInstruction("add"); - } - } -} - - -std::string MSILWriter::getCallSignature(const FunctionType* Ty, - const Instruction* Inst, - std::string Name) { - std::string Tmp(""); - if (Ty->isVarArg()) Tmp += "vararg "; - // Name and return type. - Tmp += getTypeName(Ty->getReturnType())+Name+"("; - // Function argument type list. 
- unsigned NumParams = Ty->getNumParams(); - for (unsigned I = 0; I!=NumParams; ++I) { - if (I!=0) Tmp += ","; - Tmp += getTypeName(Ty->getParamType(I)); - } - // CLR needs to know the exact amount of parameters received by vararg - // function, because caller cleans the stack. - if (Ty->isVarArg() && Inst) { - // Origin to function arguments in "CallInst" or "InvokeInst". - unsigned Org = isa<InvokeInst>(Inst) ? 3 : 1; - // Print variable argument types. - unsigned NumOperands = Inst->getNumOperands()-Org; - if (NumParams<NumOperands) { - if (NumParams!=0) Tmp += ", "; - Tmp += "... , "; - for (unsigned J = NumParams; J!=NumOperands; ++J) { - if (J!=NumParams) Tmp += ", "; - Tmp += getTypeName(Inst->getOperand(J+Org)->getType()); - } - } - } - return Tmp+")"; -} - - -void MSILWriter::printFunctionCall(const Value* FnVal, - const Instruction* Inst) { - // Get function calling convention. - std::string Name = ""; - if (const CallInst* Call = dyn_cast<CallInst>(Inst)) - Name = getConvModopt(Call->getCallingConv()); - else if (const InvokeInst* Invoke = dyn_cast<InvokeInst>(Inst)) - Name = getConvModopt(Invoke->getCallingConv()); - else { - errs() << "Instruction = " << Inst->getName() << '\n'; - llvm_unreachable("Need \"Invoke\" or \"Call\" instruction only"); - } - if (const Function* F = dyn_cast<Function>(FnVal)) { - // Direct call. - Name += getValueName(F); - printSimpleInstruction("call", - getCallSignature(F->getFunctionType(),Inst,Name).c_str()); - } else { - // Indirect function call. - const PointerType* PTy = cast<PointerType>(FnVal->getType()); - const FunctionType* FTy = cast<FunctionType>(PTy->getElementType()); - // Load function address. - printValueLoad(FnVal); - printSimpleInstruction("calli",getCallSignature(FTy,Inst,Name).c_str()); - } -} - - -void MSILWriter::printIntrinsicCall(const IntrinsicInst* Inst) { - std::string Name; - switch (Inst->getIntrinsicID()) { - case Intrinsic::vastart: - Name = getValueName(Inst->getArgOperand(0)); - Name.insert(Name.length()-1,"$valist"); - // Obtain the argument handle. - printSimpleInstruction("ldloca",Name.c_str()); - printSimpleInstruction("arglist"); - printSimpleInstruction("call", - "instance void [mscorlib]System.ArgIterator::.ctor" - "(valuetype [mscorlib]System.RuntimeArgumentHandle)"); - // Save as pointer type "void*" - printValueLoad(Inst->getArgOperand(0)); - printSimpleInstruction("ldloca",Name.c_str()); - printIndirectSave(PointerType::getUnqual( - IntegerType::get(Inst->getContext(), 8))); - break; - case Intrinsic::vaend: - // Close argument list handle. - printIndirectLoad(Inst->getArgOperand(0)); - printSimpleInstruction("call","instance void [mscorlib]System.ArgIterator::End()"); - break; - case Intrinsic::vacopy: - // Copy "ArgIterator" valuetype. - printIndirectLoad(Inst->getArgOperand(0)); - printIndirectLoad(Inst->getArgOperand(1)); - printSimpleInstruction("cpobj","[mscorlib]System.ArgIterator"); - break; - default: - errs() << "Intrinsic ID = " << Inst->getIntrinsicID() << '\n'; - llvm_unreachable("Invalid intrinsic function"); - } -} - - -void MSILWriter::printCallInstruction(const Instruction* Inst) { - if (isa<IntrinsicInst>(Inst)) { - // Handle intrinsic function. - printIntrinsicCall(cast<IntrinsicInst>(Inst)); - } else { - const CallInst *CI = cast<CallInst>(Inst); - // Load arguments to stack and call function. 
- for (int I = 0, E = CI->getNumArgOperands(); I!=E; ++I) - printValueLoad(CI->getArgOperand(I)); - printFunctionCall(CI->getCalledFunction(), Inst); - } -} - - -void MSILWriter::printICmpInstruction(unsigned Predicate, const Value* Left, - const Value* Right) { - switch (Predicate) { - case ICmpInst::ICMP_EQ: - printBinaryInstruction("ceq",Left,Right); - break; - case ICmpInst::ICMP_NE: - // Emulate = not neg (Op1 eq Op2) - printBinaryInstruction("ceq",Left,Right); - printSimpleInstruction("neg"); - printSimpleInstruction("not"); - break; - case ICmpInst::ICMP_ULE: - case ICmpInst::ICMP_SLE: - // Emulate = (Op1 eq Op2) or (Op1 lt Op2) - printBinaryInstruction("ceq",Left,Right); - if (Predicate==ICmpInst::ICMP_ULE) - printBinaryInstruction("clt.un",Left,Right); - else - printBinaryInstruction("clt",Left,Right); - printSimpleInstruction("or"); - break; - case ICmpInst::ICMP_UGE: - case ICmpInst::ICMP_SGE: - // Emulate = (Op1 eq Op2) or (Op1 gt Op2) - printBinaryInstruction("ceq",Left,Right); - if (Predicate==ICmpInst::ICMP_UGE) - printBinaryInstruction("cgt.un",Left,Right); - else - printBinaryInstruction("cgt",Left,Right); - printSimpleInstruction("or"); - break; - case ICmpInst::ICMP_ULT: - printBinaryInstruction("clt.un",Left,Right); - break; - case ICmpInst::ICMP_SLT: - printBinaryInstruction("clt",Left,Right); - break; - case ICmpInst::ICMP_UGT: - printBinaryInstruction("cgt.un",Left,Right); - break; - case ICmpInst::ICMP_SGT: - printBinaryInstruction("cgt",Left,Right); - break; - default: - errs() << "Predicate = " << Predicate << '\n'; - llvm_unreachable("Invalid icmp predicate"); - } -} - - -void MSILWriter::printFCmpInstruction(unsigned Predicate, const Value* Left, - const Value* Right) { - // FIXME: Correct comparison - std::string NanFunc = "bool [mscorlib]System.Double::IsNaN(float64)"; - switch (Predicate) { - case FCmpInst::FCMP_UGT: - // X > Y || llvm_fcmp_uno(X, Y) - printBinaryInstruction("cgt",Left,Right); - printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right); - printSimpleInstruction("or"); - break; - case FCmpInst::FCMP_OGT: - // X > Y - printBinaryInstruction("cgt",Left,Right); - break; - case FCmpInst::FCMP_UGE: - // X >= Y || llvm_fcmp_uno(X, Y) - printBinaryInstruction("ceq",Left,Right); - printBinaryInstruction("cgt",Left,Right); - printSimpleInstruction("or"); - printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right); - printSimpleInstruction("or"); - break; - case FCmpInst::FCMP_OGE: - // X >= Y - printBinaryInstruction("ceq",Left,Right); - printBinaryInstruction("cgt",Left,Right); - printSimpleInstruction("or"); - break; - case FCmpInst::FCMP_ULT: - // X < Y || llvm_fcmp_uno(X, Y) - printBinaryInstruction("clt",Left,Right); - printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right); - printSimpleInstruction("or"); - break; - case FCmpInst::FCMP_OLT: - // X < Y - printBinaryInstruction("clt",Left,Right); - break; - case FCmpInst::FCMP_ULE: - // X <= Y || llvm_fcmp_uno(X, Y) - printBinaryInstruction("ceq",Left,Right); - printBinaryInstruction("clt",Left,Right); - printSimpleInstruction("or"); - printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right); - printSimpleInstruction("or"); - break; - case FCmpInst::FCMP_OLE: - // X <= Y - printBinaryInstruction("ceq",Left,Right); - printBinaryInstruction("clt",Left,Right); - printSimpleInstruction("or"); - break; - case FCmpInst::FCMP_UEQ: - // X == Y || llvm_fcmp_uno(X, Y) - printBinaryInstruction("ceq",Left,Right); - printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right); - printSimpleInstruction("or"); - break; - case 
FCmpInst::FCMP_OEQ: - // X == Y - printBinaryInstruction("ceq",Left,Right); - break; - case FCmpInst::FCMP_UNE: - // X != Y - printBinaryInstruction("ceq",Left,Right); - printSimpleInstruction("neg"); - printSimpleInstruction("not"); - break; - case FCmpInst::FCMP_ONE: - // X != Y && llvm_fcmp_ord(X, Y) - printBinaryInstruction("ceq",Left,Right); - printSimpleInstruction("not"); - break; - case FCmpInst::FCMP_ORD: - // return X == X && Y == Y - printBinaryInstruction("ceq",Left,Left); - printBinaryInstruction("ceq",Right,Right); - printSimpleInstruction("or"); - break; - case FCmpInst::FCMP_UNO: - // X != X || Y != Y - printBinaryInstruction("ceq",Left,Left); - printSimpleInstruction("not"); - printBinaryInstruction("ceq",Right,Right); - printSimpleInstruction("not"); - printSimpleInstruction("or"); - break; - default: - llvm_unreachable("Illegal FCmp predicate"); - } -} - - -void MSILWriter::printInvokeInstruction(const InvokeInst* Inst) { - std::string Label = "leave$normal_"+utostr(getUniqID()); - Out << ".try {\n"; - // Load arguments - for (int I = 0, E = Inst->getNumArgOperands(); I!=E; ++I) - printValueLoad(Inst->getArgOperand(I)); - // Print call instruction - printFunctionCall(Inst->getOperand(0),Inst); - // Save function result and leave "try" block - printValueSave(Inst); - printSimpleInstruction("leave",Label.c_str()); - Out << "}\n"; - Out << "catch [mscorlib]System.Exception {\n"; - // Redirect to unwind block - printSimpleInstruction("pop"); - printBranchToBlock(Inst->getParent(),NULL,Inst->getUnwindDest()); - Out << "}\n" << Label << ":\n"; - // Redirect to continue block - printBranchToBlock(Inst->getParent(),NULL,Inst->getNormalDest()); -} - - -void MSILWriter::printSwitchInstruction(const SwitchInst* Inst) { - // FIXME: Emulate with IL "switch" instruction - // Emulate = if () else if () else if () else ... - for (unsigned int I = 1, E = Inst->getNumCases(); I!=E; ++I) { - printValueLoad(Inst->getCondition()); - printValueLoad(Inst->getCaseValue(I)); - printSimpleInstruction("ceq"); - // Condition jump to successor block - printBranchToBlock(Inst->getParent(),Inst->getSuccessor(I),NULL); - } - // Jump to default block - printBranchToBlock(Inst->getParent(),NULL,Inst->getDefaultDest()); -} - - -void MSILWriter::printVAArgInstruction(const VAArgInst* Inst) { - printIndirectLoad(Inst->getOperand(0)); - printSimpleInstruction("call", - "instance typedref [mscorlib]System.ArgIterator::GetNextArg()"); - printSimpleInstruction("refanyval","void*"); - std::string Name = - "ldind."+getTypePostfix(PointerType::getUnqual( - IntegerType::get(Inst->getContext(), 8)),false); - printSimpleInstruction(Name.c_str()); -} - - -void MSILWriter::printAllocaInstruction(const AllocaInst* Inst) { - uint64_t Size = TD->getTypeAllocSize(Inst->getAllocatedType()); - // Constant optimization. - if (const ConstantInt* CInt = dyn_cast<ConstantInt>(Inst->getOperand(0))) { - printPtrLoad(CInt->getZExtValue()*Size); - } else { - printPtrLoad(Size); - printValueLoad(Inst->getOperand(0)); - printSimpleInstruction("mul"); - } - printSimpleInstruction("localloc"); -} - - -void MSILWriter::printInstruction(const Instruction* Inst) { - const Value *Left = 0, *Right = 0; - if (Inst->getNumOperands()>=1) Left = Inst->getOperand(0); - if (Inst->getNumOperands()>=2) Right = Inst->getOperand(1); - // Print instruction - // FIXME: "ShuffleVector","ExtractElement","InsertElement" support. 
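The printFCmpInstruction cases above lower LLVM's ordered/unordered float predicates onto the CLI's plain ceq/cgt/clt by OR-ing in an explicit unordered test, relying on NaN comparing unequal to itself. In scalar C++ terms (a sketch of the intended semantics, not the emitted IL):

    // fcmp "uno": true iff at least one operand is NaN.
    static bool fcmpUno(double X, double Y) {
      return X != X || Y != Y;
    }

    // fcmp "ult": less-than OR unordered, matching the clt/or sequence.
    static bool fcmpUlt(double X, double Y) {
      return X < Y || fcmpUno(X, Y);
    }

    // fcmp "ole": ordered and <=, matching the ceq/clt/or sequence.
    static bool fcmpOle(double X, double Y) {
      return X == Y || X < Y;
    }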
- switch (Inst->getOpcode()) { - // Terminator - case Instruction::Ret: - if (Inst->getNumOperands()) { - printValueLoad(Left); - printSimpleInstruction("ret"); - } else - printSimpleInstruction("ret"); - break; - case Instruction::Br: - printBranchInstruction(cast<BranchInst>(Inst)); - break; - // Binary - case Instruction::Add: - case Instruction::FAdd: - printBinaryInstruction("add",Left,Right); - break; - case Instruction::Sub: - case Instruction::FSub: - printBinaryInstruction("sub",Left,Right); - break; - case Instruction::Mul: - case Instruction::FMul: - printBinaryInstruction("mul",Left,Right); - break; - case Instruction::UDiv: - printBinaryInstruction("div.un",Left,Right); - break; - case Instruction::SDiv: - case Instruction::FDiv: - printBinaryInstruction("div",Left,Right); - break; - case Instruction::URem: - printBinaryInstruction("rem.un",Left,Right); - break; - case Instruction::SRem: - case Instruction::FRem: - printBinaryInstruction("rem",Left,Right); - break; - // Binary Condition - case Instruction::ICmp: - printICmpInstruction(cast<ICmpInst>(Inst)->getPredicate(),Left,Right); - break; - case Instruction::FCmp: - printFCmpInstruction(cast<FCmpInst>(Inst)->getPredicate(),Left,Right); - break; - // Bitwise Binary - case Instruction::And: - printBinaryInstruction("and",Left,Right); - break; - case Instruction::Or: - printBinaryInstruction("or",Left,Right); - break; - case Instruction::Xor: - printBinaryInstruction("xor",Left,Right); - break; - case Instruction::Shl: - printValueLoad(Left); - printValueLoad(Right); - printSimpleInstruction("conv.i4"); - printSimpleInstruction("shl"); - break; - case Instruction::LShr: - printValueLoad(Left); - printValueLoad(Right); - printSimpleInstruction("conv.i4"); - printSimpleInstruction("shr.un"); - break; - case Instruction::AShr: - printValueLoad(Left); - printValueLoad(Right); - printSimpleInstruction("conv.i4"); - printSimpleInstruction("shr"); - break; - case Instruction::Select: - printSelectInstruction(Inst->getOperand(0),Inst->getOperand(1),Inst->getOperand(2)); - break; - case Instruction::Load: - printIndirectLoad(Inst->getOperand(0)); - break; - case Instruction::Store: - printIndirectSave(Inst->getOperand(1), Inst->getOperand(0)); - break; - case Instruction::SExt: - printCastInstruction(Inst->getOpcode(),Left, - cast<CastInst>(Inst)->getDestTy(), - cast<CastInst>(Inst)->getSrcTy()); - break; - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::BitCast: - printCastInstruction(Inst->getOpcode(),Left, - cast<CastInst>(Inst)->getDestTy()); - break; - case Instruction::GetElementPtr: - printGepInstruction(Inst->getOperand(0),gep_type_begin(Inst), - gep_type_end(Inst)); - break; - case Instruction::Call: - printCallInstruction(cast<CallInst>(Inst)); - break; - case Instruction::Invoke: - printInvokeInstruction(cast<InvokeInst>(Inst)); - break; - case Instruction::Unwind: - printSimpleInstruction("newobj", - "instance void [mscorlib]System.Exception::.ctor()"); - printSimpleInstruction("throw"); - break; - case Instruction::Switch: - printSwitchInstruction(cast<SwitchInst>(Inst)); - break; - case Instruction::Alloca: - printAllocaInstruction(cast<AllocaInst>(Inst)); - break; - case Instruction::Unreachable: - printSimpleInstruction("ldstr", "\"Unreachable instruction\""); - 
printSimpleInstruction("newobj", - "instance void [mscorlib]System.Exception::.ctor(string)"); - printSimpleInstruction("throw"); - break; - case Instruction::VAArg: - printVAArgInstruction(cast<VAArgInst>(Inst)); - break; - default: - errs() << "Instruction = " << Inst->getName() << '\n'; - llvm_unreachable("Unsupported instruction"); - } -} - - -void MSILWriter::printLoop(const Loop* L) { - Out << getLabelName(L->getHeader()->getName()) << ":\n"; - const std::vector<BasicBlock*>& blocks = L->getBlocks(); - for (unsigned I = 0, E = blocks.size(); I!=E; I++) { - BasicBlock* BB = blocks[I]; - Loop* BBLoop = LInfo->getLoopFor(BB); - if (BBLoop == L) - printBasicBlock(BB); - else if (BB==BBLoop->getHeader() && BBLoop->getParentLoop()==L) - printLoop(BBLoop); - } - printSimpleInstruction("br",getLabelName(L->getHeader()->getName()).c_str()); -} - - -void MSILWriter::printBasicBlock(const BasicBlock* BB) { - Out << getLabelName(BB) << ":\n"; - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) { - const Instruction* Inst = I; - // Comment llvm original instruction - // Out << "\n//" << *Inst << "\n"; - // Do not handle PHI instruction in current block - if (Inst->getOpcode()==Instruction::PHI) continue; - // Print instruction - printInstruction(Inst); - // Save result - if (Inst->getType()!=Type::getVoidTy(BB->getContext())) { - // Do not save value after invoke, it done in "try" block - if (Inst->getOpcode()==Instruction::Invoke) continue; - printValueSave(Inst); - } - } -} - - -void MSILWriter::printLocalVariables(const Function& F) { - std::string Name; - const Type* Ty = NULL; - std::set<const Value*> Printed; - const Value* VaList = NULL; - unsigned StackDepth = 8; - // Find local variables - for (const_inst_iterator I = inst_begin(&F), E = inst_end(&F); I!=E; ++I) { - if (I->getOpcode()==Instruction::Call || - I->getOpcode()==Instruction::Invoke) { - // Test stack depth. - if (StackDepth<I->getNumOperands()) - StackDepth = I->getNumOperands(); - } - const AllocaInst* AI = dyn_cast<AllocaInst>(&*I); - if (AI && !isa<GlobalVariable>(AI)) { - // Local variable allocation. - Ty = PointerType::getUnqual(AI->getAllocatedType()); - Name = getValueName(AI); - Out << "\t.locals (" << getTypeName(Ty) << Name << ")\n"; - } else if (I->getType()!=Type::getVoidTy(F.getContext())) { - // Operation result. - Ty = I->getType(); - Name = getValueName(&*I); - Out << "\t.locals (" << getTypeName(Ty) << Name << ")\n"; - } - // Test on 'va_list' variable - bool isVaList = false; - if (const VAArgInst* VaInst = dyn_cast<VAArgInst>(&*I)) { - // "va_list" as "va_arg" instruction operand. - isVaList = true; - VaList = VaInst->getOperand(0); - } else if (const IntrinsicInst* Inst = dyn_cast<IntrinsicInst>(&*I)) { - // "va_list" as intrinsic function operand. - switch (Inst->getIntrinsicID()) { - case Intrinsic::vastart: - case Intrinsic::vaend: - case Intrinsic::vacopy: - isVaList = true; - VaList = Inst->getArgOperand(0); - break; - default: - isVaList = false; - } - } - // Print "va_list" variable. 
- if (isVaList && Printed.insert(VaList).second) { - Name = getValueName(VaList); - Name.insert(Name.length()-1,"$valist"); - Out << "\t.locals (valuetype [mscorlib]System.ArgIterator " - << Name << ")\n"; - } - } - printSimpleInstruction(".maxstack",utostr(StackDepth*2).c_str()); -} - - -void MSILWriter::printFunctionBody(const Function& F) { - // Print body - for (Function::const_iterator I = F.begin(), E = F.end(); I!=E; ++I) { - if (Loop *L = LInfo->getLoopFor(I)) { - if (L->getHeader()==I && L->getParentLoop()==0) - printLoop(L); - } else { - printBasicBlock(I); - } - } -} - - -void MSILWriter::printConstantExpr(const ConstantExpr* CE) { - const Value *left = 0, *right = 0; - if (CE->getNumOperands()>=1) left = CE->getOperand(0); - if (CE->getNumOperands()>=2) right = CE->getOperand(1); - // Print instruction - switch (CE->getOpcode()) { - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::BitCast: - printCastInstruction(CE->getOpcode(),left,CE->getType()); - break; - case Instruction::GetElementPtr: - printGepInstruction(CE->getOperand(0),gep_type_begin(CE),gep_type_end(CE)); - break; - case Instruction::ICmp: - printICmpInstruction(CE->getPredicate(),left,right); - break; - case Instruction::FCmp: - printFCmpInstruction(CE->getPredicate(),left,right); - break; - case Instruction::Select: - printSelectInstruction(CE->getOperand(0),CE->getOperand(1),CE->getOperand(2)); - break; - case Instruction::Add: - case Instruction::FAdd: - printBinaryInstruction("add",left,right); - break; - case Instruction::Sub: - case Instruction::FSub: - printBinaryInstruction("sub",left,right); - break; - case Instruction::Mul: - case Instruction::FMul: - printBinaryInstruction("mul",left,right); - break; - case Instruction::UDiv: - printBinaryInstruction("div.un",left,right); - break; - case Instruction::SDiv: - case Instruction::FDiv: - printBinaryInstruction("div",left,right); - break; - case Instruction::URem: - printBinaryInstruction("rem.un",left,right); - break; - case Instruction::SRem: - case Instruction::FRem: - printBinaryInstruction("rem",left,right); - break; - case Instruction::And: - printBinaryInstruction("and",left,right); - break; - case Instruction::Or: - printBinaryInstruction("or",left,right); - break; - case Instruction::Xor: - printBinaryInstruction("xor",left,right); - break; - case Instruction::Shl: - printBinaryInstruction("shl",left,right); - break; - case Instruction::LShr: - printBinaryInstruction("shr.un",left,right); - break; - case Instruction::AShr: - printBinaryInstruction("shr",left,right); - break; - default: - errs() << "Expression = " << *CE << "\n"; - llvm_unreachable("Invalid constant expression"); - } -} - - -void MSILWriter::printStaticInitializerList() { - // List of global variables with uninitialized fields. - for (std::map<const GlobalVariable*,std::vector<StaticInitializer> >::iterator - VarI = StaticInitList.begin(), VarE = StaticInitList.end(); VarI!=VarE; - ++VarI) { - const std::vector<StaticInitializer>& InitList = VarI->second; - if (InitList.empty()) continue; - // For each uninitialized field. 
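Worth noting from printLocalVariables above: the .maxstack value is only an estimate. It takes the largest operand count seen at any call or invoke, with a floor of 8, and doubles it as headroom for intermediate values. A sketch of that heuristic in isolation (names are illustrative):

    #include <algorithm>
    #include <vector>

    static unsigned estimateMaxStack(
        const std::vector<unsigned> &CallOperandCounts) {
      unsigned Depth = 8;  // default floor used by the writer
      for (size_t I = 0; I != CallOperandCounts.size(); ++I)
        Depth = std::max(Depth, CallOperandCounts[I]);
      return Depth * 2;    // safety margin for intermediate values
    }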
- for (std::vector<StaticInitializer>::const_iterator I = InitList.begin(), - E = InitList.end(); I!=E; ++I) { - if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(I->constant)) { - // Out << "\n// Init " << getValueName(VarI->first) << ", offset " << - // utostr(I->offset) << ", type "<< *I->constant->getType() << "\n\n"; - // Load variable address - printValueLoad(VarI->first); - // Add offset - if (I->offset!=0) { - printPtrLoad(I->offset); - printSimpleInstruction("add"); - } - // Load value - printConstantExpr(CE); - // Save result at offset - std::string postfix = getTypePostfix(CE->getType(),true); - if (*postfix.begin()=='u') *postfix.begin() = 'i'; - postfix = "stind."+postfix; - printSimpleInstruction(postfix.c_str()); - } else { - errs() << "Constant = " << *I->constant << '\n'; - llvm_unreachable("Invalid static initializer"); - } - } - } -} - - -void MSILWriter::printFunction(const Function& F) { - bool isSigned = F.paramHasAttr(0, Attribute::SExt); - Out << "\n.method static "; - Out << (F.hasLocalLinkage() ? "private " : "public "); - if (F.isVarArg()) Out << "vararg "; - Out << getTypeName(F.getReturnType(),isSigned) << - getConvModopt(F.getCallingConv()) << getValueName(&F) << '\n'; - // Arguments - Out << "\t("; - unsigned ArgIdx = 1; - for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I!=E; - ++I, ++ArgIdx) { - isSigned = F.paramHasAttr(ArgIdx, Attribute::SExt); - if (I!=F.arg_begin()) Out << ", "; - Out << getTypeName(I->getType(),isSigned) << getValueName(I); - } - Out << ") cil managed\n"; - // Body - Out << "{\n"; - printLocalVariables(F); - printFunctionBody(F); - Out << "}\n"; -} - - -void MSILWriter::printDeclarations(const TypeSymbolTable& ST) { - std::string Name; - std::set<const Type*> Printed; - for (std::set<const Type*>::const_iterator - UI = UsedTypes->begin(), UE = UsedTypes->end(); UI!=UE; ++UI) { - const Type* Ty = *UI; - if (Ty->isArrayTy() || Ty->isVectorTy() || Ty->isStructTy()) - Name = getTypeName(Ty, false, true); - // Type with no need to declare. - else continue; - // Print not duplicated type - if (Printed.insert(Ty).second) { - Out << ".class value explicit ansi sealed '" << Name << "'"; - Out << " { .pack " << 1 << " .size " << TD->getTypeAllocSize(Ty); - Out << " }\n\n"; - } - } -} - - -unsigned int MSILWriter::getBitWidth(const Type* Ty) { - unsigned int N = Ty->getPrimitiveSizeInBits(); - assert(N!=0 && "Invalid type in getBitWidth()"); - switch (N) { - case 1: - case 8: - case 16: - case 32: - case 64: - return N; - default: - errs() << "Bits = " << N << '\n'; - llvm_unreachable("Unsupported integer width"); - } - return 0; // Not reached -} - - -void MSILWriter::printStaticConstant(const Constant* C, uint64_t& Offset) { - uint64_t TySize = 0; - const Type* Ty = C->getType(); - // Print zero initialized constant. 
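The printStaticConstant logic below emits float and double initializers as raw bit patterns (int32/int64 obtained via bitcastToAPInt) rather than as decimal text, which avoids any lossy round-trip through a textual float. A portable C++ equivalent of that bit-preserving step, for reference only:

    #include <cstdint>
    #include <cstring>

    static uint32_t floatBits(float F) {
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof Bits);  // reinterpret, don't convert
      return Bits;
    }

    static uint64_t doubleBits(double D) {
      uint64_t Bits;
      std::memcpy(&Bits, &D, sizeof Bits);
      return Bits;
    }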
- if (isa<ConstantAggregateZero>(C) || C->isNullValue()) { - TySize = TD->getTypeAllocSize(C->getType()); - Offset += TySize; - Out << "int8 (0) [" << TySize << "]"; - return; - } - // Print constant initializer - switch (Ty->getTypeID()) { - case Type::IntegerTyID: { - TySize = TD->getTypeAllocSize(Ty); - const ConstantInt* Int = cast<ConstantInt>(C); - Out << getPrimitiveTypeName(Ty,true) << "(" << Int->getSExtValue() << ")"; - break; - } - case Type::FloatTyID: - case Type::DoubleTyID: { - TySize = TD->getTypeAllocSize(Ty); - const ConstantFP* FP = cast<ConstantFP>(C); - if (Ty->getTypeID() == Type::FloatTyID) - Out << "int32 (" << - (uint32_t)FP->getValueAPF().bitcastToAPInt().getZExtValue() << ')'; - else - Out << "int64 (" << - FP->getValueAPF().bitcastToAPInt().getZExtValue() << ')'; - break; - } - case Type::ArrayTyID: - case Type::VectorTyID: - case Type::StructTyID: - for (unsigned I = 0, E = C->getNumOperands(); I<E; I++) { - if (I!=0) Out << ",\n"; - printStaticConstant(cast<Constant>(C->getOperand(I)), Offset); - } - break; - case Type::PointerTyID: - TySize = TD->getTypeAllocSize(C->getType()); - // Initialize with global variable address - if (const GlobalVariable *G = dyn_cast<GlobalVariable>(C)) { - std::string name = getValueName(G); - Out << "&(" << name.insert(name.length()-1,"$data") << ")"; - } else { - // Dynamic initialization - if (!isa<ConstantPointerNull>(C) && !C->isNullValue()) - InitListPtr->push_back(StaticInitializer(C,Offset)); - // Null pointer initialization - if (TySize==4) Out << "int32 (0)"; - else if (TySize==8) Out << "int64 (0)"; - else llvm_unreachable("Invalid pointer size"); - } - break; - default: - errs() << "TypeID = " << Ty->getTypeID() << '\n'; - llvm_unreachable("Invalid type in printStaticConstant()"); - } - // Increase offset. - Offset += TySize; -} - - -void MSILWriter::printStaticInitializer(const Constant* C, - const std::string& Name) { - switch (C->getType()->getTypeID()) { - case Type::IntegerTyID: - case Type::FloatTyID: - case Type::DoubleTyID: - Out << getPrimitiveTypeName(C->getType(), false); - break; - case Type::ArrayTyID: - case Type::VectorTyID: - case Type::StructTyID: - case Type::PointerTyID: - Out << getTypeName(C->getType()); - break; - default: - errs() << "Type = " << *C << "\n"; - llvm_unreachable("Invalid constant type"); - } - // Print initializer - std::string label = Name; - label.insert(label.length()-1,"$data"); - Out << Name << " at " << label << '\n'; - Out << ".data " << label << " = {\n"; - uint64_t offset = 0; - printStaticConstant(C,offset); - Out << "\n}\n\n"; -} - - -void MSILWriter::printVariableDefinition(const GlobalVariable* G) { - const Constant* C = G->getInitializer(); - if (C->isNullValue() || isa<ConstantAggregateZero>(C) || isa<UndefValue>(C)) - InitListPtr = 0; - else - InitListPtr = &StaticInitList[G]; - printStaticInitializer(C,getValueName(G)); -} - - -void MSILWriter::printGlobalVariables() { - if (ModulePtr->global_empty()) return; - Module::global_iterator I,E; - for (I = ModulePtr->global_begin(), E = ModulePtr->global_end(); I!=E; ++I) { - // Variable definition - Out << ".field static " << (I->isDeclaration() ? 
"public " : - "private "); - if (I->isDeclaration()) { - Out << getTypeName(I->getType()) << getValueName(&*I) << "\n\n"; - } else - printVariableDefinition(&*I); - } -} - - -const char* MSILWriter::getLibraryName(const Function* F) { - return getLibraryForSymbol(F->getName(), true, F->getCallingConv()); -} - - -const char* MSILWriter::getLibraryName(const GlobalVariable* GV) { - return getLibraryForSymbol(GV->getName(), false, CallingConv::C); -} - - -const char* MSILWriter::getLibraryForSymbol(StringRef Name, bool isFunction, - CallingConv::ID CallingConv) { - // TODO: Read *.def file with function and libraries definitions. - return "MSVCRT.DLL"; -} - - -void MSILWriter::printExternals() { - Module::const_iterator I,E; - // Functions. - for (I=ModulePtr->begin(),E=ModulePtr->end(); I!=E; ++I) { - // Skip intrisics - if (I->isIntrinsic()) continue; - if (I->isDeclaration()) { - const Function* F = I; - std::string Name = getConvModopt(F->getCallingConv())+getValueName(F); - std::string Sig = - getCallSignature(cast<FunctionType>(F->getFunctionType()), NULL, Name); - Out << ".method static hidebysig pinvokeimpl(\"" - << getLibraryName(F) << "\")\n\t" << Sig << " preservesig {}\n\n"; - } - } - // External variables and static initialization. - Out << - ".method public hidebysig static pinvokeimpl(\"KERNEL32.DLL\" ansi winapi)" - " native int LoadLibrary(string) preservesig {}\n" - ".method public hidebysig static pinvokeimpl(\"KERNEL32.DLL\" ansi winapi)" - " native int GetProcAddress(native int, string) preservesig {}\n"; - Out << - ".method private static void* $MSIL_Import(string lib,string sym)\n" - " managed cil\n{\n" - "\tldarg\tlib\n" - "\tcall\tnative int LoadLibrary(string)\n" - "\tldarg\tsym\n" - "\tcall\tnative int GetProcAddress(native int,string)\n" - "\tdup\n" - "\tbrtrue\tL_01\n" - "\tldstr\t\"Can no import variable\"\n" - "\tnewobj\tinstance void [mscorlib]System.Exception::.ctor(string)\n" - "\tthrow\n" - "L_01:\n" - "\tret\n" - "}\n\n" - ".method static private void $MSIL_Init() managed cil\n{\n"; - printStaticInitializerList(); - // Foreach global variable. - for (Module::global_iterator I = ModulePtr->global_begin(), - E = ModulePtr->global_end(); I!=E; ++I) { - if (!I->isDeclaration() || !I->hasDLLImportLinkage()) continue; - // Use "LoadLibrary"/"GetProcAddress" to recive variable address. 
- std::string Tmp = getTypeName(I->getType())+getValueName(&*I); - printSimpleInstruction("ldsflda",Tmp.c_str()); - Out << "\tldstr\t\"" << getLibraryName(&*I) << "\"\n"; - Out << "\tldstr\t\"" << I->getName() << "\"\n"; - printSimpleInstruction("call","void* $MSIL_Import(string,string)"); - printIndirectSave(I->getType()); - } - printSimpleInstruction("ret"); - Out << "}\n\n"; -} - - -//===----------------------------------------------------------------------===// -// External Interface declaration -//===----------------------------------------------------------------------===// - -bool MSILTarget::addPassesToEmitFile(PassManagerBase &PM, - formatted_raw_ostream &o, - CodeGenFileType FileType, - CodeGenOpt::Level OptLevel, - bool DisableVerify) -{ - if (FileType != TargetMachine::CGFT_AssemblyFile) return true; - MSILWriter* Writer = new MSILWriter(o); - PM.add(createGCLoweringPass()); - // FIXME: Handle switch through native IL instruction "switch" - PM.add(createLowerSwitchPass()); - PM.add(createCFGSimplificationPass()); - PM.add(new MSILModule(Writer->UsedTypes,Writer->TD)); - PM.add(Writer); - PM.add(createGCInfoDeleter()); - return false; -} diff --git a/lib/Target/MSIL/MSILWriter.h b/lib/Target/MSIL/MSILWriter.h deleted file mode 100644 index 92a3abe5c0a74..0000000000000 --- a/lib/Target/MSIL/MSILWriter.h +++ /dev/null @@ -1,258 +0,0 @@ -//===-- MSILWriter.h - TargetMachine for the MSIL ---------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the MSILWriter that is used by the MSIL. -// -//===----------------------------------------------------------------------===// -#ifndef MSILWRITER_H -#define MSILWRITER_H - -#include "llvm/CallingConv.h" -#include "llvm/Constants.h" -#include "llvm/Module.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Pass.h" -#include "llvm/PassManager.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/FindUsedTypes.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetMachine.h" - -namespace llvm { - extern Target TheMSILTarget; - - class MSILModule : public ModulePass { - Module *ModulePtr; - const std::set<const Type *>*& UsedTypes; - const TargetData*& TD; - - public: - static char ID; - MSILModule(const std::set<const Type *>*& _UsedTypes, - const TargetData*& _TD) - : ModulePass(&ID), UsedTypes(_UsedTypes), TD(_TD) {} - - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<FindUsedTypes>(); - AU.addRequired<TargetData>(); - } - - virtual const char *getPassName() const { - return "MSIL backend definitions"; - } - - virtual bool runOnModule(Module &M); - - }; - - class MSILWriter : public FunctionPass { - struct StaticInitializer { - const Constant* constant; - uint64_t offset; - - StaticInitializer() - : constant(0), offset(0) {} - - StaticInitializer(const Constant* _constant, uint64_t _offset) - : constant(_constant), offset(_offset) {} - }; - - uint64_t UniqID; - - uint64_t getUniqID() { - return ++UniqID; - } - - public: - formatted_raw_ostream &Out; - Module* ModulePtr; - const TargetData* TD; - LoopInfo *LInfo; - std::vector<StaticInitializer>* InitListPtr; - std::map<const 
GlobalVariable*,std::vector<StaticInitializer> > - StaticInitList; - const std::set<const Type *>* UsedTypes; - static char ID; - DenseMap<const Value*, unsigned> AnonValueNumbers; - unsigned NextAnonValueNumber; - - MSILWriter(formatted_raw_ostream &o) : FunctionPass(&ID), Out(o), - NextAnonValueNumber(0) { - UniqID = 0; - } - - enum ValueType { - UndefVT, - GlobalVT, - InternalVT, - ArgumentVT, - LocalVT, - ConstVT, - ConstExprVT - }; - - bool isVariable(ValueType V) { - return V==GlobalVT || V==InternalVT || V==ArgumentVT || V==LocalVT; - } - - bool isConstValue(ValueType V) { - return V==ConstVT || V==ConstExprVT; - } - - virtual const char *getPassName() const { return "MSIL backend"; } - - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfo>(); - AU.setPreservesAll(); - } - - bool runOnFunction(Function &F); - - virtual bool doInitialization(Module &M); - - virtual bool doFinalization(Module &M); - - void printModuleStartup(); - - bool isZeroValue(const Value* V); - - std::string getValueName(const Value* V); - - std::string getLabelName(const Value* V); - - std::string getLabelName(const std::string& Name); - - std::string getConvModopt(CallingConv::ID CallingConvID); - - std::string getArrayTypeName(Type::TypeID TyID, const Type* Ty); - - std::string getPrimitiveTypeName(const Type* Ty, bool isSigned); - - std::string getFunctionTypeName(const Type* Ty); - - std::string getPointerTypeName(const Type* Ty); - - std::string getTypeName(const Type* Ty, bool isSigned = false, - bool isNested = false); - - ValueType getValueLocation(const Value* V); - - std::string getTypePostfix(const Type* Ty, bool Expand, - bool isSigned = false); - - void printConvToPtr(); - - void printPtrLoad(uint64_t N); - - void printValuePtrLoad(const Value* V); - - void printConstLoad(const Constant* C); - - void printValueLoad(const Value* V); - - void printValueSave(const Value* V); - - void printBinaryInstruction(const char* Name, const Value* Left, - const Value* Right); - - void printSimpleInstruction(const char* Inst, const char* Operand = NULL); - - void printPHICopy(const BasicBlock* Src, const BasicBlock* Dst); - - void printBranchToBlock(const BasicBlock* CurrBB, - const BasicBlock* TrueBB, - const BasicBlock* FalseBB); - - void printBranchInstruction(const BranchInst* Inst); - - void printSelectInstruction(const Value* Cond, const Value* VTrue, - const Value* VFalse); - - void printIndirectLoad(const Value* V); - - void printIndirectSave(const Value* Ptr, const Value* Val); - - void printIndirectSave(const Type* Ty); - - void printCastInstruction(unsigned int Op, const Value* V, - const Type* Ty, const Type* SrcTy=0); - - void printGepInstruction(const Value* V, gep_type_iterator I, - gep_type_iterator E); - - std::string getCallSignature(const FunctionType* Ty, - const Instruction* Inst, - std::string Name); - - void printFunctionCall(const Value* FnVal, const Instruction* Inst); - - void printIntrinsicCall(const IntrinsicInst* Inst); - - void printCallInstruction(const Instruction* Inst); - - void printICmpInstruction(unsigned Predicate, const Value* Left, - const Value* Right); - - void printFCmpInstruction(unsigned Predicate, const Value* Left, - const Value* Right); - - void printInvokeInstruction(const InvokeInst* Inst); - - void printSwitchInstruction(const SwitchInst* Inst); - - void printVAArgInstruction(const VAArgInst* Inst); - - void printAllocaInstruction(const AllocaInst* Inst); - - void printInstruction(const Instruction* Inst); - - void printLoop(const Loop* L); 
- - void printBasicBlock(const BasicBlock* BB); - - void printLocalVariables(const Function& F); - - void printFunctionBody(const Function& F); - - void printConstantExpr(const ConstantExpr* CE); - - void printStaticInitializerList(); - - void printFunction(const Function& F); - - void printDeclarations(const TypeSymbolTable& ST); - - unsigned int getBitWidth(const Type* Ty); - - void printStaticConstant(const Constant* C, uint64_t& Offset); - - void printStaticInitializer(const Constant* C, const std::string& Name); - - void printVariableDefinition(const GlobalVariable* G); - - void printGlobalVariables(); - - const char* getLibraryName(const Function* F); - - const char* getLibraryName(const GlobalVariable* GV); - - const char* getLibraryForSymbol(StringRef Name, bool isFunction, - CallingConv::ID CallingConv); - - void printExternals(); - }; - -} - -#endif - diff --git a/lib/Target/MSIL/README.TXT b/lib/Target/MSIL/README.TXT deleted file mode 100644 index d797c71fd39f0..0000000000000 --- a/lib/Target/MSIL/README.TXT +++ /dev/null @@ -1,26 +0,0 @@ -//===---------------------------------------------------------------------===// - -Vector instructions support. - -ShuffleVector -ExtractElement -InsertElement - -//===---------------------------------------------------------------------===// - -Add "OpaqueType" type. - -//===---------------------------------------------------------------------===// - -"switch" instruction emulation with CLI "switch" instruction. - -//===---------------------------------------------------------------------===// - -Write linker for external function, because function export need to know -dynamic library where function located. - -.method static hidebysig pinvokeimpl("msvcrt.dll" cdecl) - void free(void*) preservesig {} - - - diff --git a/lib/Target/MSIL/TargetInfo/CMakeLists.txt b/lib/Target/MSIL/TargetInfo/CMakeLists.txt deleted file mode 100644 index 9f0c3a09341a9..0000000000000 --- a/lib/Target/MSIL/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMMSILInfo - MSILTargetInfo.cpp - ) - diff --git a/lib/Target/MSIL/TargetInfo/MSILTargetInfo.cpp b/lib/Target/MSIL/TargetInfo/MSILTargetInfo.cpp deleted file mode 100644 index dfd42814e51cc..0000000000000 --- a/lib/Target/MSIL/TargetInfo/MSILTargetInfo.cpp +++ /dev/null @@ -1,26 +0,0 @@ -//===-- MSILTargetInfo.cpp - MSIL Target Implementation -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "MSILWriter.h" -#include "llvm/Module.h" -#include "llvm/Target/TargetRegistry.h" -using namespace llvm; - -Target llvm::TheMSILTarget; - -static unsigned MSIL_TripleMatchQuality(const std::string &TT) { - // This class always works, but shouldn't be the default in most cases. 
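The quality value in MSIL_TripleMatchQuality below is deliberate: in the target registry, every backend scores the triple and the highest non-zero score wins, so returning 1 keeps the MSIL backend selectable by name while any real architecture backend outranks it. A simplified sketch of that selection (this is not the actual TargetRegistry implementation; the types are illustrative):

    #include <string>
    #include <vector>

    struct TargetEntry {
      const char *Name;
      unsigned (*MatchQuality)(const std::string &Triple);
    };

    static const TargetEntry *pickTarget(
        const std::vector<TargetEntry> &Targets, const std::string &Triple) {
      const TargetEntry *Best = 0;
      unsigned BestQuality = 0;
      for (size_t I = 0; I != Targets.size(); ++I) {
        unsigned Q = Targets[I].MatchQuality(Triple);
        if (Q > BestQuality) { Best = &Targets[I]; BestQuality = Q; }
      }
      return Best;  // null when no target claims the triple
    }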
- return 1; -} - -extern "C" void LLVMInitializeMSILTargetInfo() { - TargetRegistry::RegisterTarget(TheMSILTarget, "msil", - "MSIL backend", - &MSIL_TripleMatchQuality); -} diff --git a/lib/Target/MSIL/TargetInfo/Makefile b/lib/Target/MSIL/TargetInfo/Makefile deleted file mode 100644 index 30b0950db0f75..0000000000000 --- a/lib/Target/MSIL/TargetInfo/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/MSIL/TargetInfo/Makefile -----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMMSILInfo - -# Hack: we need to include 'main' target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp index 68cb342b08f45..bd644435c76fa 100644 --- a/lib/Target/MSP430/MSP430BranchSelector.cpp +++ b/lib/Target/MSP430/MSP430BranchSelector.cpp @@ -10,7 +10,7 @@ // This file contains a pass that scans a machine function to determine which // conditional branches need more than 10 bits of displacement to reach their // target basic block. It does this in two passes; a calculation of basic block -// positions pass, and a branch psuedo op to machine branch opcode pass. This +// positions pass, and a branch pseudo op to machine branch opcode pass. This // pass should be run last, just before the assembly printer. // //===----------------------------------------------------------------------===// @@ -30,7 +30,7 @@ STATISTIC(NumExpanded, "Number of branches expanded to long format"); namespace { struct MSP430BSel : public MachineFunctionPass { static char ID; - MSP430BSel() : MachineFunctionPass(&ID) {} + MSP430BSel() : MachineFunctionPass(ID) {} /// BlockSizes - The sizes of the basic blocks in the function. std::vector<unsigned> BlockSizes; @@ -52,7 +52,8 @@ FunctionPass *llvm::createMSP430BranchSelectionPass() { } bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) { - const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo(); + const MSP430InstrInfo *TII = + static_cast<const MSP430InstrInfo*>(Fn.getTarget().getInstrInfo()); // Give the blocks of the function a dense, in-order, numbering. Fn.RenumberBlocks(); BlockSizes.resize(Fn.getNumBlockIDs()); diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index df28d07f5d717..bfab844f5b1a7 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -100,27 +100,6 @@ void MSP430InstrInfo::copyPhysReg(MachineBasicBlock &MBB, } bool -MSP430InstrInfo::isMoveInstr(const MachineInstr& MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const { - SrcSubIdx = DstSubIdx = 0; // No sub-registers yet. 
- - switch (MI.getOpcode()) { - default: - return false; - case MSP430::MOV8rr: - case MSP430::MOV16rr: - assert(MI.getNumOperands() >= 2 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - "invalid register-register move instruction"); - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - return true; - } -} - -bool MSP430InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -361,7 +340,7 @@ unsigned MSP430InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { switch (Desc.getOpcode()) { default: assert(0 && "Unknown instruction size!"); - case TargetOpcode::DBG_LABEL: + case TargetOpcode::PROLOG_LABEL: case TargetOpcode::EH_LABEL: case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h index ebbda1aeef513..49ccc032bf29f 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.h +++ b/lib/Target/MSP430/MSP430InstrInfo.h @@ -54,10 +54,6 @@ public: unsigned DestReg, unsigned SrcReg, bool KillSrc) const; - bool isMoveInstr(const MachineInstr& MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - virtual void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 608ca49fcf78e..3c3fa73477a59 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -101,7 +101,7 @@ bool MSP430RegisterInfo::hasFP(const MachineFunction &MF) const { MFI->isFrameAddressTaken()); } -bool MSP430RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const { +bool MSP430RegisterInfo::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo()->hasVarSizedObjects(); } @@ -163,10 +163,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MBB.erase(I); } -unsigned +void MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value, - RegScavenger *RS) const { + int SPAdj, RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); unsigned i = 0; @@ -204,7 +203,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(i).ChangeToRegister(BasePtr, false); if (Offset == 0) - return 0; + return; // We need to materialize the offset via add instruction. 
unsigned DstReg = MI.getOperand(0).getReg(); @@ -215,12 +214,11 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, BuildMI(MBB, llvm::next(II), dl, TII.get(MSP430::ADD16ri), DstReg) .addReg(DstReg).addImm(Offset); - return 0; + return; } MI.getOperand(i).ChangeToRegister(BasePtr, false); MI.getOperand(i+1).ChangeToImmediate(Offset); - return 0; } void diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h index 6e58d3116d273..4d2795bb40201 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/lib/Target/MSP430/MSP430RegisterInfo.h @@ -40,15 +40,14 @@ public: const TargetRegisterClass* getPointerRegClass(unsigned Kind = 0) const; bool hasFP(const MachineFunction &MF) const; - bool hasReservedCallFrame(MachineFunction &MF) const; + bool hasReservedCallFrame(const MachineFunction &MF) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; - unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; void emitPrologue(MachineFunction &MF) const; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp index 2037a9114559e..49efe75d79d8f 100644 --- a/lib/Target/Mangler.cpp +++ b/lib/Target/Mangler.cpp @@ -180,7 +180,8 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName, ManglerPrefixTy PrefixTy = Mangler::Default; if (GV->hasPrivateLinkage() || isImplicitlyPrivate) PrefixTy = Mangler::Private; - else if (GV->hasLinkerPrivateLinkage() || GV->hasLinkerPrivateWeakLinkage()) + else if (GV->hasLinkerPrivateLinkage() || GV->hasLinkerPrivateWeakLinkage() || + GV->hasLinkerPrivateWeakDefAutoLinkage()) PrefixTy = Mangler::LinkerPrivate; // If this global has a name, handle it simply. diff --git a/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp b/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp index 8ae05b75e919d..6660f6b624309 100644 --- a/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp @@ -18,6 +18,8 @@ #include "MipsInstrInfo.h" #include "MipsTargetMachine.h" #include "MipsMachineFunction.h" +#include "llvm/BasicBlock.h" +#include "llvm/Instructions.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -75,6 +77,7 @@ namespace { } virtual void EmitFunctionBodyStart(); virtual void EmitFunctionBodyEnd(); + virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const; static const char *getRegisterName(unsigned RegNo); virtual void EmitFunctionEntryLabel(); @@ -227,6 +230,23 @@ void MipsAsmPrinter::EmitFunctionBodyEnd() { } +/// isBlockOnlyReachableByFallthough - Return true if the basic block has +/// exactly one predecessor and the control transfer mechanism between +/// the predecessor and this block is a fall-through. +bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) + const { + // The predecessor has to be immediately before this block. + const MachineBasicBlock *Pred = *MBB->pred_begin(); + + // If the predecessor is a switch statement, assume a jump table + // implementation, so it is not a fall through. 
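The override being added here refines the printer's fall-through analysis: a block whose lone predecessor was lowered from a 'switch' is assumed to be reached through a jump table, so its label must still be emitted even when it sits immediately after that predecessor. A simplified standalone form of the check (the real method defers to AsmPrinter::isBlockOnlyReachableByFallthrough instead of returning true directly):

    #include "llvm/BasicBlock.h"
    #include "llvm/Instructions.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    using namespace llvm;

    static bool reachableOnlyByFallthrough(const MachineBasicBlock *MBB) {
      if (MBB->pred_size() != 1)
        return false;                        // multiple ways in
      const MachineBasicBlock *Pred = *MBB->pred_begin();
      if (const BasicBlock *BB = Pred->getBasicBlock())
        if (isa<SwitchInst>(BB->getTerminator()))
          return false;                      // jump-table dispatch, not textual
      return true;
    }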
+ if (const BasicBlock *bb = Pred->getBasicBlock()) + if (isa<SwitchInst>(bb->getTerminator())) + return false; + + return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB); +} + // Print out an operand for an inline asm expression. bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,const char *ExtraCode, diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index aa036aef83d0b..a51c3779c7f4f 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -1,4 +1,4 @@ -//===- Mips.td - Describe the Mips Target Machine ---------------*- C++ -*-===// +//===- Mips.td - Describe the Mips Target Machine ----------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td index c2bfb8fa738c6..8f313efaf8daa 100644 --- a/lib/Target/Mips/MipsCallingConv.td +++ b/lib/Target/Mips/MipsCallingConv.td @@ -1,4 +1,4 @@ -//===- MipsCallingConv.td - Calling Conventions for Mips --------*- C++ -*-===// +//===- MipsCallingConv.td - Calling Conventions for Mips ---*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index a2b615d8add2a..597ea0d6c2072 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -32,7 +32,7 @@ namespace { static char ID; Filler(TargetMachine &tm) - : MachineFunctionPass(&ID), TM(tm), TII(tm.getInstrInfo()) { } + : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { } virtual const char *getPassName() const { return "Mips Delay Slot Filler"; diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 3888bbf09ec7a..a47cf7b4f201e 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -137,7 +137,7 @@ SelectAddr(SDNode *Op, SDValue Addr, SDValue &Offset, SDValue &Base) // Operand is a result from an ADD. if (Addr.getOpcode() == ISD::ADD) { if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { - if (Predicate_immSExt16(CN)) { + if (isInt<16>(CN->getSExtValue())) { // If the first operand is a FI, get the TargetFI Node if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode> @@ -184,8 +184,9 @@ SDNode *MipsDAGToDAGISel::SelectLoadFp64(SDNode *N) { if (!Subtarget.isMips1() || NVT != MVT::f64) return NULL; - if (!Predicate_unindexedload(N) || - !Predicate_load(N)) + LoadSDNode *LN = cast<LoadSDNode>(N); + if (LN->getExtensionType() != ISD::NON_EXTLOAD || + LN->getAddressingMode() != ISD::UNINDEXED) return NULL; SDValue Chain = N->getOperand(0); @@ -248,8 +249,8 @@ SDNode *MipsDAGToDAGISel::SelectStoreFp64(SDNode *N) { SDValue Chain = N->getOperand(0); - if (!Predicate_unindexedstore(N) || - !Predicate_store(N)) + StoreSDNode *SN = cast<StoreSDNode>(N); + if (SN->isTruncatingStore() || SN->getAddressingMode() != ISD::UNINDEXED) return NULL; SDValue N1 = N->getOperand(1); diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index b6ff2c371d5c9..b0b99bad16071 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -317,13 +317,13 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BB->addSuccessor(sinkMBB); // sinkMBB: - // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ] // ... 
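The operand swap in the PHI above is a correctness fix, not a cleanup: a machine PHI is a list of (value, predecessor) pairs, and each incoming value must be tagged with the block it actually flows in from. A scalar analogy of the select diamond being built (illustrative only):

    // One value reaches the join from thisMBB (the conditional branch) and
    // the other from copy0MBB (the fall-through); the PHI result depends on
    // which predecessor ran.
    static int joinOfDiamond(bool TookBranch, int FromThisMBB,
                             int FromCopy0MBB) {
      return TookBranch ? FromThisMBB : FromCopy0MBB;
    }

Crossing the pairs, as the old code did, selected the wrong incoming value at runtime. The IsPIC inversion in LowerJumpTable just below is a similar logic fix: the Hi part is materialized on the non-PIC path, while PIC takes the global-pointer load.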
BB = sinkMBB; BuildMI(*BB, BB->begin(), dl, TII->get(Mips::PHI), MI->getOperand(0).getReg()) - .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB) - .addReg(MI->getOperand(3).getReg()).addMBB(thisMBB); + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB) + .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB); MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; @@ -542,7 +542,7 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG) const SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); - if (IsPIC) { + if (!IsPIC) { SDValue Ops[] = { JTI }; HiPart = DAG.getNode(MipsISD::Hi, dl, DAG.getVTList(MVT::i32), Ops, 1); } else // Emit Load from Global Pointer diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index e948917eb80eb..cff79966dcd3b 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -1,4 +1,4 @@ -//===- MipsInstrFPU.td - Mips FPU Instruction Information -------*- C++ -*-===// +//===- MipsInstrFPU.td - Mips FPU Instruction Information --*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index 0853272f7280e..98ae2fa7da456 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -1,4 +1,4 @@ -//===- MipsRegisterInfo.td - Mips Register defs -----------------*- C++ -*-===// +//===- MipsRegisterInfo.td - Mips Register defs ------------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index 6c09a3e10785b..aaf307b1ce3ff 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -30,53 +30,6 @@ static bool isZeroImm(const MachineOperand &op) { return op.isImm() && op.getImm() == 0; } -/// Return true if the instruction is a register to register move and -/// leave the source and dest operands in the passed parameters. -bool MipsInstrInfo:: -isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const -{ - SrcSubIdx = DstSubIdx = 0; // No sub-registers. - - // addu $dst, $src, $zero || addu $dst, $zero, $src - // or $dst, $src, $zero || or $dst, $zero, $src - if ((MI.getOpcode() == Mips::ADDu) || (MI.getOpcode() == Mips::OR)) { - if (MI.getOperand(1).getReg() == Mips::ZERO) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(2).getReg(); - return true; - } else if (MI.getOperand(2).getReg() == Mips::ZERO) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - } - } - - // mov $fpDst, $fpSrc - // mfc $gpDst, $fpSrc - // mtc $fpDst, $gpSrc - if (MI.getOpcode() == Mips::FMOV_S32 || - MI.getOpcode() == Mips::FMOV_D32 || - MI.getOpcode() == Mips::MFC1 || - MI.getOpcode() == Mips::MTC1 || - MI.getOpcode() == Mips::MOVCCRToCCR) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - } - - // addiu $dst, $src, 0 - if (MI.getOpcode() == Mips::ADDiu) { - if ((MI.getOperand(1).isReg()) && (isZeroImm(MI.getOperand(2)))) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - } - } - - return false; -} - /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. 
If diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index d6f87f9b0ce82..52a3d39840ba6 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -174,12 +174,6 @@ public: /// virtual const MipsRegisterInfo &getRegisterInfo() const { return RI; } - /// Return true if the instruction is a register to register move and return - /// the source and dest operands and their sub-register indices by reference. - virtual bool isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 5337c9fb816a2..320c5b8834831 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -1,4 +1,4 @@ -//===- MipsInstrInfo.td - Mips Register defs --------------------*- C++ -*-===// +//===- MipsInstrInfo.td - Mips Register defs ---------------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -96,12 +96,7 @@ def HI16 : SDNodeXForm<imm, [{ // Node immediate fits as 16-bit sign extended on target immediate. // e.g. addi, andi -def immSExt16 : PatLeaf<(imm), [{ - if (N->getValueType(0) == MVT::i32) - return (int32_t)N->getZExtValue() == (short)N->getZExtValue(); - else - return (int64_t)N->getZExtValue() == (short)N->getZExtValue(); -}]>; +def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>; // Node immediate fits as 16-bit zero extended on target immediate. // The LO16 param means that only the lower 16 bits of the node diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index e15f0a58e501b..69436d2acb546 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -327,10 +327,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // FrameIndex represent objects inside a abstract stack. // We must replace FrameIndex with an stack/frame pointer // direct reference. 
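The hunk below is one instance of an interface change applied target by target throughout this commit: TargetRegisterInfo::eliminateFrameIndex loses its unused FrameIndexValue out-parameter and unsigned result, becoming a plain void method. The new override shape, as a skeleton (DemoRegisterInfo is a hypothetical backend, not part of the patch):

    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineInstr.h"
    namespace llvm { class RegScavenger; }
    using namespace llvm;

    struct DemoRegisterInfo {
      void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
                               RegScavenger *RS = NULL) const {
        MachineInstr &MI = *II;
        // A real target rewrites MI's frame-index operand here into a
        // frame/stack register plus immediate offset; nothing is returned.
        (void)MI; (void)SPAdj; (void)RS;
      }
    };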
-unsigned MipsRegisterInfo:: +void MipsRegisterInfo:: eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, - FrameIndexValue *Value, RegScavenger *RS) const -{ + RegScavenger *RS) const { MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); @@ -361,7 +360,6 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, MI.getOperand(i-1).ChangeToImmediate(Offset); MI.getOperand(i).ChangeToRegister(getFrameRegister(MF), false); - return 0; } void MipsRegisterInfo:: diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h index b500a650f7cc9..89282f8fa1465 100644 --- a/lib/Target/Mips/MipsRegisterInfo.h +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -51,9 +51,8 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo { MachineBasicBlock::iterator I) const; /// Stack Frame Processing Methods - unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index be78a2266268c..60efe31fbaf82 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -1,4 +1,4 @@ -//===- MipsRegisterInfo.td - Mips Register defs -----------------*- C++ -*-===// +//===- MipsRegisterInfo.td - Mips Register defs ------------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td index 616a79bf831cb..055ff32372184 100644 --- a/lib/Target/Mips/MipsSchedule.td +++ b/lib/Target/Mips/MipsSchedule.td @@ -1,4 +1,4 @@ -//===- MipsSchedule.td - Mips Scheduling Definitions ------------*- C++ -*-===// +//===- MipsSchedule.td - Mips Scheduling Definitions -------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/PIC16/CMakeLists.txt b/lib/Target/PIC16/CMakeLists.txt index cd4afe8e2342a..2b6cb9e4e461d 100644 --- a/lib/Target/PIC16/CMakeLists.txt +++ b/lib/Target/PIC16/CMakeLists.txt @@ -10,7 +10,7 @@ tablegen(PIC16GenDAGISel.inc -gen-dag-isel) tablegen(PIC16GenCallingConv.inc -gen-callingconv) tablegen(PIC16GenSubtarget.inc -gen-subtarget) -add_llvm_target(PIC16 +add_llvm_target(PIC16CodeGen PIC16DebugInfo.cpp PIC16InstrInfo.cpp PIC16ISelDAGToDAG.cpp diff --git a/lib/Target/PIC16/PIC16.h b/lib/Target/PIC16/PIC16.h index cee55f4f260fe..08bb3e6f055b6 100644 --- a/lib/Target/PIC16/PIC16.h +++ b/lib/Target/PIC16/PIC16.h @@ -58,13 +58,10 @@ namespace PIC16CC { ESNames() {} public: ~ESNames() { - std::vector<char*>::iterator it = stk.end(); - it--; - while(stk.end() != stk.begin()) + while (!stk.empty()) { - char* p = *it; + char* p = stk.back(); delete [] p; - it--; stk.pop_back(); } } diff --git a/lib/Target/PIC16/PIC16ISelLowering.cpp b/lib/Target/PIC16/PIC16ISelLowering.cpp index 54a6a28992bf9..527b31d0cc9f3 100644 --- a/lib/Target/PIC16/PIC16ISelLowering.cpp +++ b/lib/Target/PIC16/PIC16ISelLowering.cpp @@ -312,6 +312,16 @@ PIC16TargetLowering::PIC16TargetLowering(PIC16TargetMachine &TM) computeRegisterProperties(); } +std::pair<const TargetRegisterClass*, uint8_t> +PIC16TargetLowering::findRepresentativeClass(EVT VT) const { + switch (VT.getSimpleVT().SimpleTy) { + default: + return TargetLowering::findRepresentativeClass(VT); + case MVT::i16: + return 
std::make_pair(PIC16::FSR16RegisterClass, 1); + } +} + // getOutFlag - Extract the flag result if the Op has it. static SDValue getOutFlag(SDValue &Op) { // Flag is the last value of the node. diff --git a/lib/Target/PIC16/PIC16ISelLowering.h b/lib/Target/PIC16/PIC16ISelLowering.h index 0a7506cb497f5..d942af46a9e92 100644 --- a/lib/Target/PIC16/PIC16ISelLowering.h +++ b/lib/Target/PIC16/PIC16ISelLowering.h @@ -50,7 +50,7 @@ namespace llvm { CALL, // PIC16 Call instruction CALLW, // PIC16 CALLW instruction SUBCC, // Compare for equality or inequality. - SELECT_ICC, // Psuedo to be caught in schedular and expanded to brcond. + SELECT_ICC, // Pseudo to be caught in scheduler and expanded to brcond. BRCOND, // Conditional branch. RET, // Return. Dummy @@ -181,6 +181,9 @@ namespace llvm { // FIXME: The function never seems to be aligned. return 1; } + protected: + std::pair<const TargetRegisterClass*, uint8_t> + findRepresentativeClass(EVT VT) const; private: // If the Node is a BUILD_PAIR representing a direct Address, // then this function will return true. diff --git a/lib/Target/PIC16/PIC16InstrInfo.cpp b/lib/Target/PIC16/PIC16InstrInfo.cpp index e784f746f7f9b..81257f3c41083 100644 --- a/lib/Target/PIC16/PIC16InstrInfo.cpp +++ b/lib/Target/PIC16/PIC16InstrInfo.cpp @@ -167,21 +167,6 @@ void PIC16InstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(KillSrc)); } -bool PIC16InstrInfo::isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DestReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const { - SrcSubIdx = DstSubIdx = 0; // No sub-registers. - - if (MI.getOpcode() == PIC16::copy_fsr - || MI.getOpcode() == PIC16::copy_w) { - DestReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - } - - return false; -} - /// InsertBranch - Insert a branch into the end of the specified /// MachineBasicBlock. This operands to this method are the same as those /// returned by AnalyzeBranch. This is invoked in cases where AnalyzeBranch diff --git a/lib/Target/PIC16/PIC16InstrInfo.h b/lib/Target/PIC16/PIC16InstrInfo.h index a3a77f11ba160..661b335d3b6c5 100644 --- a/lib/Target/PIC16/PIC16InstrInfo.h +++ b/lib/Target/PIC16/PIC16InstrInfo.h @@ -61,10 +61,6 @@ public: MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const; - virtual bool isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, diff --git a/lib/Target/PIC16/PIC16MemSelOpt.cpp b/lib/Target/PIC16/PIC16MemSelOpt.cpp index 241170b11c2ad..b6aa38f765ea7 100644 --- a/lib/Target/PIC16/PIC16MemSelOpt.cpp +++ b/lib/Target/PIC16/PIC16MemSelOpt.cpp @@ -38,7 +38,7 @@ using namespace llvm; namespace { struct MemSelOpt : public MachineFunctionPass { static char ID; - MemSelOpt() : MachineFunctionPass(&ID) {} + MemSelOpt() : MachineFunctionPass(ID) {} virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreservedID(MachineLoopInfoID); diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Cloner.cpp b/lib/Target/PIC16/PIC16Passes/PIC16Cloner.cpp index 27f1cf572ae6c..56f0211570928 100644 --- a/lib/Target/PIC16/PIC16Passes/PIC16Cloner.cpp +++ b/lib/Target/PIC16/PIC16Passes/PIC16Cloner.cpp @@ -256,7 +256,7 @@ PIC16Cloner::cloneFunction(Function *OrgF) { CloneAutos(OrgF); // Now create the clone. 
- ClonedF = CloneFunction(OrgF, VMap); + ClonedF = CloneFunction(OrgF, VMap, /*ModuleLevelChanges=*/false); // The new function should be for interrupt line. Therefore should have // the name suffixed with IL and section attribute marked with IL. diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Cloner.h b/lib/Target/PIC16/PIC16Passes/PIC16Cloner.h index e8b5aa45cdca9..e7d67ce09629a 100644 --- a/lib/Target/PIC16/PIC16Passes/PIC16Cloner.h +++ b/lib/Target/PIC16/PIC16Passes/PIC16Cloner.h @@ -35,7 +35,7 @@ namespace llvm { class PIC16Cloner : public ModulePass { public: static char ID; // Class identification - PIC16Cloner() : ModulePass(&ID) {} + PIC16Cloner() : ModulePass(ID) {} virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<CallGraph>(); diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp index 5ecb6aa551576..0f8928a4b5f50 100644 --- a/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp +++ b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp @@ -171,8 +171,9 @@ void PIC16Overlay::MarkIndirectlyCalledFunctions(Module &M) { for (Module::iterator MI = M.begin(), E = M.end(); MI != E; ++MI) { for (Value::use_iterator I = MI->use_begin(), E = MI->use_end(); I != E; ++I) { - if ((!isa<CallInst>(I) && !isa<InvokeInst>(I)) - || !CallSite(cast<Instruction>(I)).isCallee(I)) { + User *U = *I; + if ((!isa<CallInst>(U) && !isa<InvokeInst>(U)) + || !CallSite(cast<Instruction>(U)).isCallee(I)) { setColor(MI, ++IndirectCallColor); break; } diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h index 5a2551fabcda9..2f611e65de1fd 100644 --- a/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h +++ b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h @@ -39,7 +39,7 @@ namespace llvm { unsigned IndirectCallColor; public: static char ID; // Class identification - PIC16Overlay() : ModulePass(&ID) { + PIC16Overlay() : ModulePass(ID) { OverlayStr = "Overlay="; InterruptDepth = PIC16OVERLAY::StartInterruptColor; IndirectCallColor = PIC16OVERLAY::StartIndirectCallColor; diff --git a/lib/Target/PIC16/PIC16RegisterInfo.cpp b/lib/Target/PIC16/PIC16RegisterInfo.cpp index dff98d12c2ae0..76de47fdf0f4a 100644 --- a/lib/Target/PIC16/PIC16RegisterInfo.cpp +++ b/lib/Target/PIC16/PIC16RegisterInfo.cpp @@ -44,13 +44,10 @@ bool PIC16RegisterInfo::hasFP(const MachineFunction &MF) const { return false; } -unsigned PIC16RegisterInfo:: +void PIC16RegisterInfo:: eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, - FrameIndexValue *Value, RegScavenger *RS) const -{ - /* NOT YET IMPLEMENTED */ - return 0; -} + RegScavenger *RS) const +{ /* NOT YET IMPLEMENTED */ } void PIC16RegisterInfo::emitPrologue(MachineFunction &MF) const { /* NOT YET IMPLEMENTED */ } diff --git a/lib/Target/PIC16/PIC16RegisterInfo.h b/lib/Target/PIC16/PIC16RegisterInfo.h index 5536a617d2beb..20052b0034428 100644 --- a/lib/Target/PIC16/PIC16RegisterInfo.h +++ b/lib/Target/PIC16/PIC16RegisterInfo.h @@ -44,9 +44,8 @@ class PIC16RegisterInfo : public PIC16GenRegisterInfo { virtual BitVector getReservedRegs(const MachineFunction &MF) const; virtual bool hasFP(const MachineFunction &MF) const; - virtual unsigned eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS=NULL) const; + virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, RegScavenger *RS=NULL) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git 
a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp index e35dc579f2cd5..c1a5663be9315 100644 --- a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp @@ -43,6 +43,7 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegistry.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -327,6 +328,19 @@ namespace { void printPredicateOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O, const char *Modifier); + + MachineLocation getDebugValueLocation(const MachineInstr *MI) const { + + MachineLocation Location; + assert (MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); + // Frame address. Currently handles register +- offset only. + if (MI->getOperand(0).isReg() && MI->getOperand(2).isImm()) + Location.set(MI->getOperand(0).getReg(), MI->getOperand(2).getImm()); + else { + DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + } + return Location; + } }; /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index 52948c868b9c9..e161d23600e27 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -10,7 +10,7 @@ // This file contains a pass that scans a machine function to determine which // conditional branches need more than 16 bits of displacement to reach their // target basic block. It does this in two passes; a calculation of basic block -// positions pass, and a branch psuedo op to machine branch opcode pass. This +// positions pass, and a branch pseudo op to machine branch opcode pass. This // pass should be run last, just before the assembly printer. // //===----------------------------------------------------------------------===// @@ -31,7 +31,7 @@ STATISTIC(NumExpanded, "Number of branches expanded to long format"); namespace { struct PPCBSel : public MachineFunctionPass { static char ID; - PPCBSel() : MachineFunctionPass(&ID) {} + PPCBSel() : MachineFunctionPass(ID) {} /// BlockSizes - The sizes of the basic blocks in the function. std::vector<unsigned> BlockSizes; @@ -53,7 +53,8 @@ FunctionPass *llvm::createPPCBranchSelectionPass() { } bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { - const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo(); + const PPCInstrInfo *TII = + static_cast<const PPCInstrInfo*>(Fn.getTarget().getInstrInfo()); // Give the blocks of the function a dense, in-order, numbering. 
Fn.RenumberBlocks(); BlockSizes.resize(Fn.getNumBlockIDs()); diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td index 155fba22d9d7b..441db94581aea 100644 --- a/lib/Target/PowerPC/PPCCallingConv.td +++ b/lib/Target/PowerPC/PPCCallingConv.td @@ -1,4 +1,4 @@ -//===- PPCCallingConv.td - Calling Conventions for PowerPC ------*- C++ -*-===// +//===- PPCCallingConv.td - Calling Conventions for PowerPC -*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp index 361fa70fb4c45..df9ab52389ba8 100644 --- a/lib/Target/PowerPC/PPCCodeEmitter.cpp +++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp @@ -45,7 +45,7 @@ namespace { public: PPCCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) - : MachineFunctionPass(&ID), TM(tm), MCE(mce) {} + : MachineFunctionPass(ID), TM(tm), MCE(mce) {} /// getBinaryCodeForInstr - This function, generated by the /// CodeEmitterGenerator using TableGen, produces the binary encoding for @@ -110,7 +110,7 @@ void PPCCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) { default: MCE.emitWordBE(getBinaryCodeForInstr(MI)); break; - case TargetOpcode::DBG_LABEL: + case TargetOpcode::PROLOG_LABEL: case TargetOpcode::EH_LABEL: MCE.emitLabel(MI.getOperand(0).getMCSymbol()); break; diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index d47d989b34c00..14d1b154a5c9c 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2467,18 +2467,31 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, unsigned CallOpc = isSVR4ABI ? PPCISD::CALL_SVR4 : PPCISD::CALL_Darwin; - // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every - // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol - // node so that legalize doesn't hack it. - if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, - Callee.getValueType()); - else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) - Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType()); - else if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) + bool needIndirectCall = true; + if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { // If this is an absolute destination address, use the munged value. Callee = SDValue(Dest, 0); - else { + needIndirectCall = false; + } + // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201 + // Use indirect calls for ALL functions calls in JIT mode, since the + // far-call stubs may be outside relocation limits for a BL instruction. + if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) { + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every + // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol + // node so that legalize doesn't hack it. + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, + Callee.getValueType()); + needIndirectCall = false; + } + } + if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), + Callee.getValueType()); + needIndirectCall = false; + } + if (needIndirectCall) { // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair // to do the call, we can't use PPCISD::CALL. 
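The PrepareCall rework above boils down to a three-way decision on the callee node. A condensed standalone sketch of that logic (plain C++; CalleeKind stands in for the isBLACompatibleAddress/GlobalAddressSDNode/ExternalSymbolSDNode checks and is illustrative only):

    // Mirrors the control flow of the hunk above, not the actual DAG code.
    enum CalleeKind { BLACompatible, GlobalAddr, ExternalSym, Other };

    static bool useIndirectCall(CalleeKind K, bool JITCodeModel) {
      switch (K) {
      case BLACompatible:
        return false;            // absolute destination: use the munged value
      case GlobalAddr:
        // PR5201 workaround: in JIT mode, far-call stubs may be out of BL
        // range, so global-address callees also go through MTCTR/BCTRL.
        return JITCodeModel;
      case ExternalSym:
        return false;            // still lowered to a TargetExternalSymbol
      default:
        return true;             // MTCTR/BCTRL indirect call
      }
    }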
SDValue MTCTROps[] = {Chain, Callee, InFlag}; @@ -3942,17 +3955,17 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, } // t = vsplti c, result = vsldoi t, t, 1 - if (SextVal == ((i << 8) | (i >> (TypeShiftAmt-8)))) { + if (SextVal == ((i << 8) | (i < 0 ? 0xFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 2 - if (SextVal == ((i << 16) | (i >> (TypeShiftAmt-16)))) { + if (SextVal == ((i << 16) | (i < 0 ? 0xFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl); } // t = vsplti c, result = vsldoi t, t, 3 - if (SextVal == ((i << 24) | (i >> (TypeShiftAmt-24)))) { + if (SextVal == ((i << 24) | (i < 0 ? 0xFFFFFF : 0))) { SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl); } diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 1574aa3fb23a5..c17108fa92309 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -18,8 +18,11 @@ #include "PPCGenInstrInfo.inc" #include "PPCTargetMachine.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -36,67 +39,6 @@ PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm) : TargetInstrInfoImpl(PPCInsts, array_lengthof(PPCInsts)), TM(tm), RI(*TM.getSubtargetImpl(), *this) {} -bool PPCInstrInfo::isMoveInstr(const MachineInstr& MI, - unsigned& sourceReg, - unsigned& destReg, - unsigned& sourceSubIdx, - unsigned& destSubIdx) const { - sourceSubIdx = destSubIdx = 0; // No sub-registers. 
- - unsigned oc = MI.getOpcode(); - if (oc == PPC::OR || oc == PPC::OR8 || oc == PPC::VOR || - oc == PPC::OR4To8 || oc == PPC::OR8To4) { // or r1, r2, r2 - assert(MI.getNumOperands() >= 3 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - MI.getOperand(2).isReg() && - "invalid PPC OR instruction!"); - if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { - sourceReg = MI.getOperand(1).getReg(); - destReg = MI.getOperand(0).getReg(); - return true; - } - } else if (oc == PPC::ADDI) { // addi r1, r2, 0 - assert(MI.getNumOperands() >= 3 && - MI.getOperand(0).isReg() && - MI.getOperand(2).isImm() && - "invalid PPC ADDI instruction!"); - if (MI.getOperand(1).isReg() && MI.getOperand(2).getImm() == 0) { - sourceReg = MI.getOperand(1).getReg(); - destReg = MI.getOperand(0).getReg(); - return true; - } - } else if (oc == PPC::ORI) { // ori r1, r2, 0 - assert(MI.getNumOperands() >= 3 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - MI.getOperand(2).isImm() && - "invalid PPC ORI instruction!"); - if (MI.getOperand(2).getImm() == 0) { - sourceReg = MI.getOperand(1).getReg(); - destReg = MI.getOperand(0).getReg(); - return true; - } - } else if (oc == PPC::FMR || oc == PPC::FMRSD) { // fmr r1, r2 - assert(MI.getNumOperands() >= 2 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - "invalid PPC FMR instruction"); - sourceReg = MI.getOperand(1).getReg(); - destReg = MI.getOperand(0).getReg(); - return true; - } else if (oc == PPC::MCRF) { // mcrf cr1, cr2 - assert(MI.getNumOperands() >= 2 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - "invalid PPC MCRF instruction"); - sourceReg = MI.getOperand(1).getReg(); - destReg = MI.getOperand(0).getReg(); - return true; - } - return false; -} - unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { switch (MI->getOpcode()) { @@ -524,6 +466,14 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, for (unsigned i = 0, e = NewMIs.size(); i != e; ++i) MBB.insert(MI, NewMIs[i]); + + const MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FrameIdx), + MachineMemOperand::MOStore, /*Offset=*/0, + MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); + NewMIs.back()->addMemOperand(MF, MMO); } void @@ -637,6 +587,14 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs); for (unsigned i = 0, e = NewMIs.size(); i != e; ++i) MBB.insert(MI, NewMIs[i]); + + const MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FrameIdx), + MachineMemOperand::MOLoad, /*Offset=*/0, + MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); + NewMIs.back()->addMemOperand(MF, MMO); } MachineInstr* @@ -667,7 +625,7 @@ unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { const char *AsmStr = MI->getOperand(0).getSymbolName(); return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); } - case PPC::DBG_LABEL: + case PPC::PROLOG_LABEL: case PPC::EH_LABEL: case PPC::GC_LABEL: case PPC::DBG_VALUE: diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index eadb21e217024..fc7b7b3cb8972 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -82,12 +82,6 @@ public: /// virtual const PPCRegisterInfo &getRegisterInfo() const { return RI; } - /// Return true if 
the instruction is a register to register move and return - /// the source and dest operands and their sub-register indices by reference. - virtual bool isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; unsigned isStoreToStackSlot(const MachineInstr *MI, diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 63b4581a37f9a..eb100ec75280a 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -1022,9 +1022,7 @@ let Uses = [RM] in { } } -/// FMR is split into 2 versions, one for 4/8 byte FP, and one for extending. -/// -/// Note that these are defined as pseudo-ops on the PPC970 because they are +/// Note that FMR is defined as a pseudo-op on the PPC970 because such moves are /// often coalesced away and we don't want the dispatch group builder to think /// that they will fill slots (which could cause the load of a LSU reject to /// sneak into a d-group with a store). @@ -1032,10 +1030,6 @@ def FMR : XForm_26<63, 72, (outs F4RC:$frD), (ins F4RC:$frB), "fmr $frD, $frB", FPGeneral, []>, // (set F4RC:$frD, F4RC:$frB) PPC970_Unit_Pseudo; -def FMRSD : XForm_26<63, 72, (outs F8RC:$frD), (ins F4RC:$frB), - "fmr $frD, $frB", FPGeneral, - [(set F8RC:$frD, (fextend F4RC:$frB))]>, - PPC970_Unit_Pseudo; let PPC970_Unit = 3 in { // FPU Operations. // These are artificially split into two different forms, for 4/8 byte FP. @@ -1476,10 +1470,13 @@ def : Pat<(extloadi16 iaddr:$src), (LHZ iaddr:$src)>; def : Pat<(extloadi16 xaddr:$src), (LHZX xaddr:$src)>; -def : Pat<(extloadf32 iaddr:$src), - (FMRSD (LFS iaddr:$src))>; -def : Pat<(extloadf32 xaddr:$src), - (FMRSD (LFSX xaddr:$src))>; +def : Pat<(f64 (extloadf32 iaddr:$src)), + (COPY_TO_REGCLASS (LFS iaddr:$src), F8RC)>; +def : Pat<(f64 (extloadf32 xaddr:$src)), + (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>; + +def : Pat<(f64 (fextend F4RC:$src)), + (COPY_TO_REGCLASS F4RC:$src, F8RC)>; // Memory barriers def : Pat<(membarrier (i32 imm /*ll*/), diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 4d6132a9ec50c..653e143ba407f 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -449,8 +449,8 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II, // Get stack alignments. unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); unsigned MaxAlign = MFI->getMaxAlignment(); - assert(MaxAlign <= TargetAlign && - "Dynamic alloca with large aligns not supported"); + if (MaxAlign > TargetAlign) + report_fatal_error("Dynamic alloca with large aligns not supported"); // Determine the previous frame's address. If FrameSize can't be // represented as 16 bits or we need special alignment, then we load the @@ -580,10 +580,9 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II, MBB.erase(II); } -unsigned +void PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value, - RegScavenger *RS) const { + int SPAdj, RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); // Get the instruction. @@ -622,14 +621,14 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (FPSI && FrameIndex == FPSI && (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) { lowerDynamicAlloc(II, SPAdj, RS); - return 0; + return; } // Special case for pseudo-op SPILL_CR.
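The interface change running through this whole patch is visible here: eliminateFrameIndex now returns void and no longer threads a FrameIndexValue out-parameter, so early exits become a bare return. A sketch of the override shape shared by the Mips, PIC16, PPC, Sparc, and SystemZ headers in this patch (declaration only, copied in spirit from those headers):

    // 2.8-era hook signature after this patch; per-target overrides simply
    // replace "return 0;" with "return;".
    virtual void eliminateFrameIndex(MachineBasicBlock::iterator II,
                                     int SPAdj,
                                     RegScavenger *RS = NULL) const;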
if (EnableRegisterScavenging) // FIXME (64-bit): Enable by default. if (OpC == PPC::SPILL_CR) { lowerCRSpilling(II, FrameIndex, SPAdj, RS); - return 0; + return; } // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP). @@ -674,7 +673,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (isIXAddr) Offset >>= 2; // The actual encoded value has the low two bits zero. MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); - return 0; + return; } // The offset doesn't fit into a single register, scavenge one to build the @@ -710,11 +709,10 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } else { OperandBase = OffsetOperandNo; } - + unsigned StackReg = MI.getOperand(FIOperandNo).getReg(); MI.getOperand(OperandBase).ChangeToRegister(StackReg, false); MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false); - return 0; } /// VRRegNo - Map from a numbered VR register to its enum value. @@ -1318,7 +1316,7 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { if (needsFrameMoves) { // Mark effective beginning of when frame pointer becomes valid. FrameLabel = MMI.getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, dl, TII.get(PPC::DBG_LABEL)).addSym(FrameLabel); + BuildMI(MBB, MBBI, dl, TII.get(PPC::PROLOG_LABEL)).addSym(FrameLabel); // Show update of SP. if (NegFrameSize) { @@ -1361,7 +1359,7 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { ReadyLabel = MMI.getContext().CreateTempSymbol(); // Mark effective beginning of when frame pointer is ready. - BuildMI(MBB, MBBI, dl, TII.get(PPC::DBG_LABEL)).addSym(ReadyLabel); + BuildMI(MBB, MBBI, dl, TII.get(PPC::PROLOG_LABEL)).addSym(ReadyLabel); MachineLocation FPDst(HasFP ? (isPPC64 ? PPC::X31 : PPC::R31) : (isPPC64 ? PPC::X1 : PPC::R1)); diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index f026847a540b3..890b24b9c0a8b 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -63,9 +63,8 @@ public: int SPAdj, RegScavenger *RS) const; void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex, int SPAdj, RegScavenger *RS) const; - unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; /// determineFrameLayout - Determine the size of the frame and maximum call /// frame size. diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index 40914ba62a70e..5d46065d96f22 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -69,6 +69,7 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &FS, , HasFSQRT(false) , HasSTFIWX(false) , HasLazyResolverStubs(false) + , IsJITCodeModel(false) , DarwinVers(0) { // Determine default and user specified characteristics @@ -117,6 +118,9 @@ void PPCSubtarget::SetJITMode() { // everything is. This matters for PPC64, which codegens in PIC mode without // stubs. 
HasLazyResolverStubs = false; + + // Calls to external functions need to use indirect calls + IsJITCodeModel = true; } diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 75fcf6238a27a..00ec7474c9e39 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -63,6 +63,7 @@ protected: bool HasFSQRT; bool HasSTFIWX; bool HasLazyResolverStubs; + bool IsJITCodeModel; /// DarwinVers - Nonzero if this is a darwin platform. Otherwise, the numeric /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc. @@ -124,6 +125,9 @@ public: bool hasLazyResolverStub(const GlobalValue *GV, const TargetMachine &TM) const; + // isJITCodeModel - True if we're generating code for the JIT + bool isJITCodeModel() const { return IsJITCodeModel; } + // Specific obvious features. bool hasFSQRT() const { return HasFSQRT; } bool hasSTFIWX() const { return HasSTFIWX; } diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 4d7ee08de1dee..4faf8bcfd4199 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -1919,5 +1919,21 @@ something like the following, which eliminates a branch: ret .LBB0_2: jmp foo # TAILCALL +//===---------------------------------------------------------------------===// +Given a branch where the two target blocks are identical ("ret i32 %b" in +both), simplifycfg will simplify them away. But not so for a switch statement: + +define i32 @f(i32 %a, i32 %b) nounwind readnone { +entry: + switch i32 %a, label %bb3 [ + i32 4, label %bb + i32 6, label %bb + ] +bb: ; preds = %entry, %entry + ret i32 %b + +bb3: ; preds = %entry + ret i32 %b +} //===---------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp index 9e148ada8853e..aae5da8560056 100644 --- a/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -32,7 +32,7 @@ namespace { static char ID; Filler(TargetMachine &tm) - : MachineFunctionPass(&ID), TM(tm), TII(tm.getInstrInfo()) { } + : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { } virtual const char *getPassName() const { return "SPARC Delay Slot Filler"; diff --git a/lib/Target/Sparc/FPMover.cpp b/lib/Target/Sparc/FPMover.cpp index 88b0927b35500..1423b1e64d66e 100644 --- a/lib/Target/Sparc/FPMover.cpp +++ b/lib/Target/Sparc/FPMover.cpp @@ -36,7 +36,7 @@ namespace { static char ID; explicit FPMover(TargetMachine &tm) - : MachineFunctionPass(&ID), TM(tm) { } + : MachineFunctionPass(ID), TM(tm) { } virtual const char *getPassName() const { return "Sparc Double-FP Move Fixer"; diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td index 925d782d988be..764336665d0bb 100644 --- a/lib/Target/Sparc/Sparc.td +++ b/lib/Target/Sparc/Sparc.td @@ -1,4 +1,4 @@ -//===- Sparc.td - Describe the Sparc Target Machine -------------*- C++ -*-===// +//===- Sparc.td - Describe the Sparc Target Machine --------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp index 698923e3c9e08..4ea94c4cb560f 100644 --- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -84,7 +84,7 @@ bool SparcDAGToDAGISel::SelectADDRri(SDNode *Op, SDValue Addr, if (Addr.getOpcode() == ISD::ADD) { if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { - if (Predicate_simm13(CN)) { + if (isInt<13>(CN->getSExtValue())) { if 
(FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) { // Constant offset from frame ref. @@ -120,9 +120,9 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDNode *Op, SDValue Addr, return false; // direct calls. if (Addr.getOpcode() == ISD::ADD) { - if (isa<ConstantSDNode>(Addr.getOperand(1)) && - Predicate_simm13(Addr.getOperand(1).getNode())) - return false; // Let the reg+imm pattern catch this! + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) + if (isInt<13>(CN->getSExtValue())) + return false; // Let the reg+imm pattern catch this! if (Addr.getOperand(0).getOpcode() == SPISD::Lo || Addr.getOperand(1).getOpcode() == SPISD::Lo) return false; // Let the reg+imm pattern catch this! diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp index 3a4c80ad076a4..7ede8e7ebbe46 100644 --- a/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -28,46 +28,6 @@ SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST) RI(ST, *this), Subtarget(ST) { } -static bool isZeroImm(const MachineOperand &op) { - return op.isImm() && op.getImm() == 0; -} - -/// Return true if the instruction is a register to register move and -/// leave the source and dest operands in the passed parameters. -/// -bool SparcInstrInfo::isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSR, unsigned &DstSR) const { - SrcSR = DstSR = 0; // No sub-registers. - - // We look for 3 kinds of patterns here: - // or with G0 or 0 - // add with G0 or 0 - // fmovs or FpMOVD (pseudo double move). - if (MI.getOpcode() == SP::ORrr || MI.getOpcode() == SP::ADDrr) { - if (MI.getOperand(1).getReg() == SP::G0) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(2).getReg(); - return true; - } else if (MI.getOperand(2).getReg() == SP::G0) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - } - } else if ((MI.getOpcode() == SP::ORri || MI.getOpcode() == SP::ADDri) && - isZeroImm(MI.getOperand(2)) && MI.getOperand(1).isReg()) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - } else if (MI.getOpcode() == SP::FMOVS || MI.getOpcode() == SP::FpMOVD || - MI.getOpcode() == SP::FMOVD) { - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - return true; - } - return false; -} - /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h index 133471857bad6..c00bd2198765c 100644 --- a/lib/Target/Sparc/SparcInstrInfo.h +++ b/lib/Target/Sparc/SparcInstrInfo.h @@ -43,12 +43,6 @@ public: /// virtual const SparcRegisterInfo &getRegisterInfo() const { return RI; } - /// Return true if the instruction is a register to register move and return - /// the source and dest operands and their sub-register indices by reference. - virtual bool isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. 
If diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index ddadd51a93a42..467ed48487adf 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -43,17 +43,9 @@ def UseDeprecatedInsts : Predicate<"Subtarget.useDeprecatedV8Instructions()">; // Instruction Pattern Stuff //===----------------------------------------------------------------------===// -def simm11 : PatLeaf<(imm), [{ - // simm11 predicate - True if the imm fits in a 11-bit sign extended field. - return (((int)N->getZExtValue() << (32-11)) >> (32-11)) == - (int)N->getZExtValue(); -}]>; +def simm11 : PatLeaf<(imm), [{ return isInt<11>(N->getSExtValue()); }]>; -def simm13 : PatLeaf<(imm), [{ - // simm13 predicate - True if the imm fits in a 13-bit sign extended field. - return (((int)N->getZExtValue() << (32-13)) >> (32-13)) == - (int)N->getZExtValue(); -}]>; +def simm13 : PatLeaf<(imm), [{ return isInt<13>(N->getSExtValue()); }]>; def LO10 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant((unsigned)N->getZExtValue() & 1023, diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp index 427cc7fd45774..c85db20d2b748 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -69,10 +69,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MBB.erase(I); } -unsigned +void SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value, - RegScavenger *RS) const { + int SPAdj, RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); unsigned i = 0; @@ -108,7 +107,6 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(i).ChangeToRegister(SP::G1, false); MI.getOperand(i+1).ChangeToImmediate(Offset & ((1 << 10)-1)); } - return 0; } void SparcRegisterInfo:: diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h index 9f0cda707b3ec..020ce567c9568 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.h +++ b/lib/Target/Sparc/SparcRegisterInfo.h @@ -40,9 +40,8 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; - unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index c03864fe41e4f..367bed3a85395 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -141,31 +141,6 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(KillSrc)); } -bool -SystemZInstrInfo::isMoveInstr(const MachineInstr& MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const { - switch (MI.getOpcode()) { - default: - return false; - case SystemZ::MOV32rr: - case SystemZ::MOV64rr: - case SystemZ::MOV64rrP: - case SystemZ::MOV128rr: - case SystemZ::FMOV32rr: - case SystemZ::FMOV64rr: - assert(MI.getNumOperands() >= 2 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - "invalid register-register move instruction"); - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - SrcSubIdx = MI.getOperand(1).getSubReg(); - 
DstSubIdx = MI.getOperand(0).getSubReg(); - return true; - } -} - unsigned SystemZInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { switch (MI->getOpcode()) { diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h index 0559619248a63..c248f2489c493 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/lib/Target/SystemZ/SystemZInstrInfo.h @@ -65,9 +65,6 @@ public: unsigned DestReg, unsigned SrcReg, bool KillSrc) const; - bool isMoveInstr(const MachineInstr& MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const; diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index ae96b0b08ff62..f8d3e6ac8a6fb 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -92,10 +92,9 @@ int SystemZRegisterInfo::getFrameIndexOffset(const MachineFunction &MF, return Offset; } -unsigned +void SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value, - RegScavenger *RS) const { + int SPAdj, RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); unsigned i = 0; @@ -117,13 +116,13 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Offset is either a 12-bit unsigned or 20-bit signed integer. // FIXME: handle "too long" displacements. - int Offset = getFrameIndexOffset(MF, FrameIndex) + MI.getOperand(i+1).getImm(); + int Offset = + getFrameIndexOffset(MF, FrameIndex) + MI.getOperand(i+1).getImm(); // Check whether displacement is too long to fit into 12 bit zext field. MI.setDesc(TII.getMemoryInstr(MI.getOpcode(), Offset)); MI.getOperand(i+1).ChangeToImmediate(Offset); - return 0; } void diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h index 670025f86e08f..5dae865cb79a0 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -34,7 +34,7 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo { BitVector getReservedRegs(const MachineFunction &MF) const; - bool hasReservedCallFrame(MachineFunction &MF) const { return true; } + bool hasReservedCallFrame(const MachineFunction &MF) const { return true; } bool hasFP(const MachineFunction &MF) const; int getFrameIndexOffset(const MachineFunction &MF, int FI) const; @@ -43,9 +43,8 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo { MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; - unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp index 5870d8a87004d..f35c96dadcee5 100644 --- a/lib/Target/TargetData.cpp +++ b/lib/Target/TargetData.cpp @@ -34,8 +34,7 @@ using namespace llvm; // Handle the Pass registration stuff necessary to use TargetData's. // Register the default SparcV9 implementation...
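The TargetData hunk below replaces the static RegisterPass object with the new INITIALIZE_PASS macro, matching the constructor change seen throughout this patch (passes now pass the ID char itself rather than &ID). A minimal sketch of the resulting boilerplate; MyPass and its strings are hypothetical, not from this patch:

    #include "llvm/Pass.h"   // assumed to pull in the INITIALIZE_PASS macro in this tree
    using namespace llvm;

    namespace {
      struct MyPass : public ImmutablePass {  // hypothetical example pass
        static char ID;
        MyPass() : ImmutablePass(ID) {}       // was: ImmutablePass(&ID)
      };
    }
    char MyPass::ID = 0;

    // Replaces: static RegisterPass<MyPass> X("mypass", "My pass", false, true);
    INITIALIZE_PASS(MyPass, "mypass", "My pass", false, true);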
-static RegisterPass<TargetData> X("targetdata", "Target Data Layout", false, - true); +INITIALIZE_PASS(TargetData, "targetdata", "Target Data Layout", false, true); char TargetData::ID = 0; //===----------------------------------------------------------------------===// @@ -98,8 +97,8 @@ unsigned StructLayout::getElementContainingOffset(uint64_t Offset) const { //===----------------------------------------------------------------------===// TargetAlignElem -TargetAlignElem::get(AlignTypeEnum align_type, unsigned char abi_align, - unsigned char pref_align, uint32_t bit_width) { +TargetAlignElem::get(AlignTypeEnum align_type, unsigned abi_align, + unsigned pref_align, uint32_t bit_width) { assert(abi_align <= pref_align && "Preferred alignment worse than ABI!"); TargetAlignElem retval; retval.AlignType = align_type; @@ -197,10 +196,10 @@ void TargetData::init(StringRef Desc) { } unsigned Size = getInt(Specifier.substr(1)); Split = Token.split(':'); - unsigned char ABIAlign = getInt(Split.first) / 8; + unsigned ABIAlign = getInt(Split.first) / 8; Split = Split.second.split(':'); - unsigned char PrefAlign = getInt(Split.first) / 8; + unsigned PrefAlign = getInt(Split.first) / 8; if (PrefAlign == 0) PrefAlign = ABIAlign; setAlignment(AlignType, ABIAlign, PrefAlign, Size); @@ -227,19 +226,19 @@ void TargetData::init(StringRef Desc) { /// /// @note This has to exist, because this is a pass, but it should never be /// used. -TargetData::TargetData() : ImmutablePass(&ID) { +TargetData::TargetData() : ImmutablePass(ID) { report_fatal_error("Bad TargetData ctor used. " "Tool did not specify a TargetData to use?"); } TargetData::TargetData(const Module *M) - : ImmutablePass(&ID) { + : ImmutablePass(ID) { init(M->getDataLayout()); } void -TargetData::setAlignment(AlignTypeEnum align_type, unsigned char abi_align, - unsigned char pref_align, uint32_t bit_width) { +TargetData::setAlignment(AlignTypeEnum align_type, unsigned abi_align, + unsigned pref_align, uint32_t bit_width) { assert(abi_align <= pref_align && "Preferred alignment worse than ABI!"); for (unsigned i = 0, e = Alignments.size(); i != e; ++i) { if (Alignments[i].AlignType == align_type && @@ -455,15 +454,6 @@ uint64_t TargetData::getTypeSizeInBits(const Type *Ty) const { case Type::StructTyID: // Get the layout annotation... which is lazily created on demand. return getStructLayout(cast<StructType>(Ty))->getSizeInBits(); - case Type::UnionTyID: { - const UnionType *UnTy = cast<UnionType>(Ty); - uint64_t Size = 0; - for (UnionType::element_iterator i = UnTy->element_begin(), - e = UnTy->element_end(); i != e; ++i) { - Size = std::max(Size, getTypeSizeInBits(*i)); - } - return Size; - } case Type::IntegerTyID: return cast<IntegerType>(Ty)->getBitWidth(); case Type::VoidTyID: @@ -496,7 +486,7 @@ uint64_t TargetData::getTypeSizeInBits(const Type *Ty) const { Get the ABI (\a abi_or_pref == true) or preferred alignment (\a abi_or_pref == false) for the requested type \a Ty. */ -unsigned char TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const { +unsigned TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const { int AlignType = -1; assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!"); @@ -518,18 +508,7 @@ unsigned char TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const { // Get the layout annotation... which is lazily created on demand. 
const StructLayout *Layout = getStructLayout(cast<StructType>(Ty)); unsigned Align = getAlignmentInfo(AGGREGATE_ALIGN, 0, abi_or_pref, Ty); - return std::max(Align, (unsigned)Layout->getAlignment()); - } - case Type::UnionTyID: { - const UnionType *UnTy = cast<UnionType>(Ty); - unsigned Align = 1; - - // Unions need the maximum alignment of all their entries - for (UnionType::element_iterator i = UnTy->element_begin(), - e = UnTy->element_end(); i != e; ++i) { - Align = std::max(Align, (unsigned)getAlignment(*i, abi_or_pref)); - } - return Align; + return std::max(Align, Layout->getAlignment()); } case Type::IntegerTyID: case Type::VoidTyID: @@ -556,18 +535,18 @@ unsigned char TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const { abi_or_pref, Ty); } -unsigned char TargetData::getABITypeAlignment(const Type *Ty) const { +unsigned TargetData::getABITypeAlignment(const Type *Ty) const { return getAlignment(Ty, true); } /// getABIIntegerTypeAlignment - Return the minimum ABI-required alignment for /// an integer type of the specified bitwidth. -unsigned char TargetData::getABIIntegerTypeAlignment(unsigned BitWidth) const { +unsigned TargetData::getABIIntegerTypeAlignment(unsigned BitWidth) const { return getAlignmentInfo(INTEGER_ALIGN, BitWidth, true, 0); } -unsigned char TargetData::getCallFrameTypeAlignment(const Type *Ty) const { +unsigned TargetData::getCallFrameTypeAlignment(const Type *Ty) const { for (unsigned i = 0, e = Alignments.size(); i != e; ++i) if (Alignments[i].AlignType == STACK_ALIGN) return Alignments[i].ABIAlign; @@ -575,12 +554,12 @@ unsigned char TargetData::getCallFrameTypeAlignment(const Type *Ty) const { return getABITypeAlignment(Ty); } -unsigned char TargetData::getPrefTypeAlignment(const Type *Ty) const { +unsigned TargetData::getPrefTypeAlignment(const Type *Ty) const { return getAlignment(Ty, false); } -unsigned char TargetData::getPreferredTypeAlignmentShift(const Type *Ty) const { - unsigned Align = (unsigned) getPrefTypeAlignment(Ty); +unsigned TargetData::getPreferredTypeAlignmentShift(const Type *Ty) const { + unsigned Align = getPrefTypeAlignment(Ty); assert(!(Align & (Align-1)) && "Alignment is not a power of two!"); return Log2_32(Align); } @@ -615,18 +594,13 @@ uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices, // Update Ty to refer to current element Ty = STy->getElementType(FieldNo); - } else if (const UnionType *UnTy = dyn_cast<UnionType>(*TI)) { - unsigned FieldNo = cast<ConstantInt>(Indices[CurIDX])->getZExtValue(); - - // Offset into union is canonically 0, but type changes - Ty = UnTy->getElementType(FieldNo); } else { // Update Ty to refer to current element Ty = cast<SequentialType>(Ty)->getElementType(); // Get the array index and the size of each array element. 
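The getIndexedOffset code that follows switches from a signed to an explicitly unsigned multiply. A standalone C++ illustration of why that matters (this is not LLVM code):

    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t arrayIdx = -1;    // negative GEP indices are legal
      uint64_t eltSize = 8;     // stands in for getTypeAllocSize(Ty)
      uint64_t result = 0;
      // Unsigned multiply: the conversion of arrayIdx wraps modulo 2^64, so
      // a negative index still subtracts from the running offset, and there
      // is no signed-overflow UB for huge index*size products.
      result += (uint64_t)arrayIdx * eltSize;
      assert(result == (uint64_t)-8);
      return 0;
    }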
if (int64_t arrayIdx = cast<ConstantInt>(Indices[CurIDX])->getSExtValue()) - Result += arrayIdx * (int64_t)getTypeAllocSize(Ty); + Result += (uint64_t)arrayIdx * getTypeAllocSize(Ty); } } diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index 47c91df1400e6..705b1c097e55f 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -30,7 +30,8 @@ namespace llvm { bool NoFramePointerElimNonLeaf; bool NoExcessFPPrecision; bool UnsafeFPMath; - bool FiniteOnlyFPMathOption; + bool NoInfsFPMath; + bool NoNaNsFPMath; bool HonorSignDependentRoundingFPMathOption; bool UseSoftFloat; FloatABI::ABIType FloatABIType; @@ -80,9 +81,14 @@ EnableUnsafeFPMath("enable-unsafe-fp-math", cl::location(UnsafeFPMath), cl::init(false)); static cl::opt<bool, true> -EnableFiniteOnlyFPMath("enable-finite-only-fp-math", - cl::desc("Enable optimizations that assumes non- NaNs / +-Infs"), - cl::location(FiniteOnlyFPMathOption), +EnableNoInfsFPMath("enable-no-infs-fp-math", + cl::desc("Enable FP math optimizations that assume no +-Infs"), + cl::location(NoInfsFPMath), + cl::init(false)); +static cl::opt<bool, true> +EnableNoNaNsFPMath("enable-no-nans-fp-math", + cl::desc("Enable FP math optimizations that assume no NaNs"), + cl::location(NoNaNsFPMath), cl::init(false)); static cl::opt<bool, true> EnableHonorSignDependentRoundingFPMath("enable-sign-dependent-rounding-fp-math", @@ -290,12 +296,6 @@ namespace llvm { /// result is "less precise" than doing those operations individually. bool LessPreciseFPMAD() { return UnsafeFPMath || LessPreciseFPMADOption; } - /// FiniteOnlyFPMath - This returns true when the -enable-finite-only-fp-math - /// option is specified on the command line. If this returns false (default), - /// the code generator is not allowed to assume that FP arithmetic arguments - /// and results are never NaNs or +-Infs. - bool FiniteOnlyFPMath() { return FiniteOnlyFPMathOption; } - /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume /// that the rounding mode of the FPU can change from its default. bool HonorSignDependentRoundingFPMath() { diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp index 49bfad54136d3..55f222c7c1c95 100644 --- a/lib/Target/TargetRegisterInfo.cpp +++ b/lib/Target/TargetRegisterInfo.cpp @@ -63,7 +63,7 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, EVT VT) const { /// getAllocatableSetForRC - Toggle the bits that represent allocatable /// registers for the specific register class. 
static void getAllocatableSetForRC(const MachineFunction &MF, - const TargetRegisterClass *RC, BitVector &R){ + const TargetRegisterClass *RC, BitVector &R){ for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF), E = RC->allocation_order_end(MF); I != E; ++I) R.set(*I); @@ -74,12 +74,16 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, BitVector Allocatable(NumRegs); if (RC) { getAllocatableSetForRC(MF, RC, Allocatable); - return Allocatable; + } else { + for (TargetRegisterInfo::regclass_iterator I = regclass_begin(), + E = regclass_end(); I != E; ++I) + getAllocatableSetForRC(MF, *I, Allocatable); } - for (TargetRegisterInfo::regclass_iterator I = regclass_begin(), - E = regclass_end(); I != E; ++I) - getAllocatableSetForRC(MF, *I, Allocatable); + // Mask out the reserved registers + BitVector Reserved = getReservedRegs(MF); + Allocatable ^= Reserved & Allocatable; + return Allocatable; } diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index f1e66ab9d2c3f..f8588d818b75d 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -9,6 +9,8 @@ #include "llvm/Target/TargetAsmParser.h" #include "X86.h" +#include "X86Subtarget.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" @@ -19,6 +21,7 @@ #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegistry.h" #include "llvm/Target/TargetAsmParser.h" using namespace llvm; @@ -28,6 +31,7 @@ struct X86Operand; class X86ATTAsmParser : public TargetAsmParser { MCAsmParser &Parser; + TargetMachine &TM; protected: unsigned Is64Bit : 1; @@ -37,8 +41,6 @@ private: MCAsmLexer &getLexer() const { return Parser.getLexer(); } - void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } - bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); @@ -48,13 +50,14 @@ private: bool ParseDirectiveWord(unsigned Size, SMLoc L); - void InstructionCleanup(MCInst &Inst); + bool MatchInstruction(SMLoc IDLoc, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCInst &Inst); - /// @name Auto-generated Match Functions + /// @name Auto-generated Matcher Functions /// { - bool MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCInst &Inst); + unsigned ComputeAvailableFeatures(const X86Subtarget *Subtarget) const; bool MatchInstructionImpl( const SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCInst &Inst); @@ -62,27 +65,32 @@ private: /// } public: - X86ATTAsmParser(const Target &T, MCAsmParser &_Parser) - : TargetAsmParser(T), Parser(_Parser) {} + X86ATTAsmParser(const Target &T, MCAsmParser &_Parser, TargetMachine &TM) + : TargetAsmParser(T), Parser(_Parser), TM(TM) { + + // Initialize the set of available features. 
+ setAvailableFeatures(ComputeAvailableFeatures( + &TM.getSubtarget<X86Subtarget>())); + } virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands); virtual bool ParseDirective(AsmToken DirectiveID); }; - + class X86_32ATTAsmParser : public X86ATTAsmParser { public: - X86_32ATTAsmParser(const Target &T, MCAsmParser &_Parser) - : X86ATTAsmParser(T, _Parser) { + X86_32ATTAsmParser(const Target &T, MCAsmParser &_Parser, TargetMachine &TM) + : X86ATTAsmParser(T, _Parser, TM) { Is64Bit = false; } }; class X86_64ATTAsmParser : public X86ATTAsmParser { public: - X86_64ATTAsmParser(const Target &T, MCAsmParser &_Parser) - : X86ATTAsmParser(T, _Parser) { + X86_64ATTAsmParser(const Target &T, MCAsmParser &_Parser, TargetMachine &TM) + : X86ATTAsmParser(T, _Parser, TM) { Is64Bit = true; } }; @@ -90,7 +98,7 @@ public: } // end anonymous namespace /// @name Auto-generated Match Functions -/// { +/// { static unsigned MatchRegisterName(StringRef Name); @@ -109,7 +117,7 @@ struct X86Operand : public MCParsedAsmOperand { } Kind; SMLoc StartLoc, EndLoc; - + union { struct { const char *Data; @@ -141,6 +149,8 @@ struct X86Operand : public MCParsedAsmOperand { /// getEndLoc - Get the location of the last token of this operand. SMLoc getEndLoc() const { return EndLoc; } + virtual void dump(raw_ostream &OS) const {} + StringRef getToken() const { assert(Kind == Token && "Invalid access!"); return StringRef(Tok.Data, Tok.Length); @@ -185,7 +195,7 @@ struct X86Operand : public MCParsedAsmOperand { bool isToken() const {return Kind == Token; } bool isImm() const { return Kind == Immediate; } - + bool isImmSExti16i8() const { if (!isImm()) return false; @@ -260,10 +270,6 @@ struct X86Operand : public MCParsedAsmOperand { !getMemIndexReg() && getMemScale() == 1; } - bool isNoSegMem() const { - return Kind == Memory && !getMemSegReg(); - } - bool isReg() const { return Kind == Register; } void addExpr(MCInst &Inst, const MCExpr *Expr) const { @@ -298,14 +304,6 @@ struct X86Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::CreateExpr(getMemDisp())); } - void addNoSegMemOperands(MCInst &Inst, unsigned N) const { - assert((N == 4) && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getMemBaseReg())); - Inst.addOperand(MCOperand::CreateImm(getMemScale())); - Inst.addOperand(MCOperand::CreateReg(getMemIndexReg())); - addExpr(Inst, getMemDisp()); - } - static X86Operand *CreateToken(StringRef Str, SMLoc Loc) { X86Operand *Res = new X86Operand(Token, Loc, Loc); Res->Tok.Data = Str.data(); @@ -376,13 +374,19 @@ bool X86ATTAsmParser::ParseRegister(unsigned &RegNo, // FIXME: Validate register for the current architecture; we have to do // validation later, so maybe there is no need for this here. RegNo = MatchRegisterName(Tok.getString()); - + + // FIXME: This should be done using Requires<In32BitMode> and + // Requires<In64BitMode> so "eiz" usage in 64-bit instructions + // can be also checked. + if (RegNo == X86::RIZ && !Is64Bit) + return Error(Tok.getLoc(), "riz register in 64-bit mode only"); + // Parse %st(1) and "%st" as "%st(0)" if (RegNo == 0 && Tok.getString() == "st") { RegNo = X86::ST0; EndLoc = Tok.getLoc(); Parser.Lex(); // Eat 'st' - + // Check to see if we have '(4)' after %st. 
if (getLexer().isNot(AsmToken::LParen)) return false; @@ -403,15 +407,15 @@ bool X86ATTAsmParser::ParseRegister(unsigned &RegNo, case 7: RegNo = X86::ST7; break; default: return Error(IntTok.getLoc(), "invalid stack index"); } - + if (getParser().Lex().isNot(AsmToken::RParen)) return Error(Parser.getTok().getLoc(), "expected ')'"); - + EndLoc = Tok.getLoc(); Parser.Lex(); // Eat ')' return false; } - + // If this is "db[0-7]", match it as an alias // for dr[0-7]. if (RegNo == 0 && Tok.getString().size() == 3 && @@ -426,14 +430,14 @@ bool X86ATTAsmParser::ParseRegister(unsigned &RegNo, case '6': RegNo = X86::DR6; break; case '7': RegNo = X86::DR7; break; } - + if (RegNo != 0) { EndLoc = Tok.getLoc(); Parser.Lex(); // Eat it. return false; } } - + if (RegNo == 0) return Error(Tok.getLoc(), "invalid register name"); @@ -452,13 +456,17 @@ X86Operand *X86ATTAsmParser::ParseOperand() { unsigned RegNo; SMLoc Start, End; if (ParseRegister(RegNo, Start, End)) return 0; - + if (RegNo == X86::EIZ || RegNo == X86::RIZ) { + Error(Start, "eiz and riz can only be used as index registers"); + return 0; + } + // If this is a segment register followed by a ':', then this is the start // of a memory reference, otherwise this is a normal register reference. if (getLexer().isNot(AsmToken::Colon)) return X86Operand::CreateReg(RegNo, Start, End); - - + + getParser().Lex(); // Eat the colon. return ParseMemOperand(RegNo, Start); } @@ -477,7 +485,7 @@ X86Operand *X86ATTAsmParser::ParseOperand() { /// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix /// has already been parsed if present. X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { - + // We have to disambiguate a parenthesized expression "(4+5)" from the start // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The // only way to do this without lookahead is to eat the '(' and see what is @@ -486,7 +494,7 @@ X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { if (getLexer().isNot(AsmToken::LParen)) { SMLoc ExprEnd; if (getParser().ParseExpression(Disp, ExprEnd)) return 0; - + // After parsing the base expression we could either have a parenthesized // memory address or not. If not, return now. If so, eat the (. if (getLexer().isNot(AsmToken::LParen)) { @@ -495,7 +503,7 @@ X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { return X86Operand::CreateMem(Disp, MemStart, ExprEnd); return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd); } - + // Eat the '('. Parser.Lex(); } else { // Okay, we have a '('. We don't know if this is an expression or not, // so we have to eat the ( to see beyond it. SMLoc LParenLoc = Parser.getTok().getLoc(); Parser.Lex(); // Eat the '('. - + if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) { // Nothing to do here, fall into the code below with the '(' part of the // memory operand consumed. } else { SMLoc ExprEnd; - + // It must be a parenthesized expression, parse it now. if (getParser().ParseParenExpression(Disp, ExprEnd)) return 0; - + // After parsing the base expression we could either have a parenthesized // memory address or not. If not, return now. If so, eat the (.
if (getLexer().isNot(AsmToken::LParen)) { @@ -522,21 +530,25 @@ X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { return X86Operand::CreateMem(Disp, LParenLoc, ExprEnd); return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd); } - + // Eat the '('. Parser.Lex(); } } - + // If we reached here, then we just ate the ( of the memory operand. Process // the rest of the memory operand. unsigned BaseReg = 0, IndexReg = 0, Scale = 1; - + if (getLexer().is(AsmToken::Percent)) { SMLoc L; if (ParseRegister(BaseReg, L, L)) return 0; + if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) { + Error(L, "eiz and riz can only be used as index registers"); + return 0; + } } - + if (getLexer().is(AsmToken::Comma)) { Parser.Lex(); // Eat the comma. @@ -545,11 +557,11 @@ X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { // correctly. // // Note that even though it would be completely consistent to support syntax - // like "1(%eax,,1)", the assembler doesn't. + // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this. if (getLexer().is(AsmToken::Percent)) { SMLoc L; if (ParseRegister(IndexReg, L, L)) return 0; - if (getLexer().isNot(AsmToken::RParen)) { // Parse the scale amount: // ::= ',' [scale-expression] @@ -566,7 +578,7 @@ X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { int64_t ScaleVal; if (getParser().ParseAbsoluteExpression(ScaleVal)) return 0; - + // Validate the scale amount. if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){ Error(Loc, "scale factor in address must be 1, 2, 4 or 8"); @@ -576,19 +588,20 @@ X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { } } } else if (getLexer().isNot(AsmToken::RParen)) { - // Otherwise we have the unsupported form of a scale amount without an - // index. + // A scale amount without an index is ignored. SMLoc Loc = Parser.getTok().getLoc(); int64_t Value; if (getParser().ParseAbsoluteExpression(Value)) return 0; - - Error(Loc, "cannot have scale factor without index register"); - return 0; + + if (Value != 1) + Warning(Loc, "scale factor without index register is ignored"); + Scale = 1; } } - + // Ok, we've eaten the memory operand, verify we have a ')' and eat it too. if (getLexer().isNot(AsmToken::RParen)) { Error(Parser.getTok().getLoc(), "unexpected token in memory operand"); @@ -596,7 +609,7 @@ X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { } SMLoc MemEnd = Parser.getTok().getLoc(); Parser.Lex(); // Eat the ')'.
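The hunk above demotes a bare scale factor from a hard error to a warning. A condensed standalone sketch of the new behavior (plain C++; printf stands in for the parser's Warning/Error diagnostics):

    #include <cstdio>

    // Returns the scale to use, or 0 on a hard error.
    static unsigned parseScale(long value, bool haveIndexReg) {
      if (!haveIndexReg) {
        if (value != 1)
          std::printf("warning: scale factor without index register is ignored\n");
        return 1;                 // a scale is meaningless without an index
      }
      if (value != 1 && value != 2 && value != 4 && value != 8) {
        std::printf("error: scale factor in address must be 1, 2, 4 or 8\n");
        return 0;
      }
      return (unsigned)value;
    }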
- + return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, MemStart, MemEnd); } @@ -743,6 +756,23 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, } } } + + // FIXME: Hack to recognize vpclmul<src1_quadword, src2_quadword>dq + if (PatchedName.startswith("vpclmul")) { + unsigned CLMULQuadWordSelect = StringSwitch<unsigned>( + PatchedName.slice(7, PatchedName.size() - 2)) + .Case("lqlq", 0x00) // src1[63:0], src2[63:0] + .Case("hqlq", 0x01) // src1[127:64], src2[63:0] + .Case("lqhq", 0x10) // src1[63:0], src2[127:64] + .Case("hqhq", 0x11) // src1[127:64], src2[127:64] + .Default(~0U); + if (CLMULQuadWordSelect != ~0U) { + ExtraImmOp = MCConstantExpr::Create(CLMULQuadWordSelect, + getParser().getContext()); + assert(PatchedName.endswith("dq") && "Unexpected mnemonic!"); + PatchedName = "vpclmulqdq"; + } + } Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); if (ExtraImmOp) @@ -785,6 +815,20 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, Operands.erase(Operands.begin() + 1); } + // FIXME: Hack to handle "out[bwl]? %al, (%dx)" -> "outb %al, %dx". + if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") && + Operands.size() == 3) { + X86Operand &Op = *(X86Operand*)Operands.back(); + if (Op.isMem() && Op.Mem.SegReg == 0 && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { + SMLoc Loc = Op.getEndLoc(); + Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); + delete &Op; + } + } + // FIXME: Hack to handle "f{mul*,add*,sub*,div*} $op, st(0)" the same as // "f{mul*,add*,sub*,div*} $op" if ((Name.startswith("fmul") || Name.startswith("fadd") || @@ -796,6 +840,16 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, Operands.erase(Operands.begin() + 2); } + // FIXME: Hack to handle "imul <imm>, B" which is an alias for "imul <imm>, B, + // B". + if (Name.startswith("imul") && Operands.size() == 3 && + static_cast<X86Operand*>(Operands[1])->isImm() && + static_cast<X86Operand*>(Operands.back())->isReg()) { + X86Operand *Op = static_cast<X86Operand*>(Operands.back()); + Operands.push_back(X86Operand::CreateReg(Op->getReg(), Op->getStartLoc(), + Op->getEndLoc())); + } + return false; } @@ -819,7 +873,7 @@ bool X86ATTAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().is(AsmToken::EndOfStatement)) break; - + // FIXME: Improve diagnostic. if (getLexer().isNot(AsmToken::Comma)) return Error(L, "unexpected token in directive"); @@ -831,82 +885,32 @@ bool X86ATTAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { return false; } -/// LowerMOffset - Lower an 'moffset' form of an instruction, which just has a -/// imm operand, to having "rm" or "mr" operands with the offset in the disp -/// field. -static void LowerMOffset(MCInst &Inst, unsigned Opc, unsigned RegNo, - bool isMR) { - MCOperand Disp = Inst.getOperand(0); - - // Start over with an empty instruction. - Inst = MCInst(); - Inst.setOpcode(Opc); - - if (!isMR) - Inst.addOperand(MCOperand::CreateReg(RegNo)); - - // Add the mem operand. 
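For the vpclmul hack in ParseInstruction above, the quadword-selector suffix maps directly to the PCLMULQDQ immediate: bit 0 selects the src1 quadword and bit 4 selects the src2 quadword. A self-contained sketch of that mapping, using plain std::string instead of LLVM's StringSwitch:

#include <cstdio>
#include <string>

// Returns ~0U when the mnemonic is not one of the recognized aliases.
static unsigned clmulImm(const std::string &M) {
  if (M.size() < 13 || M.compare(0, 7, "vpclmul") != 0 ||
      M.compare(M.size() - 2, 2, "dq") != 0)
    return ~0U;
  std::string Sel = M.substr(7, M.size() - 9);  // slice between "vpclmul" and "dq"
  if (Sel == "lqlq") return 0x00;  // src1[63:0],   src2[63:0]
  if (Sel == "hqlq") return 0x01;  // src1[127:64], src2[63:0]
  if (Sel == "lqhq") return 0x10;  // src1[63:0],   src2[127:64]
  if (Sel == "hqhq") return 0x11;  // src1[127:64], src2[127:64]
  return ~0U;
}

int main() {
  std::printf("0x%02x\n", clmulImm("vpclmulhqhqdq"));  // prints 0x11
}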
- Inst.addOperand(MCOperand::CreateReg(0)); // Segment - Inst.addOperand(MCOperand::CreateImm(1)); // Scale - Inst.addOperand(MCOperand::CreateReg(0)); // IndexReg - Inst.addOperand(Disp); // Displacement - Inst.addOperand(MCOperand::CreateReg(0)); // BaseReg - - if (isMR) - Inst.addOperand(MCOperand::CreateReg(RegNo)); -} - -// FIXME: Custom X86 cleanup function to implement a temporary hack to handle -// matching INCL/DECL correctly for x86_64. This needs to be replaced by a -// proper mechanism for supporting (ambiguous) feature dependent instructions. -void X86ATTAsmParser::InstructionCleanup(MCInst &Inst) { - if (!Is64Bit) return; - - switch (Inst.getOpcode()) { - case X86::DEC16r: Inst.setOpcode(X86::DEC64_16r); break; - case X86::DEC16m: Inst.setOpcode(X86::DEC64_16m); break; - case X86::DEC32r: Inst.setOpcode(X86::DEC64_32r); break; - case X86::DEC32m: Inst.setOpcode(X86::DEC64_32m); break; - case X86::INC16r: Inst.setOpcode(X86::INC64_16r); break; - case X86::INC16m: Inst.setOpcode(X86::INC64_16m); break; - case X86::INC32r: Inst.setOpcode(X86::INC64_32r); break; - case X86::INC32m: Inst.setOpcode(X86::INC64_32m); break; - - // moffset instructions are x86-32 only. - case X86::MOV8o8a: LowerMOffset(Inst, X86::MOV8rm , X86::AL , false); break; - case X86::MOV16o16a: LowerMOffset(Inst, X86::MOV16rm, X86::AX , false); break; - case X86::MOV32o32a: LowerMOffset(Inst, X86::MOV32rm, X86::EAX, false); break; - case X86::MOV8ao8: LowerMOffset(Inst, X86::MOV8mr , X86::AL , true); break; - case X86::MOV16ao16: LowerMOffset(Inst, X86::MOV16mr, X86::AX , true); break; - case X86::MOV32ao32: LowerMOffset(Inst, X86::MOV32mr, X86::EAX, true); break; - } -} bool -X86ATTAsmParser::MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> +X86ATTAsmParser::MatchInstruction(SMLoc IDLoc, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCInst &Inst) { + assert(!Operands.empty() && "Unexpected empty operand list!"); + + X86Operand *Op = static_cast<X86Operand*>(Operands[0]); + assert(Op->isToken() && "Leading operand should always be a mnemonic!"); + // First, try a direct match. if (!MatchInstructionImpl(Operands, Inst)) return false; - // Ignore anything which is obviously not a suffix match. - if (Operands.size() == 0) - return true; - X86Operand *Op = static_cast<X86Operand*>(Operands[0]); - if (!Op->isToken() || Op->getToken().size() > 15) - return true; - // FIXME: Ideally, we would only attempt suffix matches for things which are // valid prefixes, and we could just infer the right unambiguous // type. However, that requires substantially more matcher support than the // following hack. // Change the operand to point to a temporary token. - char Tmp[16]; StringRef Base = Op->getToken(); - memcpy(Tmp, Base.data(), Base.size()); - Op->setTokenValue(StringRef(Tmp, Base.size() + 1)); + SmallString<16> Tmp; + Tmp += Base; + Tmp += ' '; + Op->setTokenValue(Tmp.str()); // Check for the various suffix matches. Tmp[Base.size()] = 'b'; @@ -928,6 +932,38 @@ X86ATTAsmParser::MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> return false; // Otherwise, the match failed. + + // If we had multiple suffix matches, then identify this as an ambiguous + // match.
+ if (MatchB + MatchW + MatchL + MatchQ != 4) { + char MatchChars[4]; + unsigned NumMatches = 0; + if (!MatchB) + MatchChars[NumMatches++] = 'b'; + if (!MatchW) + MatchChars[NumMatches++] = 'w'; + if (!MatchL) + MatchChars[NumMatches++] = 'l'; + if (!MatchQ) + MatchChars[NumMatches++] = 'q'; + + SmallString<126> Msg; + raw_svector_ostream OS(Msg); + OS << "ambiguous instructions require an explicit suffix (could be "; + for (unsigned i = 0; i != NumMatches; ++i) { + if (i != 0) + OS << ", "; + if (i + 1 == NumMatches) + OS << "or "; + OS << "'" << Base << MatchChars[i] << "'"; + } + OS << ")"; + Error(IDLoc, OS.str()); + } else { + // FIXME: We should give nicer diagnostics about the exact failure. + Error(IDLoc, "unrecognized instruction"); + } + return true; } diff --git a/lib/Target/X86/AsmPrinter/CMakeLists.txt b/lib/Target/X86/AsmPrinter/CMakeLists.txt index b70a587ec4e24..033973eeeff93 100644 --- a/lib/Target/X86/AsmPrinter/CMakeLists.txt +++ b/lib/Target/X86/AsmPrinter/CMakeLists.txt @@ -2,8 +2,7 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/ add_llvm_library(LLVMX86AsmPrinter X86ATTInstPrinter.cpp - X86AsmPrinter.cpp X86IntelInstPrinter.cpp - X86MCInstLower.cpp + X86InstComments.cpp ) add_dependencies(LLVMX86AsmPrinter X86CodeGenTable_gen) diff --git a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp index f2cdb5ba55eb0..554b96c96e0e5 100644 --- a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp @@ -14,6 +14,7 @@ #define DEBUG_TYPE "asm-printer" #include "X86ATTInstPrinter.h" +#include "X86InstComments.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" @@ -31,6 +32,10 @@ using namespace llvm; void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) { printInstruction(MI, OS); + + // If verbose assembly is enabled, we can print some informative comments. + if (CommentStream) + EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); } StringRef X86ATTInstPrinter::getOpcodeName(unsigned Opcode) const { return getInstructionName(Opcode); diff --git a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h index 3be4bae5bec22..eb986643014c7 100644 --- a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h @@ -56,6 +56,9 @@ public: void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } + void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } diff --git a/lib/Target/X86/AsmPrinter/X86InstComments.cpp b/lib/Target/X86/AsmPrinter/X86InstComments.cpp new file mode 100644 index 0000000000000..da9d5a3579e5f --- /dev/null +++ b/lib/Target/X86/AsmPrinter/X86InstComments.cpp @@ -0,0 +1,232 @@ +//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This defines functionality used to emit comments about X86 instructions to +// an output stream for -fverbose-asm. 
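The fallback logic above can be shown in miniature. First, a hedged sketch of the suffix retry itself (matchImpl is a stand-in for the generated MatchInstructionImpl, with the convention flipped so that true means success): the mnemonic gets one scratch character appended, and each size suffix is tried in turn.

#include <functional>
#include <iostream>
#include <string>

static bool matchWithSuffixes(std::string Mnemonic,
                              const std::function<bool(const std::string&)> &matchImpl) {
  if (matchImpl(Mnemonic))          // first, try a direct match
    return true;
  Mnemonic += ' ';                  // scratch slot for the suffix character
  for (char S : {'b', 'w', 'l', 'q'}) {
    Mnemonic.back() = S;
    if (matchImpl(Mnemonic))
      return true;
  }
  return false;
}

int main() {
  auto Impl = [](const std::string &M) { return M == "movl"; };
  std::cout << matchWithSuffixes("mov", Impl) << '\n';  // 1: matched as "movl"
}

Second, when several suffixes would match, the diagnostic above joins the surviving candidates with commas and an "or" before the last one. A standalone sketch of just that formatting, with std::ostringstream standing in for raw_svector_ostream:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

static std::string ambiguousMsg(const std::string &Base,
                                const std::vector<char> &Suffixes) {
  std::ostringstream OS;
  OS << "ambiguous instructions require an explicit suffix (could be ";
  for (size_t i = 0; i != Suffixes.size(); ++i) {
    if (i != 0) OS << ", ";
    if (i + 1 == Suffixes.size()) OS << "or ";
    OS << "'" << Base << Suffixes[i] << "'";
  }
  OS << ")";
  return OS.str();
}

int main() {
  // ambiguous instructions require an explicit suffix (could be 'cmpb',
  // 'cmpw', 'cmpl', or 'cmpq')
  std::cout << ambiguousMsg("cmp", {'b', 'w', 'l', 'q'}) << '\n';
}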
+// +//===----------------------------------------------------------------------===// + +#include "X86InstComments.h" +#include "X86GenInstrNames.inc" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/raw_ostream.h" +#include "../X86ShuffleDecode.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Top Level Entrypoint +//===----------------------------------------------------------------------===// + +/// EmitAnyX86InstComments - This function decodes x86 instructions and prints +/// newline terminated strings to the specified string if desired. This +/// information is shown in disassembly dumps when verbose assembly is enabled. +void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, + const char *(*getRegName)(unsigned)) { + // If this is a shuffle operation, the switch should fill in this state. + SmallVector<unsigned, 8> ShuffleMask; + const char *DestName = 0, *Src1Name = 0, *Src2Name = 0; + + switch (MI->getOpcode()) { + case X86::INSERTPSrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask); + break; + + case X86::MOVLHPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodeMOVLHPSMask(2, ShuffleMask); + break; + + case X86::MOVHLPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodeMOVHLPSMask(2, ShuffleMask); + break; + + case X86::PSHUFDri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFDmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFMask(4, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::PSHUFHWri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFHWmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFHWMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::PSHUFLWri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFLWmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFLWMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::PUNPCKHBWrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKHBWrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKHMask(16, ShuffleMask); + break; + case X86::PUNPCKHWDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKHWDrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKHMask(8, ShuffleMask); + break; + case X86::PUNPCKHDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKHDQrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKHMask(4, ShuffleMask); + break; + case X86::PUNPCKHQDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKHQDQrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKHMask(2, ShuffleMask); + break; + + case X86::PUNPCKLBWrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::PUNPCKLBWrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKLMask(16, ShuffleMask); + break; + case X86::PUNPCKLWDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKLWDrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKLMask(8, ShuffleMask); + break; + case X86::PUNPCKLDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKLDQrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKLMask(4, ShuffleMask); + break; + case X86::PUNPCKLQDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKLQDQrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKLMask(2, ShuffleMask); + break; + + case X86::SHUFPDrri: + DecodeSHUFPSMask(2, MI->getOperand(3).getImm(), ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + break; + + case X86::SHUFPSrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::SHUFPSrmi: + DecodeSHUFPSMask(4, MI->getOperand(3).getImm(), ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::UNPCKLPDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::UNPCKLPDrm: + DecodeUNPCKLPMask(2, ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + case X86::UNPCKLPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::UNPCKLPSrm: + DecodeUNPCKLPMask(4, ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + case X86::UNPCKHPDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::UNPCKHPDrm: + DecodeUNPCKHPMask(2, ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + case X86::UNPCKHPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::UNPCKHPSrm: + DecodeUNPCKHPMask(4, ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + } + + + // If this was a shuffle operation, print the shuffle mask. + if (!ShuffleMask.empty()) { + if (DestName == 0) DestName = Src1Name; + OS << (DestName ? DestName : "mem") << " = "; + + // If the two sources are the same, canonicalize the input elements to be + // from the first src so that we get larger element spans. + if (Src1Name == Src2Name) { + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if ((int)ShuffleMask[i] >= 0 && // Not sentinel. + ShuffleMask[i] >= e) // From second mask. + ShuffleMask[i] -= e; + } + } + + // The shuffle mask specifies which elements of the src1/src2 fill in the + // destination, with a few sentinel values. Loop through and print them + // out. + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if (i != 0) + OS << ','; + if (ShuffleMask[i] == SM_SentinelZero) { + OS << "zero"; + continue; + } + + // Otherwise, it must come from src1 or src2. Print the span of elements + // that comes from this src. + bool isSrc1 = ShuffleMask[i] < ShuffleMask.size(); + const char *SrcName = isSrc1 ? Src1Name : Src2Name; + OS << (SrcName ? SrcName : "mem") << '['; + bool IsFirst = true; + while (i != e && + (int)ShuffleMask[i] >= 0 && + (ShuffleMask[i] < ShuffleMask.size()) == isSrc1) { + if (!IsFirst) + OS << ','; + else + IsFirst = false; + OS << ShuffleMask[i] % ShuffleMask.size(); + ++i; + } + OS << ']'; + --i; // For loop increments element #. 
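Two details of the comment printer above are worth pinning down with runnable sketches. The PSHUFD-family cases rely on the imm8 packing four 2-bit source-element selectors, least significant pair first; decodePSHUF below is a hypothetical stand-in for DecodePSHUFMask:

#include <cstdio>
#include <vector>

static std::vector<unsigned> decodePSHUF(unsigned NElts, unsigned Imm) {
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i != NElts; ++i) {
    Mask.push_back(Imm % NElts);  // low bits pick the source of element i
    Imm /= NElts;
  }
  return Mask;
}

int main() {
  // "pshufd $0x1b, %xmm0, %xmm1" reverses the four dwords: 0x1b = 0b00011011.
  for (unsigned E : decodePSHUF(4, 0x1b))
    std::printf("%u ", E);        // prints "3 2 1 0"
  std::printf("\n");
}

And the span-merging loop just shown collapses consecutive elements drawn from the same source into one bracketed group. A self-contained re-creation of that loop over a hard-coded mask that takes the low half of each source (the register names are illustrative only):

#include <cstdio>
#include <vector>

int main() {
  const unsigned E = 4;                       // elements per source
  std::vector<unsigned> Mask = {0, 1, 4, 5};  // < E: from src1, >= E: from src2
  const char *Src[2] = {"xmm1", "xmm2"};

  std::printf("xmm0 = ");
  for (unsigned i = 0; i != Mask.size(); ++i) {
    if (i) std::printf(",");
    bool FromSrc1 = Mask[i] < E;
    std::printf("%s[", Src[FromSrc1 ? 0 : 1]);
    bool First = true;
    while (i != Mask.size() && (Mask[i] < E) == FromSrc1) {
      std::printf(First ? "%u" : ",%u", Mask[i] % E);
      First = false;
      ++i;
    }
    std::printf("]");
    --i;                                      // the for loop re-increments
  }
  std::printf("\n");  // prints "xmm0 = xmm1[0,1],xmm2[0,1]"
}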
+ } + //MI->print(OS, 0); + OS << "\n"; + } + +} diff --git a/lib/Target/X86/AsmPrinter/X86InstComments.h b/lib/Target/X86/AsmPrinter/X86InstComments.h new file mode 100644 index 0000000000000..6b86db4f9e5c9 --- /dev/null +++ b/lib/Target/X86/AsmPrinter/X86InstComments.h @@ -0,0 +1,25 @@ +//===-- X86InstComments.h - Generate verbose-asm comments for instrs ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This defines functionality used to emit comments about X86 instructions to +// an output stream for -fverbose-asm. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_INST_COMMENTS_H +#define X86_INST_COMMENTS_H + +namespace llvm { + class MCInst; + class raw_ostream; + void EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, + const char *(*getRegName)(unsigned)); +} + +#endif diff --git a/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp index a632047f6592b..5625b0ea618f8 100644 --- a/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp @@ -14,6 +14,7 @@ #define DEBUG_TYPE "asm-printer" #include "X86IntelInstPrinter.h" +#include "X86InstComments.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" @@ -30,6 +31,10 @@ using namespace llvm; void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) { printInstruction(MI, OS); + + // If verbose assembly is enabled, we can print some informative comments. + if (CommentStream) + EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); } StringRef X86IntelInstPrinter::getOpcodeName(unsigned Opcode) const { return getInstructionName(Opcode); diff --git a/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h index 4d680744dd60b..6f120322742b2 100644 --- a/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h @@ -64,6 +64,10 @@ public: O << "XMMWORD PTR "; printMemReference(MI, OpNo, O); } + void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "YMMWORD PTR "; + printMemReference(MI, OpNo, O); + } void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "DWORD PTR "; printMemReference(MI, OpNo, O); diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 133482036ce1b..e9399f5c83224 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -18,23 +18,24 @@ tablegen(X86GenEDInfo.inc -gen-enhanced-disassembly-info) set(sources SSEDomainFix.cpp X86AsmBackend.cpp - X86CodeEmitter.cpp + X86AsmPrinter.cpp X86COFFMachineModuleInfo.cpp + X86CodeEmitter.cpp X86ELFWriterInfo.cpp + X86FastISel.cpp X86FloatingPoint.cpp - X86FloatingPointRegKill.cpp X86ISelDAGToDAG.cpp X86ISelLowering.cpp X86InstrInfo.cpp X86JITInfo.cpp X86MCAsmInfo.cpp X86MCCodeEmitter.cpp + X86MCInstLower.cpp X86RegisterInfo.cpp + X86SelectionDAGInfo.cpp X86Subtarget.cpp X86TargetMachine.cpp X86TargetObjectFile.cpp - X86FastISel.cpp - X86SelectionDAGInfo.cpp ) if( CMAKE_CL_64 ) @@ -49,4 +50,3 @@ endif() add_llvm_target(X86CodeGen ${sources}) -target_link_libraries (LLVMX86CodeGen LLVMSelectionDAG) diff --git a/lib/Target/X86/README-FPStack.txt b/lib/Target/X86/README-FPStack.txt index 
be28e8b394a42..39efd2dbcf1ad 100644 --- a/lib/Target/X86/README-FPStack.txt +++ b/lib/Target/X86/README-FPStack.txt @@ -27,8 +27,8 @@ def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW, //===---------------------------------------------------------------------===// -The FP stackifier needs to be global. Also, it should handle simple permutates -to reduce number of shuffle instructions, e.g. turning: +The FP stackifier should handle simple permutates to reduce number of shuffle +instructions, e.g. turning: fld P -> fld Q fld Q fld P diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index b6aba93f37383..f96b22f1e2042 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -2,8 +2,46 @@ // Random ideas for the X86 backend: SSE-specific stuff. //===---------------------------------------------------------------------===// -- Consider eliminating the unaligned SSE load intrinsics, replacing them with - unaligned LLVM load instructions. +//===---------------------------------------------------------------------===// + +SSE Variable shift can be custom lowered to something like this, which uses a +small table + unaligned load + shuffle instead of going through memory. + +__m128i_shift_right: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + +... +__m128i shift_right(__m128i value, unsigned long offset) { + return _mm_shuffle_epi8(value, + _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset))); +} + +//===---------------------------------------------------------------------===// + +SSE has instructions for doing operations on complex numbers, we should pattern +match them. Compiling this: + +_Complex float f32(_Complex float A, _Complex float B) { + return A+B; +} + +into: + +_f32: + movdqa %xmm0, %xmm2 + addss %xmm1, %xmm2 + pshufd $16, %xmm2, %xmm2 + pshufd $1, %xmm1, %xmm1 + pshufd $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 + pshufd $16, %xmm0, %xmm1 + movdqa %xmm2, %xmm0 + unpcklps %xmm1, %xmm0 + ret + +seems silly. + //===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index efc0cd82f23e9..a305ae6ec5505 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -1135,13 +1135,6 @@ void test(double *P) { //===---------------------------------------------------------------------===// -handling llvm.memory.barrier on pre SSE2 cpus - -should generate: -lock ; mov %esp, %esp - -//===---------------------------------------------------------------------===// - The generated code on x86 for checking for signed overflow on a multiply the obvious way is much longer than it needs to be. @@ -1870,3 +1863,100 @@ The code produced by gcc is 3 bytes shorter. This sort of construct often shows up with bitfields. 
//===---------------------------------------------------------------------===// + +Take the following C code: +int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } + +We generate the following IR with clang: +define i32 @f(i32 %a, i32 %b) nounwind readnone { +entry: + %tmp = xor i32 %b, %a ; <i32> [#uses=1] + %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1] + %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1] + %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] + ret i32 %conv5 +} + +And the following x86 code: + xorl %esi, %edi + testb $-1, %dil + sete %al + movzbl %al, %eax + ret + +A cmpb instead of the xorl+testb would be one instruction shorter. + +//===---------------------------------------------------------------------===// + +Given the following C code: +int f(int a, int b) { return (signed char)a == (signed char)b; } + +We generate the following IR with clang: +define i32 @f(i32 %a, i32 %b) nounwind readnone { +entry: + %sext = shl i32 %a, 24 ; <i32> [#uses=1] + %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1] + %sext6 = shl i32 %b, 24 ; <i32> [#uses=1] + %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1] + %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1] + %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1] + ret i32 %conv5 +} + +And the following x86 code: + movsbl %sil, %eax + movsbl %dil, %ecx + cmpl %eax, %ecx + sete %al + movzbl %al, %eax + ret + + +It should be possible to eliminate the sign extensions. + +//===---------------------------------------------------------------------===// + +LLVM misses a load+store narrowing opportunity in this code: + +%struct.bf = type { i64, i16, i16, i32 } + +@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2] + +define void @t1() nounwind ssp { +entry: + %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] + %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1] + %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2] + %3 = load i32* %2, align 1 ; <i32> [#uses=1] + %4 = and i32 %3, -65537 ; <i32> [#uses=1] + store i32 %4, i32* %2, align 1 + %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1] + %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1] + %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2] + %8 = load i32* %7, align 1 ; <i32> [#uses=1] + %9 = and i32 %8, -131073 ; <i32> [#uses=1] + store i32 %9, i32* %7, align 1 + ret void +} + +LLVM currently emits this: + + movq bfi(%rip), %rax + andl $-65537, 8(%rax) + movq bfi(%rip), %rax + andl $-131073, 8(%rax) + ret + +It could narrow the loads and stores to emit this: + + movq bfi(%rip), %rax + andb $-2, 10(%rax) + movq bfi(%rip), %rax + andb $-3, 10(%rax) + ret + +The trouble is that there is a TokenFactor between the store and the +load, making it non-trivial to determine if there's anything between +the load and the store which would prohibit narrowing. 
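The narrowing claim above is easy to sanity-check: clearing bit 16 of the i32 field with andl $-65537 touches only byte 2 of the dword, which is why an andb $-2 at displacement +2 (the 10(%rax) versus 8(%rax) in the snippet) is equivalent. A small verification, assuming a little-endian target:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t v = 0xDEADBEEF;
  uint32_t a = v & 0xFFFEFFFFu;                 // andl $-65537: clear bit 16
  uint32_t b = v;
  reinterpret_cast<uint8_t*>(&b)[2] &= 0xFE;    // andb $-2 on byte 2 (LE)
  std::printf("%d\n", a == b);                  // prints 1
}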
+ +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/SSEDomainFix.cpp b/lib/Target/X86/SSEDomainFix.cpp index dab070e1febda..13680c592e01b 100644 --- a/lib/Target/X86/SSEDomainFix.cpp +++ b/lib/Target/X86/SSEDomainFix.cpp @@ -115,7 +115,7 @@ class SSEDomainFixPass : public MachineFunctionPass { unsigned Distance; public: - SSEDomainFixPass() : MachineFunctionPass(&ID) {} + SSEDomainFixPass() : MachineFunctionPass(ID) {} virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 677781d3730e2..27e88505150b4 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -49,11 +49,6 @@ FunctionPass *createX86FloatingPointStackifierPass(); /// crossings. FunctionPass *createSSEDomainFixPass(); -/// createX87FPRegKillInserterPass - This function returns a pass which -/// inserts FP_REG_KILL instructions where needed. -/// -FunctionPass *createX87FPRegKillInserterPass(); - /// createX86CodeEmitterPass - Return a pass that emits the collected X86 code /// to the specified MCE object. FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM, diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index a53f973c1c431..a19f1acffaca8 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -67,6 +67,8 @@ def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", def FeatureAVX : SubtargetFeature<"avx", "HasAVX", "true", "Enable AVX instructions">; +def FeatureCLMUL : SubtargetFeature<"clmul", "HasCLMUL", "true", + "Enable carry-less multiplication instructions">; def FeatureFMA3 : SubtargetFeature<"fma3", "HasFMA3", "true", "Enable three-operand fused multiple-add">; def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", @@ -180,8 +182,6 @@ include "X86CallingConv.td" // Currently the X86 assembly parser only supports ATT syntax. def ATTAsmParser : AsmParser { string AsmParserClassName = "ATTAsmParser"; - string AsmParserInstCleanup = "InstructionCleanup"; - string MatchInstructionName = "MatchInstructionImpl"; int Variant = 0; // Discard comments in assembly strings. 
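The MachineFunctionPass(&ID) to MachineFunctionPass(ID) change in SSEDomainFix.cpp above is part of an LLVM-wide API update: pass identity is still the address of the pass's static char ID, but the base constructor now takes the char by reference and takes the address itself. A toy illustration of the idiom, outside of LLVM:

#include <iostream>

struct PassBase {
  const void *PassID;
  explicit PassBase(char &ID) : PassID(&ID) {}   // new style: pass ID by reference
};

struct MyPass : PassBase {
  static char ID;                 // the value never matters, only the address
  MyPass() : PassBase(ID) {}
};
char MyPass::ID = 0;

int main() {
  MyPass A, B;
  std::cout << (A.PassID == B.PassID) << '\n';   // 1: both share one identity
}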
diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp index 2cf65c11f94aa..69dc967f9d88c 100644 --- a/lib/Target/X86/X86AsmBackend.cpp +++ b/lib/Target/X86/X86AsmBackend.cpp @@ -11,9 +11,11 @@ #include "X86.h" #include "X86FixupKinds.h" #include "llvm/ADT/Twine.h" +#include "llvm/MC/ELFObjectWriter.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MachObjectWriter.h" @@ -190,10 +192,6 @@ public: HasScatteredSymbols = true; } - MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return 0; - } - bool isVirtualSection(const MCSection &Section) const { const MCSectionELF &SE = static_cast<const MCSectionELF&>(Section); return SE.getType() == MCSectionELF::SHT_NOBITS;; @@ -204,12 +202,43 @@ class ELFX86_32AsmBackend : public ELFX86AsmBackend { public: ELFX86_32AsmBackend(const Target &T) : ELFX86AsmBackend(T) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return new ELFObjectWriter(OS, /*Is64Bit=*/false, + /*IsLittleEndian=*/true, + /*HasRelocationAddend=*/false); + } }; class ELFX86_64AsmBackend : public ELFX86AsmBackend { public: ELFX86_64AsmBackend(const Target &T) : ELFX86AsmBackend(T) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return new ELFObjectWriter(OS, /*Is64Bit=*/true, + /*IsLittleEndian=*/true, + /*HasRelocationAddend=*/true); + } +}; + +class WindowsX86AsmBackend : public X86AsmBackend { + bool Is64Bit; +public: + WindowsX86AsmBackend(const Target &T, bool is64Bit) + : X86AsmBackend(T) + , Is64Bit(is64Bit) { + HasScatteredSymbols = true; + } + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createWinCOFFObjectWriter(OS, Is64Bit); + } + + bool isVirtualSection(const MCSection &Section) const { + const MCSectionCOFF &SE = static_cast<const MCSectionCOFF&>(Section); + return SE.getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA; + } }; class DarwinX86AsmBackend : public X86AsmBackend { @@ -290,6 +319,10 @@ TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T, switch (Triple(TT).getOS()) { case Triple::Darwin: return new DarwinX86_32AsmBackend(T); + case Triple::MinGW32: + case Triple::Cygwin: + case Triple::Win32: + return new WindowsX86AsmBackend(T, false); default: return new ELFX86_32AsmBackend(T); } @@ -300,6 +333,10 @@ TargetAsmBackend *llvm::createX86_64AsmBackend(const Target &T, switch (Triple(TT).getOS()) { case Triple::Darwin: return new DarwinX86_64AsmBackend(T); + case Triple::MinGW64: + case Triple::Cygwin: + case Triple::Win32: + return new WindowsX86AsmBackend(T, true); default: return new ELFX86_64AsmBackend(T); } diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 08e6486d5b7a3..20110ad788cd1 100644 --- a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// #include "X86AsmPrinter.h" -#include "X86ATTInstPrinter.h" -#include "X86IntelInstPrinter.h" +#include "AsmPrinter/X86ATTInstPrinter.h" +#include "AsmPrinter/X86IntelInstPrinter.h" #include "X86MCInstLower.h" #include "X86.h" #include "X86COFFMachineModuleInfo.h" @@ -24,6 +24,7 @@ #include "llvm/DerivedTypes.h" #include "llvm/Module.h" #include "llvm/Type.h" +#include "llvm/Analysis/DebugInfo.h" #include "llvm/Assembly/Writer.h" #include 
"llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -35,6 +36,7 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/Support/COFF.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetOptions.h" @@ -218,6 +220,10 @@ void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO = MI->getOperand(OpNo); switch (MO.getType()) { default: llvm_unreachable("Unknown pcrel immediate operand"); + case MachineOperand::MO_Register: + // pc-relativeness was handled when computing the value in the reg. + printOperand(MI, OpNo, O); + return; case MachineOperand::MO_Immediate: O << MO.getImm(); return; @@ -655,6 +661,47 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } } +MachineLocation +X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { + MachineLocation Location; + assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!"); + // Frame address. Currently handles register +- offset only. + + if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm()) + Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm()); + else { + DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + } + return Location; +} + +void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, + raw_ostream &O) { + // Only the target-dependent form of DBG_VALUE should get here. + // Referencing the offset and metadata as NOps-2 and NOps-1 is + // probably portable to other targets; frame pointer location is not. + unsigned NOps = MI->getNumOperands(); + assert(NOps==7); + O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; + // cast away const; DIetc do not take const operands for some reason. + DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); + if (V.getContext().isSubprogram()) + O << DISubprogram(V.getContext()).getDisplayName() << ":"; + O << V.getName(); + O << " <- "; + // Frame address. Currently handles register +- offset only. + O << '['; + if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg()) + printOperand(MI, 0, O); + else + O << "undef"; + O << '+'; printOperand(MI, 3, O); + O << ']'; + O << "+"; + printOperand(MI, NOps-2, O); +} + + //===----------------------------------------------------------------------===// // Target Registry Stuff diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index b5a7f8dc321ad..e61be66c75a2e 100644 --- a/lib/Target/X86/AsmPrinter/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -14,9 +14,9 @@ #ifndef X86ASMPRINTER_H #define X86ASMPRINTER_H -#include "../X86.h" -#include "../X86MachineFunctionInfo.h" -#include "../X86TargetMachine.h" +#include "X86.h" +#include "X86MachineFunctionInfo.h" +#include "X86TargetMachine.h" #include "llvm/ADT/StringSet.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineModuleInfo.h" diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index a6a1e4e573cff..e3409effc3187 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -33,13 +33,19 @@ def RetCC_X86Common : CallingConv<[ CCIfType<[i16], CCAssignToReg<[AX, DX]>>, CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>, CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>, - - // Vector types are returned in XMM0 and XMM1, when they fit. XMMM2 and XMM3 + + // Vector types are returned in XMM0 and XMM1, when they fit. 
XMM2 and XMM3 // can only be used by ABI non-compliant code. If the target doesn't have XMM // registers, it won't have vector types. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + // 256-bit vectors are returned in YMM0 and YMM1, when they fit. YMM2 and YMM3 + // can only be used by ABI non-compliant code. This vector type is only + // supported while using the AVX target feature. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>>, + // MMX vector types are always returned in MM0. If the target doesn't have // MM0, it doesn't support these vector types. CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToReg<[MM0]>>, @@ -164,11 +170,16 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, - + + // The first 8 256-bit vector arguments are passed in YMM registers. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7]>>>, + // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, - + // Long doubles get stack slots whose size and alignment depends on the // subtarget. CCIfType<[f80], CCAssignToStack<0, 0>>, @@ -176,6 +187,10 @@ def CC_X86_64_C : CallingConv<[ // Vectors get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + // __m64 vectors get 8-byte stack slots that are 8-byte aligned. CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>> ]>; @@ -271,9 +286,18 @@ def CC_X86_32_Common : CallingConv<[ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>, + // The first 4 AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, + // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are // passed in the parameter area.
CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 4>>]>; diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index f13669bd741d1..824021c0c882b 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -53,12 +53,12 @@ namespace { public: static char ID; explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce) - : MachineFunctionPass(&ID), II(0), TD(0), TM(tm), + : MachineFunctionPass(ID), II(0), TD(0), TM(tm), MCE(mce), PICBaseOffset(0), Is64BitMode(false), IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} Emitter(X86TargetMachine &tm, CodeEmitter &mce, const X86InstrInfo &ii, const TargetData &td, bool is64) - : MachineFunctionPass(&ID), II(&ii), TD(&td), TM(tm), + : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm), MCE(mce), PICBaseOffset(0), Is64BitMode(is64), IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} @@ -146,6 +146,103 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { return false; } +/// determineREX - Determine if the MachineInstr has to be encoded with a X86-64 +/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand +/// size, and 3) use of X86-64 extended registers. +static unsigned determineREX(const MachineInstr &MI) { + unsigned REX = 0; + const TargetInstrDesc &Desc = MI.getDesc(); + + // Pseudo instructions do not need REX prefix byte. + if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo) + return 0; + if (Desc.TSFlags & X86II::REX_W) + REX |= 1 << 3; + + unsigned NumOps = Desc.getNumOperands(); + if (NumOps) { + bool isTwoAddr = NumOps > 1 && + Desc.getOperandConstraint(1, TOI::TIED_TO) != -1; + + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. + unsigned i = isTwoAddr ? 1 : 0; + for (unsigned e = NumOps; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + if (X86InstrInfo::isX86_64NonExtLowByteReg(Reg)) + REX |= 0x40; + } + } + + switch (Desc.TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: + if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) + REX |= (1 << 0) | (1 << 2); + break; + case X86II::MRMSrcReg: { + if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) + REX |= 1 << 2; + i = isTwoAddr ? 2 : 1; + for (unsigned e = NumOps; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (X86InstrInfo::isX86_64ExtendedReg(MO)) + REX |= 1 << 0; + } + break; + } + case X86II::MRMSrcMem: { + if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) + REX |= 1 << 2; + unsigned Bit = 0; + i = isTwoAddr ? 2 : 1; + for (; i != NumOps; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86InstrInfo::isX86_64ExtendedReg(MO)) + REX |= 1 << Bit; + Bit++; + } + } + break; + } + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + case X86II::MRMDestMem: { + unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands); + i = isTwoAddr ? 1 : 0; + if (NumOps > e && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e))) + REX |= 1 << 2; + unsigned Bit = 0; + for (; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86InstrInfo::isX86_64ExtendedReg(MO)) + REX |= 1 << Bit; + Bit++; + } + } + break; + } + default: { + if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) + REX |= 1 << 0; + i = isTwoAddr ? 
2 : 1; + for (unsigned e = NumOps; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (X86InstrInfo::isX86_64ExtendedReg(MO)) + REX |= 1 << 2; + } + break; + } + } + } + return REX; +} + + /// emitPCRelativeBlockAddress - This method keeps track of the information /// necessary to resolve the address of this block later and emits a dummy /// value. @@ -569,7 +666,7 @@ void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI, // Handle REX prefix. if (Is64BitMode) { - if (unsigned REX = X86InstrInfo::determineREX(MI)) + if (unsigned REX = determineREX(MI)) MCE.emitByte(0x40 | REX); } @@ -605,24 +702,29 @@ void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI, // base address. switch (Opcode) { default: - llvm_unreachable("psuedo instructions should be removed before code" + llvm_unreachable("pseudo instructions should be removed before code" " emission"); break; + // Do nothing for Int_MemBarrier - it's just a comment. Add a debug + // to make it slightly easier to see. + case X86::Int_MemBarrier: + DEBUG(dbgs() << "#MEMBARRIER\n"); + break; + case TargetOpcode::INLINEASM: // We allow inline assembler nodes with empty bodies - they can // implicitly define registers, which is ok for JIT. if (MI.getOperand(0).getSymbolName()[0]) report_fatal_error("JIT does not support inline asm!"); break; - case TargetOpcode::DBG_LABEL: + case TargetOpcode::PROLOG_LABEL: case TargetOpcode::GC_LABEL: case TargetOpcode::EH_LABEL: MCE.emitLabel(MI.getOperand(0).getMCSymbol()); break; - + case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: - case X86::FP_REG_KILL: break; case X86::MOVPC32r: { // This emits the "call" portion of this pseudo instruction. @@ -674,7 +776,8 @@ void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI, } assert(MO.isImm() && "Unknown RawFrm operand!"); - if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) { + if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32 || + Opcode == X86::WINCALL64pcrel32) { // Fix up immediate operand for pc relative calls. intptr_t Imm = (intptr_t)MO.getImm(); Imm = Imm - MCE.getCurrentPCValue() - 4; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index ce1370763b77f..0c70eec4827fb 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -960,9 +960,11 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - // Fold the common case of a conditional branch with a comparison. + // Fold the common case of a conditional branch with a comparison + // in the same block (values defined on other blocks may not have + // initialized registers). if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { - if (CI->hasOneUse()) { + if (CI->hasOneUse() && CI->getParent() == I->getParent()) { EVT VT = TLI.getValueType(CI->getOperand(0)->getType()); // Try to take advantage of fallthrough opportunities. @@ -1058,10 +1060,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { const MachineInstr &MI = *RI; if (MI.definesRegister(Reg)) { - unsigned Src, Dst, SrcSR, DstSR; - - if (getInstrInfo()->isMoveInstr(MI, Src, Dst, SrcSR, DstSR)) { - Reg = Src; + if (MI.isCopy()) { + Reg = MI.getOperand(1).getReg(); continue; } @@ -1648,15 +1648,26 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { MachineInstrBuilder MIB; if (CalleeOp) { // Register-indirect call. 
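The determineREX logic above packs four feature bits under a fixed 0100 high nibble, which is then emitted as 0x40 | REX: W (bit 3) selects 64-bit operand size, R (bit 2) extends ModRM.reg, X (bit 1) extends SIB.index, and B (bit 0) extends ModRM.rm, SIB.base, or the opcode register field. A standalone sketch of the layout:

#include <cstdio>

static unsigned rexPrefix(bool W, bool R, bool X, bool B) {
  return 0x40 | (W << 3) | (R << 2) | (X << 1) | unsigned(B);
}

int main() {
  // "movq %rax, %r8": REX.W for the 64-bit operation, REX.B because the
  // destination r8 is an extended register.
  std::printf("0x%02x\n", rexPrefix(true, false, false, true));  // prints 0x49
}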
- unsigned CallOpc = Subtarget->is64Bit() ? X86::CALL64r : X86::CALL32r; + unsigned CallOpc; + if (Subtarget->isTargetWin64()) + CallOpc = X86::WINCALL64r; + else if (Subtarget->is64Bit()) + CallOpc = X86::CALL64r; + else + CallOpc = X86::CALL32r; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) .addReg(CalleeOp); } else { // Direct call. assert(GV && "Not a direct call"); - unsigned CallOpc = - Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32; + unsigned CallOpc; + if (Subtarget->isTargetWin64()) + CallOpc = X86::WINCALL64pcrel32; + else if (Subtarget->is64Bit()) + CallOpc = X86::CALL64pcrel32; + else + CallOpc = X86::CALLpcrel32; // See if we need any target-specific flags on the GV operand. unsigned char OpFlags = 0; diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index cee4ad70201a7..e6ebf669587dd 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -8,23 +8,18 @@ //===----------------------------------------------------------------------===// // // This file defines the pass which converts floating point instructions from -// virtual registers into register stack instructions. This pass uses live +// pseudo registers into register stack instructions. This pass uses live // variable information to indicate where the FPn registers are used and their // lifetimes. // -// This pass is hampered by the lack of decent CFG manipulation routines for -// machine code. In particular, this wants to be able to split critical edges -// as necessary, traverse the machine basic block CFG in depth-first order, and -// allow there to be multiple machine basic blocks for each LLVM basicblock -// (needed for critical edge splitting). +// The x87 hardware tracks liveness of the stack registers, so it is necessary +// to implement exact liveness tracking between basic blocks. The CFG edges are +// partitioned into bundles where the same FP registers must be live in +// identical stack positions. Instructions are inserted at the end of each basic +// block to rearrange the live registers to match the outgoing bundle. // -// In particular, this pass currently barfs on critical edges. Because of this, -// it requires the instruction selector to insert FP_REG_KILL instructions on -// the exits of any basic block that has critical edges going from it, or which -// branch to a critical basic block. -// -// FIXME: this is not implemented yet. The stackifier pass only works on local -// basic blocks. +// This approach avoids splitting critical edges at the potential cost of more +// live register shuffling instructions when critical edges are present. // //===----------------------------------------------------------------------===// @@ -32,6 +27,7 @@ #include "X86.h" #include "X86InstrInfo.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -54,7 +50,12 @@ STATISTIC(NumFP , "Number of floating point instructions"); namespace { struct FPS : public MachineFunctionPass { static char ID; - FPS() : MachineFunctionPass(&ID) {} + FPS() : MachineFunctionPass(ID) { + // This is really only to keep valgrind quiet. + // The logic in isLive() is too much for it. 
+ memset(Stack, 0, sizeof(Stack)); + memset(RegMap, 0, sizeof(RegMap)); + } virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -69,11 +70,71 @@ namespace { private: const TargetInstrInfo *TII; // Machine instruction info. + + // Two CFG edges are related if they leave the same block, or enter the same + // block. The transitive closure of an edge under this relation is a + // LiveBundle. It represents a set of CFG edges where the live FP stack + // registers must be allocated identically in the x87 stack. + // + // A LiveBundle is usually all the edges leaving a block, or all the edges + // entering a block, but it can contain more edges if critical edges are + // present. + // + // The set of live FP registers in a LiveBundle is calculated by bundleCFG, + // but the exact mapping of FP registers to stack slots is fixed later. + struct LiveBundle { + // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c. + unsigned Mask; + + // Number of pre-assigned live registers in FixStack. This is 0 when the + // stack order has not yet been fixed. + unsigned FixCount; + + // Assigned stack order for live-in registers. + // FixStack[i] == getStackEntry(i) for all i < FixCount. + unsigned char FixStack[8]; + + LiveBundle(unsigned m = 0) : Mask(m), FixCount(0) {} + + // Have the live registers been assigned a stack order yet? + bool isFixed() const { return !Mask || FixCount; } + }; + + // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges + // with no live FP registers. + SmallVector<LiveBundle, 8> LiveBundles; + + // Map each MBB in the current function to an (ingoing, outgoing) index into + // LiveBundles. Blocks with no FP registers live in or out map to (0, 0) + // and are not actually stored in the map. + DenseMap<MachineBasicBlock*, std::pair<unsigned, unsigned> > BlockBundle; + + // Return a bitmask of FP registers in block's live-in list. + unsigned calcLiveInMask(MachineBasicBlock *MBB) { + unsigned Mask = 0; + for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), + E = MBB->livein_end(); I != E; ++I) { + unsigned Reg = *I - X86::FP0; + if (Reg < 8) + Mask |= 1 << Reg; + } + return Mask; + } + + // Partition all the CFG edges into LiveBundles. + void bundleCFG(MachineFunction &MF); + MachineBasicBlock *MBB; // Current basic block unsigned Stack[8]; // FP<n> Registers in each stack slot... unsigned RegMap[8]; // Track which stack slot contains each register unsigned StackTop; // The current top of the FP stack. + // Set up our stack model to match the incoming registers to MBB. + void setupBlockStack(); + + // Shuffle live registers to match the expectations of successor blocks. + void finishBlockStack(); + void dumpStack() const { dbgs() << "Stack contents:"; for (unsigned i = 0; i != StackTop; ++i) { @@ -82,27 +143,36 @@ namespace { } dbgs() << "\n"; } - private: - /// isStackEmpty - Return true if the FP stack is empty. - bool isStackEmpty() const { - return StackTop == 0; - } - - // getSlot - Return the stack slot number a particular register number is - // in. + + /// getSlot - Return the stack slot number a particular register number is + /// in. unsigned getSlot(unsigned RegNo) const { assert(RegNo < 8 && "Regno out of range!"); return RegMap[RegNo]; } - // getStackEntry - Return the X86::FP<n> register in register ST(i). + /// isLive - Is RegNo currently live in the stack? 
+ bool isLive(unsigned RegNo) const { + unsigned Slot = getSlot(RegNo); + return Slot < StackTop && Stack[Slot] == RegNo; + } + + /// getScratchReg - Return an FP register that is not currently in use. + unsigned getScratchReg() { + for (int i = 7; i >= 0; --i) + if (!isLive(i)) + return i; + llvm_unreachable("Ran out of scratch FP registers"); + } + + /// getStackEntry - Return the X86::FP<n> register in register ST(i). unsigned getStackEntry(unsigned STi) const { assert(STi < StackTop && "Access past stack top!"); return Stack[StackTop-1-STi]; } - // getSTReg - Return the X86::ST(i) register which contains the specified - // FP<RegNo> register. + /// getSTReg - Return the X86::ST(i) register which contains the specified + /// FP<RegNo> register. unsigned getSTReg(unsigned RegNo) const { return StackTop - 1 - getSlot(RegNo) + llvm::X86::ST0; } @@ -117,10 +187,9 @@ namespace { bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; } void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) { - MachineInstr *MI = I; - DebugLoc dl = MI->getDebugLoc(); + DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); if (isAtTop(RegNo)) return; - + unsigned STReg = getSTReg(RegNo); unsigned RegOnTop = getStackEntry(0); @@ -137,24 +206,37 @@ namespace { } void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) { - DebugLoc dl = I->getDebugLoc(); + DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); unsigned STReg = getSTReg(RegNo); pushReg(AsReg); // New register on top of stack BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg); } - // popStackAfter - Pop the current value off of the top of the FP stack - // after the specified instruction. + /// popStackAfter - Pop the current value off of the top of the FP stack + /// after the specified instruction. void popStackAfter(MachineBasicBlock::iterator &I); - // freeStackSlotAfter - Free the specified register from the register stack, - // so that it is no longer in a register. If the register is currently at - // the top of the stack, we just pop the current instruction, otherwise we - // store the current top-of-stack into the specified slot, then pop the top - // of stack. + /// freeStackSlotAfter - Free the specified register from the register + /// stack, so that it is no longer in a register. If the register is + /// currently at the top of the stack, we just pop the current instruction, + /// otherwise we store the current top-of-stack into the specified slot, + /// then pop the top of stack. void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg); + /// freeStackSlotBefore - Just the pop, no folding. Return the inserted + /// instruction. + MachineBasicBlock::iterator + freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo); + + /// Adjust the live registers to be the set in Mask. + void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I); + + /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is + /// st(0), FP reg FixStack[1] is st(1) etc. + void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount, + MachineBasicBlock::iterator I); + bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); void handleZeroArgFP(MachineBasicBlock::iterator &I); @@ -181,7 +263,6 @@ static unsigned getFPReg(const MachineOperand &MO) { return Reg - X86::FP0; } - /// runOnMachineFunction - Loop over all of the basic blocks, transforming FP /// register references into FP stack references.
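The Stack/RegMap pair above is a two-way map: Stack[] records which %FPi occupies each model slot (slot StackTop-1 being st(0)), and RegMap[] is its inverse, so a register is live exactly when the two arrays agree, which is what the new isLive() tests. A minimal standalone model of the same invariant:

#include <cstdio>

unsigned Stack[8], RegMap[8], StackTop = 0;   // zero-initialized globals

void pushReg(unsigned RegNo) {                // model a push of %FP<RegNo>
  Stack[StackTop] = RegNo;
  RegMap[RegNo] = StackTop++;
}

bool isLive(unsigned RegNo) {                 // live iff the two maps agree
  unsigned Slot = RegMap[RegNo];
  return Slot < StackTop && Stack[Slot] == RegNo;
}

unsigned getSTReg(unsigned RegNo) {           // which st(i) holds %FP<RegNo>
  return StackTop - 1 - RegMap[RegNo];
}

int main() {
  pushReg(3); pushReg(1);                     // now st(1)=%FP3, st(0)=%FP1
  std::printf("%d %d st(%u)\n", isLive(3), isLive(0), getSTReg(3));  // 1 0 st(1)
}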
/// @@ -201,6 +282,10 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { if (!FPIsUsed) return false; TII = MF.getTarget().getInstrInfo(); + + // Prepare cross-MBB liveness. + bundleCFG(MF); + StackTop = 0; // Process the function in depth first order so that we process at least one @@ -215,16 +300,111 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { Changed |= processBasicBlock(MF, **I); // Process any unreachable blocks in arbitrary order now. - if (MF.size() == Processed.size()) - return Changed; + if (MF.size() != Processed.size()) + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) + if (Processed.insert(BB)) + Changed |= processBasicBlock(MF, *BB); + + BlockBundle.clear(); + LiveBundles.clear(); - for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) - if (Processed.insert(BB)) - Changed |= processBasicBlock(MF, *BB); - return Changed; } +/// bundleCFG - Scan all the basic blocks to determine consistent live-in and +/// live-out sets for the FP registers. Consistent means that the set of +/// registers live-out from a block is identical to the live-in set of all +/// successors. This is not enforced by the normal live-in lists since +/// registers may be implicitly defined, or not used by all successors. +void FPS::bundleCFG(MachineFunction &MF) { + assert(LiveBundles.empty() && "Stale data in LiveBundles"); + assert(BlockBundle.empty() && "Stale data in BlockBundle"); + SmallPtrSet<MachineBasicBlock*, 8> PropDown, PropUp; + + // LiveBundle[0] is the empty live-in set. + LiveBundles.resize(1); + + // First gather the actual live-in masks for all MBBs. + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + MachineBasicBlock *MBB = I; + const unsigned Mask = calcLiveInMask(MBB); + if (!Mask) + continue; + // Ingoing bundle index. + unsigned &Idx = BlockBundle[MBB].first; + // Already assigned an ingoing bundle? + if (Idx) + continue; + // Allocate a new LiveBundle struct for this block's live-ins. + const unsigned BundleIdx = Idx = LiveBundles.size(); + DEBUG(dbgs() << "Creating LB#" << BundleIdx << ": in:BB#" + << MBB->getNumber()); + LiveBundles.push_back(Mask); + LiveBundle &Bundle = LiveBundles.back(); + + // Make sure all predecessors have the same live-out set. + PropUp.insert(MBB); + + // Keep pushing liveness up and down the CFG until convergence. + // Only critical edges cause iteration here, but when they do, multiple + // blocks can be assigned to the same LiveBundle index. + do { + // Assign BundleIdx as liveout from predecessors in PropUp. + for (SmallPtrSet<MachineBasicBlock*, 16>::iterator I = PropUp.begin(), + E = PropUp.end(); I != E; ++I) { + MachineBasicBlock *MBB = *I; + for (MachineBasicBlock::const_pred_iterator LinkI = MBB->pred_begin(), + LinkE = MBB->pred_end(); LinkI != LinkE; ++LinkI) { + MachineBasicBlock *PredMBB = *LinkI; + // PredMBB's liveout bundle should be set to LIIdx. + unsigned &Idx = BlockBundle[PredMBB].second; + if (Idx) { + assert(Idx == BundleIdx && "Inconsistent CFG"); + continue; + } + Idx = BundleIdx; + DEBUG(dbgs() << " out:BB#" << PredMBB->getNumber()); + // Propagate to siblings. + if (PredMBB->succ_size() > 1) + PropDown.insert(PredMBB); + } + } + PropUp.clear(); + + // Assign BundleIdx as livein to successors in PropDown. 
+ for (SmallPtrSet<MachineBasicBlock*, 16>::iterator I = PropDown.begin(), + E = PropDown.end(); I != E; ++I) { + MachineBasicBlock *MBB = *I; + for (MachineBasicBlock::const_succ_iterator LinkI = MBB->succ_begin(), + LinkE = MBB->succ_end(); LinkI != LinkE; ++LinkI) { + MachineBasicBlock *SuccMBB = *LinkI; + // LinkMBB's livein bundle should be set to BundleIdx. + unsigned &Idx = BlockBundle[SuccMBB].first; + if (Idx) { + assert(Idx == BundleIdx && "Inconsistent CFG"); + continue; + } + Idx = BundleIdx; + DEBUG(dbgs() << " in:BB#" << SuccMBB->getNumber()); + // Propagate to siblings. + if (SuccMBB->pred_size() > 1) + PropUp.insert(SuccMBB); + // Also accumulate the bundle liveness mask from the liveins here. + Bundle.Mask |= calcLiveInMask(SuccMBB); + } + } + PropDown.clear(); + } while (!PropUp.empty()); + DEBUG({ + dbgs() << " live:"; + for (unsigned i = 0; i < 8; ++i) + if (Bundle.Mask & (1<<i)) + dbgs() << " %FP" << i; + dbgs() << '\n'; + }); + } +} + /// processBasicBlock - Loop over all of the instructions in the basic block, /// transforming FP instructions into their stack form. /// @@ -232,10 +412,12 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { bool Changed = false; MBB = &BB; + setupBlockStack(); + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { MachineInstr *MI = I; uint64_t Flags = MI->getDesc().TSFlags; - + unsigned FPInstClass = Flags & X86II::FPTypeMask; if (MI->isInlineAsm()) FPInstClass = X86II::SpecialFP; @@ -302,10 +484,82 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { Changed = true; } - assert(isStackEmpty() && "Stack not empty at end of basic block?"); + finishBlockStack(); + return Changed; } +/// setupBlockStack - Use the BlockBundle map to set up our model of the stack +/// to match predecessors' live out stack. +void FPS::setupBlockStack() { + DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber() + << " derived from " << MBB->getName() << ".\n"); + StackTop = 0; + const LiveBundle &Bundle = LiveBundles[BlockBundle.lookup(MBB).first]; + + if (!Bundle.Mask) { + DEBUG(dbgs() << "Block has no FP live-ins.\n"); + return; + } + + // Depth-first iteration should ensure that we always have an assigned stack. + assert(Bundle.isFixed() && "Reached block before any predecessors"); + + // Push the fixed live-in registers. + for (unsigned i = Bundle.FixCount; i > 0; --i) { + MBB->addLiveIn(X86::ST0+i-1); + DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP" + << unsigned(Bundle.FixStack[i-1]) << '\n'); + pushReg(Bundle.FixStack[i-1]); + } + + // Kill off unwanted live-ins. This can happen with a critical edge. + // FIXME: We could keep these live registers around as zombies. They may need + // to be revived at the end of a short block. It might save a few instrs. + adjustLiveRegs(calcLiveInMask(MBB), MBB->begin()); + DEBUG(MBB->dump()); +} + +/// finishBlockStack - Revive live-outs that are implicitly defined out of +/// MBB. Shuffle live registers to match the expected fixed stack of any +/// predecessors, and ensure that all predecessors are expecting the same +/// stack. +void FPS::finishBlockStack() { + // The RET handling below takes care of return blocks for us. 
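Before the body of finishBlockStack continues below, it helps to restate the record it manipulates. LiveBundle is declared earlier in this file, outside this hunk; the sketch here is reconstructed from its uses in setupBlockStack and finishBlockStack, so take the exact field types as assumptions rather than quoted source:

    // Reconstructed sketch of the per-bundle record (not the verbatim decl).
    struct LiveBundle {
      unsigned Mask;             // Bit i set => %FPi is live across the edge.
      unsigned FixCount;         // How many stack slots have a fixed order.
      unsigned char FixStack[8]; // FixStack[i] = FP register expected in st(i).

      LiveBundle(unsigned m = 0) : Mask(m), FixCount(0) {}

      // The first block to reach finishBlockStack fixes the order for the
      // whole bundle; until then FixCount is 0 even when Mask is nonzero.
      bool isFixed() const { return !Mask || FixCount; }
    };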
+ if (MBB->succ_empty()) + return; + + DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber() + << " derived from " << MBB->getName() << ".\n"); + + unsigned BundleIdx = BlockBundle.lookup(MBB).second; + LiveBundle &Bundle = LiveBundles[BundleIdx]; + + // We may need to kill and define some registers to match successors. + // FIXME: This can probably be combined with the shuffle below. + MachineBasicBlock::iterator Term = MBB->getFirstTerminator(); + adjustLiveRegs(Bundle.Mask, Term); + + if (!Bundle.Mask) { + DEBUG(dbgs() << "No live-outs.\n"); + return; + } + + // Has the stack order been fixed yet? + DEBUG(dbgs() << "LB#" << BundleIdx << ": "); + if (Bundle.isFixed()) { + DEBUG(dbgs() << "Shuffling stack to match.\n"); + shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term); + } else { + // Not fixed yet, we get to choose. + DEBUG(dbgs() << "Fixing stack order now.\n"); + Bundle.FixCount = StackTop; + for (unsigned i = 0; i < StackTop; ++i) + Bundle.FixStack[i] = getStackEntry(i); + } +} + + //===----------------------------------------------------------------------===// // Efficient Lookup Table Support //===----------------------------------------------------------------------===// @@ -318,7 +572,7 @@ namespace { friend bool operator<(const TableEntry &TE, unsigned V) { return TE.from < V; } - friend bool operator<(unsigned V, const TableEntry &TE) { + friend bool ATTRIBUTE_USED operator<(unsigned V, const TableEntry &TE) { return V < TE.from; } }; @@ -597,6 +851,13 @@ void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) { // Otherwise, store the top of stack into the dead slot, killing the operand // without having to add in an explicit xchg then pop. // + I = freeStackSlotBefore(++I, FPRegNo); +} + +/// freeStackSlotBefore - Free the specified register without trying any +/// folding. +MachineBasicBlock::iterator +FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) { unsigned STReg = getSTReg(FPRegNo); unsigned OldSlot = getSlot(FPRegNo); unsigned TopReg = Stack[StackTop-1]; @@ -604,9 +865,90 @@ void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) { RegMap[TopReg] = OldSlot; RegMap[FPRegNo] = ~0; Stack[--StackTop] = ~0; - MachineInstr *MI = I; - DebugLoc dl = MI->getDebugLoc(); - I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(STReg); + return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)).addReg(STReg); +} + +/// adjustLiveRegs - Kill and revive registers such that exactly the FP +/// registers with a bit in Mask are live. +void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { + unsigned Defs = Mask; + unsigned Kills = 0; + for (unsigned i = 0; i < StackTop; ++i) { + unsigned RegNo = Stack[i]; + if (!(Defs & (1 << RegNo))) + // This register is live, but we don't want it. + Kills |= (1 << RegNo); + else + // We don't need to imp-def this live register. + Defs &= ~(1 << RegNo); + } + assert((Kills & Defs) == 0 && "Register needs killing and def'ing?"); + + // Produce implicit-defs for free by using killed registers. + while (Kills && Defs) { + unsigned KReg = CountTrailingZeros_32(Kills); + unsigned DReg = CountTrailingZeros_32(Defs); + DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n"); + std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]); + std::swap(RegMap[KReg], RegMap[DReg]); + Kills &= ~(1 << KReg); + Defs &= ~(1 << DReg); + } + + // Kill registers by popping. 
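A note on the pairing loop just above, before the popping code that follows: every (kill, def) pair it retires costs no instructions at all, because the pass merely renames its own bookkeeping, reinterpreting a register that was about to die as the one that needed an implicit def. Only the leftovers need real pops or LD_F0 loads. A standalone sketch of the mask arithmetic (using a GCC/Clang builtin in place of CountTrailingZeros_32):

    #include <cassert>

    int main() {
      unsigned Kills = 0x5;  // %FP0, %FP2: live but unwanted.
      unsigned Defs  = 0xA;  // %FP1, %FP3: wanted but currently dead.
      unsigned Renames = 0;
      while (Kills && Defs) {
        unsigned KReg = __builtin_ctz(Kills);  // lowest set bit of each mask
        unsigned DReg = __builtin_ctz(Defs);
        Kills &= ~(1u << KReg);
        Defs  &= ~(1u << DReg);
        ++Renames;  // e.g. %FP0 renamed as imp-def %FP1, %FP2 as %FP3
      }
      // Everything paired up: no pops and no zero-loads required here.
      assert(Renames == 2 && Kills == 0 && Defs == 0);
    }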
+ if (Kills && I != MBB->begin()) { + MachineBasicBlock::iterator I2 = llvm::prior(I); + for (;;) { + unsigned KReg = getStackEntry(0); + if (!(Kills & (1 << KReg))) + break; + DEBUG(dbgs() << "Popping %FP" << KReg << "\n"); + popStackAfter(I2); + Kills &= ~(1 << KReg); + } + } + + // Manually kill the rest. + while (Kills) { + unsigned KReg = CountTrailingZeros_32(Kills); + DEBUG(dbgs() << "Killing %FP" << KReg << "\n"); + freeStackSlotBefore(I, KReg); + Kills &= ~(1 << KReg); + } + + // Load zeros for all the imp-defs. + while(Defs) { + unsigned DReg = CountTrailingZeros_32(Defs); + DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n"); + BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0)); + pushReg(DReg); + Defs &= ~(1 << DReg); + } + + // Now we should have the correct registers live. + DEBUG(dumpStack()); + assert(StackTop == CountPopulation_32(Mask) && "Live count mismatch"); +} + +/// shuffleStackTop - emit fxch instructions before I to shuffle the top +/// FixCount entries into the order given by FixStack. +/// FIXME: Is there a better algorithm than insertion sort? +void FPS::shuffleStackTop(const unsigned char *FixStack, + unsigned FixCount, + MachineBasicBlock::iterator I) { + // Move items into place, starting from the desired stack bottom. + while (FixCount--) { + // Old register at position FixCount. + unsigned OldReg = getStackEntry(FixCount); + // Desired register at position FixCount. + unsigned Reg = FixStack[FixCount]; + if (Reg == OldReg) + continue; + // (Reg st0) (OldReg st0) = (Reg OldReg st0) + moveToTop(Reg, I); + moveToTop(OldReg, I); + } + DEBUG(dumpStack()); } @@ -660,7 +1002,7 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { MI->getOpcode() == X86::ISTT_Fp32m80 || MI->getOpcode() == X86::ISTT_Fp64m80 || MI->getOpcode() == X86::ST_FpP80m)) { - duplicateToTop(Reg, 7 /*temp register*/, I); + duplicateToTop(Reg, getScratchReg(), I); } else { moveToTop(Reg, I); // Move to the top of the stack... } @@ -1013,8 +1355,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { if (!MI->killsRegister(X86::FP0 + Op0)) { // Duplicate Op0 into a temporary on the stack top. - // This actually assumes that FP7 is dead. - duplicateToTop(Op0, 7, I); + duplicateToTop(Op0, getScratchReg(), I); } else { // Op0 is killed, so just swap it into position. moveToTop(Op0, I); @@ -1034,8 +1375,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { ++StackTop; unsigned RegOnTop = getStackEntry(0); // This reg must remain in st(0). if (!MI->killsRegister(X86::FP0 + Op0)) { - // Assume FP6 is not live, use it as a scratch register. - duplicateToTop(Op0, 6, I); + duplicateToTop(Op0, getScratchReg(), I); moveToTop(RegOnTop, I); } else if (getSTReg(Op0) != X86::ST1) { // We have the wrong value at st(1). Shuffle! Untested! @@ -1119,11 +1459,11 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { case X86::RETI: // If RET has an FP register use operand, pass the first one in ST(0) and // the second one in ST(1). - if (isStackEmpty()) return; // Quick check to see if any are possible. - + // Find the register operands. 
unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U; - + unsigned LiveMask = 0; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &Op = MI->getOperand(i); if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) @@ -1142,12 +1482,18 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { assert(SecondFPRegOp == ~0U && "More than two fp operands!"); SecondFPRegOp = getFPReg(Op); } + LiveMask |= (1 << getFPReg(Op)); // Remove the operand so that later passes don't see it. MI->RemoveOperand(i); --i, --e; } - + + // We may have been carrying spurious live-ins, so make sure only the returned + // registers are left live. + adjustLiveRegs(LiveMask, MI); + if (!LiveMask) return; // Quick check to see if any are possible. + // There are only four possibilities here: // 1) we are returning a single FP value. In this case, it has to be in // ST(0) already, so just declare success by removing the value from the @@ -1173,7 +1519,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // Duplicate the TOS so that we return it twice. Just pick some other FPx // register to hold it. - unsigned NewReg = (FirstFPRegOp+1)%7; + unsigned NewReg = getScratchReg(); duplicateToTop(FirstFPRegOp, NewReg, MI); FirstFPRegOp = NewReg; } @@ -1197,7 +1543,14 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { } I = MBB->erase(I); // Remove the pseudo instruction - --I; + + // We want to leave I pointing to the previous instruction, but what if we + // just erased the first instruction? + if (I == MBB->begin()) { + DEBUG(dbgs() << "Inserting dummy KILL\n"); + I = BuildMI(*MBB, I, DebugLoc(), TII->get(TargetOpcode::KILL)); + } else + --I; } // Translate a COPY instruction to a pseudo-op that handleSpecialFP understands. diff --git a/lib/Target/X86/X86FloatingPointRegKill.cpp b/lib/Target/X86/X86FloatingPointRegKill.cpp deleted file mode 100644 index 2c98b96c510b9..0000000000000 --- a/lib/Target/X86/X86FloatingPointRegKill.cpp +++ /dev/null @@ -1,153 +0,0 @@ -//===-- X86FloatingPoint.cpp - FP_REG_KILL inserter -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the pass which inserts FP_REG_KILL instructions. 
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "x86-codegen" -#include "X86.h" -#include "X86InstrInfo.h" -#include "llvm/Instructions.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/CFG.h" -#include "llvm/ADT/Statistic.h" -using namespace llvm; - -STATISTIC(NumFPKill, "Number of FP_REG_KILL instructions added"); - -namespace { - struct FPRegKiller : public MachineFunctionPass { - static char ID; - FPRegKiller() : MachineFunctionPass(&ID) {} - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addPreservedID(MachineLoopInfoID); - AU.addPreservedID(MachineDominatorsID); - MachineFunctionPass::getAnalysisUsage(AU); - } - - virtual bool runOnMachineFunction(MachineFunction &MF); - - virtual const char *getPassName() const { - return "X86 FP_REG_KILL inserter"; - } - }; - char FPRegKiller::ID = 0; -} - -FunctionPass *llvm::createX87FPRegKillInserterPass() { - return new FPRegKiller(); -} - -/// isFPStackVReg - Return true if the specified vreg is from a fp stack -/// register class. -static bool isFPStackVReg(unsigned RegNo, const MachineRegisterInfo &MRI) { - if (!TargetRegisterInfo::isVirtualRegister(RegNo)) - return false; - - switch (MRI.getRegClass(RegNo)->getID()) { - default: return false; - case X86::RFP32RegClassID: - case X86::RFP64RegClassID: - case X86::RFP80RegClassID: - return true; - } -} - - -/// ContainsFPStackCode - Return true if the specific MBB has floating point -/// stack code, and thus needs an FP_REG_KILL. -static bool ContainsFPStackCode(MachineBasicBlock *MBB, - const MachineRegisterInfo &MRI) { - // Scan the block, looking for instructions that define or use fp stack vregs. - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I) { - for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) { - if (!I->getOperand(op).isReg()) - continue; - if (unsigned Reg = I->getOperand(op).getReg()) - if (isFPStackVReg(Reg, MRI)) - return true; - } - } - - // Check PHI nodes in successor blocks. These PHI's will be lowered to have - // a copy of the input value in this block, which is a definition of the - // value. - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - E = MBB->succ_end(); SI != E; ++ SI) { - MachineBasicBlock *SuccBB = *SI; - for (MachineBasicBlock::iterator I = SuccBB->begin(), E = SuccBB->end(); - I != E; ++I) { - // All PHI nodes are at the top of the block. - if (!I->isPHI()) break; - - if (isFPStackVReg(I->getOperand(0).getReg(), MRI)) - return true; - } - } - - return false; -} - -bool FPRegKiller::runOnMachineFunction(MachineFunction &MF) { - // If we are emitting FP stack code, scan the basic block to determine if this - // block defines or uses any FP values. If so, put an FP_REG_KILL instruction - // before the terminator of the block. - - // Note that FP stack instructions are used in all modes for long double, - // so we always need to do this check. - // Also note that it's possible for an FP stack register to be live across - // an instruction that produces multiple basic blocks (SSE CMOV) so we - // must check all the generated basic blocks. - - // Scan all of the machine instructions in these MBBs, checking for FP - // stores. 
(RFP32 and RFP64 will not exist in SSE mode, but RFP80 might.) - - // Fast-path: If nothing is using the x87 registers, we don't need to do - // any scanning. - const MachineRegisterInfo &MRI = MF.getRegInfo(); - if (MRI.getRegClassVirtRegs(X86::RFP80RegisterClass).empty() && - MRI.getRegClassVirtRegs(X86::RFP64RegisterClass).empty() && - MRI.getRegClassVirtRegs(X86::RFP32RegisterClass).empty()) - return false; - - bool Changed = false; - MachineFunction::iterator MBBI = MF.begin(); - MachineFunction::iterator EndMBB = MF.end(); - for (; MBBI != EndMBB; ++MBBI) { - MachineBasicBlock *MBB = MBBI; - - // If this block returns, ignore it. We don't want to insert an FP_REG_KILL - // before the return. - if (!MBB->empty()) { - MachineBasicBlock::iterator EndI = MBB->end(); - --EndI; - if (EndI->getDesc().isReturn()) - continue; - } - - // If we find any FP stack code, emit the FP_REG_KILL instruction. - if (ContainsFPStackCode(MBB, MRI)) { - BuildMI(*MBB, MBBI->getFirstTerminator(), DebugLoc(), - MF.getTarget().getInstrInfo()->get(X86::FP_REG_KILL)); - ++NumFPKill; - Changed = true; - } - } - - return Changed; -} diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 72f2bc11d7cc7..c5234413aba63 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -171,6 +171,17 @@ namespace { virtual void PreprocessISelDAG(); + inline bool immSext8(SDNode *N) const { + return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); + } + + // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit + // sign extended field. + inline bool i64immSExt32(SDNode *N) const { + uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); + return (int64_t)v == (int32_t)v; + } + // Include the pieces autogenerated from the target description. 
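The #include that follows pulls in the TableGen-generated matcher; before it, a note on the two predicates defined just above (hoisted out of the generated Predicate_* functions). The i64immSExt32 check is a cast round-trip: the right-hand cast truncates v to 32 bits and the comparison sign-extends it back to 64, which is the identity exactly when v is representable as a sign-extended 32-bit field.

    #include <cassert>
    #include <stdint.h>

    static bool i64immSExt32(uint64_t v) { return (int64_t)v == (int32_t)v; }

    int main() {
      assert(i64immSExt32(UINT64_C(0x000000007FFFFFFF)));  // INT32_MAX: fits
      assert(i64immSExt32(UINT64_C(0xFFFFFFFF80000000)));  // INT32_MIN: fits
      assert(!i64immSExt32(UINT64_C(0x0000000080000000))); // 2^31: does not
    }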
#include "X86GenDAGISel.inc" @@ -1312,13 +1323,6 @@ SDNode *X86DAGToDAGISel::getGlobalBaseReg() { return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode(); } -static SDNode *FindCallStartFromCall(SDNode *Node) { - if (Node->getOpcode() == ISD::CALLSEQ_START) return Node; - assert(Node->getOperand(0).getValueType() == MVT::Other && - "Node doesn't have a token chain argument!"); - return FindCallStartFromCall(Node->getOperand(0).getNode()); -} - SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { SDValue Chain = Node->getOperand(0); SDValue In1 = Node->getOperand(1); @@ -1403,7 +1407,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { Opc = X86::LOCK_DEC16m; else if (isSub) { if (isCN) { - if (Predicate_immSext8(Val.getNode())) + if (immSext8(Val.getNode())) Opc = X86::LOCK_SUB16mi8; else Opc = X86::LOCK_SUB16mi; @@ -1411,7 +1415,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { Opc = X86::LOCK_SUB16mr; } else { if (isCN) { - if (Predicate_immSext8(Val.getNode())) + if (immSext8(Val.getNode())) Opc = X86::LOCK_ADD16mi8; else Opc = X86::LOCK_ADD16mi; @@ -1426,7 +1430,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { Opc = X86::LOCK_DEC32m; else if (isSub) { if (isCN) { - if (Predicate_immSext8(Val.getNode())) + if (immSext8(Val.getNode())) Opc = X86::LOCK_SUB32mi8; else Opc = X86::LOCK_SUB32mi; @@ -1434,7 +1438,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { Opc = X86::LOCK_SUB32mr; } else { if (isCN) { - if (Predicate_immSext8(Val.getNode())) + if (immSext8(Val.getNode())) Opc = X86::LOCK_ADD32mi8; else Opc = X86::LOCK_ADD32mi; @@ -1450,17 +1454,17 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { else if (isSub) { Opc = X86::LOCK_SUB64mr; if (isCN) { - if (Predicate_immSext8(Val.getNode())) + if (immSext8(Val.getNode())) Opc = X86::LOCK_SUB64mi8; - else if (Predicate_i64immSExt32(Val.getNode())) + else if (i64immSExt32(Val.getNode())) Opc = X86::LOCK_SUB64mi32; } } else { Opc = X86::LOCK_ADD64mr; if (isCN) { - if (Predicate_immSext8(Val.getNode())) + if (immSext8(Val.getNode())) Opc = X86::LOCK_ADD64mi8; - else if (Predicate_i64immSExt32(Val.getNode())) + else if (i64immSExt32(Val.getNode())) Opc = X86::LOCK_ADD64mi32; } } @@ -1841,7 +1845,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to // use a smaller encoding. - if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) + if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && + HasNoSignedComparisonUses(Node)) // Look past the truncate if CMP is the only use of it. N0 = N0.getOperand(0); if (N0.getNode()->getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b3c48862898ff..95dbb61766874 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16,6 +16,7 @@ #include "X86.h" #include "X86InstrBuilder.h" #include "X86ISelLowering.h" +#include "X86ShuffleDecode.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/CallingConv.h" @@ -343,8 +344,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->hasSSE1()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); - if (!Subtarget->hasSSE2()) - setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand); + // We may not have a libcall for MEMBARRIER so we should lower this. 
+ setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); + // On X86 and X86-64, atomic operations are lowered to locked instructions. // Locked instructions, in turn, have implicit fence semantics (all memory // operations are flushed before issuing the locked instruction, and they @@ -837,6 +839,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); + // Can turn SHL into an integer multiply. + setOperationAction(ISD::SHL, MVT::v4i32, Custom); + setOperationAction(ISD::SHL, MVT::v16i8, Custom); + // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. f32 vectors are // custom since the immediate controlling the insert encodes additional @@ -866,6 +872,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); + addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); setOperationAction(ISD::LOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v8i32, Legal); @@ -877,7 +884,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); - //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom); //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom); //setOperationAction(ISD::SELECT, MVT::v8f32, Custom); @@ -1189,6 +1196,50 @@ unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; } +std::pair<const TargetRegisterClass*, uint8_t> +X86TargetLowering::findRepresentativeClass(EVT VT) const{ + const TargetRegisterClass *RRC = 0; + uint8_t Cost = 1; + switch (VT.getSimpleVT().SimpleTy) { + default: + return TargetLowering::findRepresentativeClass(VT); + case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: + RRC = (Subtarget->is64Bit() + ? X86::GR64RegisterClass : X86::GR32RegisterClass); + break; + case MVT::v8i8: case MVT::v4i16: + case MVT::v2i32: case MVT::v1i64: + RRC = X86::VR64RegisterClass; + break; + case MVT::f32: case MVT::f64: + case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: + case MVT::v4f32: case MVT::v2f64: + case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: + case MVT::v4f64: + RRC = X86::VR128RegisterClass; + break; + } + return std::make_pair(RRC, Cost); +} + +unsigned +X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0; + switch (RC->getID()) { + default: + return 0; + case X86::GR32RegClassID: + return 4 - FPDiff; + case X86::GR64RegClassID: + return 8 - FPDiff; + case X86::VR128RegClassID: + return Subtarget->is64Bit() ? 
10 : 4; + case X86::VR64RegClassID: + return 4; + } +} + bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const { if (!Subtarget->isTargetLinux()) @@ -1259,6 +1310,19 @@ X86TargetLowering::LowerReturn(SDValue Chain, CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue ValToCopy = OutVals[i]; + EVT ValVT = ValToCopy.getValueType(); + + // If this is x86-64, and we disabled SSE, we can't return FP values + if ((ValVT == MVT::f32 || ValVT == MVT::f64) && + (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { + report_fatal_error("SSE register return with SSE disabled"); + } + // Likewise we can't return F64 values with SSE1 only. gcc does so, but + // llvm-gcc has never done it right and no one has noticed, so this + // should be OK for now. + if (ValVT == MVT::f64 && + (Subtarget->is64Bit() && !Subtarget->hasSSE2())) + report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. @@ -1276,14 +1340,20 @@ X86TargetLowering::LowerReturn(SDValue Chain, // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. if (Subtarget->is64Bit()) { - EVT ValVT = ValToCopy.getValueType(); if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); - if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) - ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); + if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { + ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + ValToCopy); + + // If we don't have SSE2 available, convert to v4f32 so the generated + // register is legal. + if (!Subtarget->hasSSE2()) + ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,ValToCopy); + } } } - + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); Flag = Chain.getValue(1); } @@ -1570,6 +1640,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = X86::FR32RegisterClass; else if (RegVT == MVT::f64) RC = X86::FR64RegisterClass; + else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) + RC = X86::VR256RegisterClass; else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) RC = X86::VR128RegisterClass; else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) @@ -1937,6 +2009,19 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (isVarArg && Subtarget->isTargetWin64()) { + // Win64 ABI requires argument XMM reg to be copied to the corresponding + // shadow reg if callee is a varargs function. 
+        unsigned ShadowReg = 0;
+        switch (VA.getLocReg()) {
+        case X86::XMM0: ShadowReg = X86::RCX; break;
+        case X86::XMM1: ShadowReg = X86::RDX; break;
+        case X86::XMM2: ShadowReg = X86::R8; break;
+        case X86::XMM3: ShadowReg = X86::R9; break;
+        }
+        if (ShadowReg)
+          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
+      }
     } else if (!IsSibcall && (!isTailCall || isByVal)) {
       assert(VA.isMemLoc());
       if (StackPtr.getNode() == 0)
@@ -1990,7 +2075,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     }
   }

-  if (Is64Bit && isVarArg) {
+  if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) {
     // From AMD64 ABI document:
     // For calls that may call functions that use varargs or stdargs
     // (prototype-less calls or calls to functions containing ellipsis (...) in
     // the declaration) %al is used as hidden argument to specify the number
     // of SSE registers used. The contents of %al do not need to match exactly
     // the number of registers, but must be an upper bound on the number of SSE
     // registers used and is in the range 0 - 8 inclusive.

-    // FIXME: Verify this on Win64
     // Count the number of XMM registers allocated.
     static const unsigned XMMArgRegs[] = {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
@@ -2165,8 +2249,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   if (!isTailCall && Subtarget->isPICStyleGOT())
     Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));

-  // Add an implicit use of AL for x86 vararg functions.
-  if (Is64Bit && isVarArg)
+  // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
+  if (Is64Bit && isVarArg && !Subtarget->isTargetWin64())
     Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));

   if (InFlag.getNode())
@@ -2356,8 +2440,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   if (RegInfo->needsStackRealignment(MF))
     return false;

-  // Do not sibcall optimize vararg calls unless the call site is not passing any
-  // arguments.
+  // Do not sibcall optimize vararg calls unless the call site is not passing
+  // any arguments.
if (isVarArg && !Outs.empty()) return false; @@ -2493,6 +2577,112 @@ X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { // Other Lowering Hooks //===----------------------------------------------------------------------===// +static bool MayFoldLoad(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); +} + +static bool MayFoldIntoStore(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); +} + +static bool isTargetShuffle(unsigned Opcode) { + switch(Opcode) { + default: return false; + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFPD: + case X86ISD::SHUFPS: + case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::UNPCKLPS: + case X86ISD::UNPCKLPD: + case X86ISD::PUNPCKLWD: + case X86ISD::PUNPCKLBW: + case X86ISD::PUNPCKLDQ: + case X86ISD::PUNPCKLQDQ: + case X86ISD::UNPCKHPS: + case X86ISD::UNPCKHPD: + case X86ISD::PUNPCKHWD: + case X86ISD::PUNPCKHBW: + case X86ISD::PUNPCKHDQ: + case X86ISD::PUNPCKHQDQ: + return true; + } + return false; +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + return DAG.getNode(Opc, dl, VT, V1); + } + + return SDValue(); +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); + } + + return SDValue(); +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::SHUFPD: + case X86ISD::SHUFPS: + return DAG.getNode(Opc, dl, VT, V1, V2, + DAG.getConstant(TargetMask, MVT::i8)); + } + return SDValue(); +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::UNPCKLPS: + case X86ISD::UNPCKLPD: + case X86ISD::PUNPCKLWD: + case X86ISD::PUNPCKLBW: + case X86ISD::PUNPCKLDQ: + case X86ISD::PUNPCKLQDQ: + case X86ISD::UNPCKHPS: + case X86ISD::UNPCKHPD: + case X86ISD::PUNPCKHWD: + case X86ISD::PUNPCKHBW: + case X86ISD::PUNPCKHDQ: + case X86ISD::PUNPCKHQDQ: + return DAG.getNode(Opc, dl, VT, V1, V2); + } + return SDValue(); +} SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -3347,18 +3537,27 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest - // type. This ensures they get CSE'd. + // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted + // to their dest type. 
This ensures they get CSE'd. SDValue Vec; if (VT.getSizeInBits() == 64) { // MMX SDValue Cst = DAG.getTargetConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); - } else if (HasSSE2) { // SSE2 - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - } else { // SSE1 + } else if (VT.getSizeInBits() == 128) { + if (HasSSE2) { // SSE2 + SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + } else { // SSE1 + SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); + } + } else if (VT.getSizeInBits() == 256) { // AVX + // 256-bit logic and arithmetic instructions in AVX are + // all floating-point, no support for integer ops. Default + // to emitting fp zeroed vectors then. SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); } return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); } @@ -3372,9 +3571,9 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { // type. This ensures they get CSE'd. SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); SDValue Vec; - if (VT.getSizeInBits() == 64) // MMX + if (VT.getSizeInBits() == 64) // MMX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); - else // SSE + else // SSE Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); } @@ -3439,9 +3638,8 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32. -static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG, - bool HasSSE2) { +/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. +static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { if (SV->getValueType(0).getVectorNumElements() <= 4) return SDValue(SV, 0); @@ -3488,68 +3686,253 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); } -/// getNumOfConsecutiveZeros - Return the number of elements in a result of -/// a shuffle that is zero. -static -unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, - bool Low, SelectionDAG &DAG) { - unsigned NumZeros = 0; - for (int i = 0; i < NumElems; ++i) { - unsigned Index = Low ? i : NumElems-i-1; - int Idx = SVOp->getMaskElt(Index); - if (Idx < 0) { - ++NumZeros; - continue; - } - SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); - if (Elt.getNode() && X86::isZeroNode(Elt)) - ++NumZeros; - else +/// getShuffleScalarElt - Returns the scalar element that will make up the ith +/// element of the result of the vector shuffle. +SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, + unsigned Depth) { + if (Depth == 6) + return SDValue(); // Limit search depth. + + SDValue V = SDValue(N, 0); + EVT VT = V.getValueType(); + unsigned Opcode = V.getOpcode(); + + // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 
+  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
+    Index = SV->getMaskElt(Index);
+
+    if (Index < 0)
+      return DAG.getUNDEF(VT.getVectorElementType());
+
+    int NumElems = VT.getVectorNumElements();
+    SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1);
+    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1);
+  }
+
+  // Recurse into target specific vector shuffles to find scalars.
+  if (isTargetShuffle(Opcode)) {
+    int NumElems = VT.getVectorNumElements();
+    SmallVector<unsigned, 16> ShuffleMask;
+    SDValue ImmN;
+
+    switch(Opcode) {
+    case X86ISD::SHUFPS:
+    case X86ISD::SHUFPD:
+      ImmN = N->getOperand(N->getNumOperands()-1);
+      DecodeSHUFPSMask(NumElems,
+                       cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                       ShuffleMask);
+      break;
+    case X86ISD::PUNPCKHBW:
+    case X86ISD::PUNPCKHWD:
+    case X86ISD::PUNPCKHDQ:
+    case X86ISD::PUNPCKHQDQ:
+      DecodePUNPCKHMask(NumElems, ShuffleMask);
+      break;
+    case X86ISD::UNPCKHPS:
+    case X86ISD::UNPCKHPD:
+      DecodeUNPCKHPMask(NumElems, ShuffleMask);
+      break;
+    case X86ISD::PUNPCKLBW:
+    case X86ISD::PUNPCKLWD:
+    case X86ISD::PUNPCKLDQ:
+    case X86ISD::PUNPCKLQDQ:
+      DecodePUNPCKLMask(NumElems, ShuffleMask);
+      break;
+    case X86ISD::UNPCKLPS:
+    case X86ISD::UNPCKLPD:
+      DecodeUNPCKLPMask(NumElems, ShuffleMask);
+      break;
+    case X86ISD::MOVHLPS:
+      DecodeMOVHLPSMask(NumElems, ShuffleMask);
+      break;
+    case X86ISD::MOVLHPS:
+      DecodeMOVLHPSMask(NumElems, ShuffleMask);
+      break;
+    case X86ISD::PSHUFD:
+      ImmN = N->getOperand(N->getNumOperands()-1);
+      DecodePSHUFMask(NumElems,
+                      cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                      ShuffleMask);
+      break;
+    case X86ISD::PSHUFHW:
+      ImmN = N->getOperand(N->getNumOperands()-1);
+      DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                        ShuffleMask);
+      break;
+    case X86ISD::PSHUFLW:
+      ImmN = N->getOperand(N->getNumOperands()-1);
+      DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                        ShuffleMask);
+      break;
+    case X86ISD::MOVSS:
+    case X86ISD::MOVSD: {
+      // The index 0 always comes from the first element of the second source,
+      // this is why MOVSS and MOVSD are used in the first place. The other
+      // elements come from the other positions of the first source vector.
+      unsigned OpNum = (Index == 0) ? 1 : 0;
+      return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
+                                 Depth+1);
+    }
+    default:
+      assert(0 && "not implemented for target shuffle node");
+      return SDValue();
+    }
+
+    Index = ShuffleMask[Index];
+    if (Index < 0)
+      return DAG.getUNDEF(VT.getVectorElementType());
+
+    SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
+    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
+                               Depth+1);
   }
-  return NumZeros;
-}

-/// isVectorShift - Returns true if the shuffle can be implemented as a
-/// logical left or right shift of a vector.
-/// FIXME: split into pslldqi, psrldqi, palignr variants.
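The Decode* helpers referenced above live in the new X86ShuffleDecode.h pulled in at the top of this file. For the immediate-driven shuffles the decode is plain bit slicing: each result lane takes a 2-bit source-lane selector from the 8-bit immediate. A sketch of the 4-lane PSHUFD case, mirroring what DecodePSHUFMask is expected to compute (the sketch's function name is invented for illustration):

    #include <cassert>
    #include <vector>

    // Two bits of the immediate per result element, low bits first.
    static std::vector<unsigned> decodePShufD4(unsigned Imm) {
      std::vector<unsigned> Mask;
      for (unsigned i = 0; i != 4; ++i)
        Mask.push_back((Imm >> (2 * i)) & 0x3);
      return Mask;
    }

    int main() {
      // 0x1B == 0b00011011 selects <3,2,1,0>, i.e. a full lane reversal.
      std::vector<unsigned> M = decodePShufD4(0x1B);
      assert(M[0] == 3 && M[1] == 2 && M[2] == 1 && M[3] == 0);
    }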
-static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
-                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
-  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+  // Actual nodes that may contain scalar elements
+  if (Opcode == ISD::BIT_CONVERT) {
+    V = V.getOperand(0);
+    EVT SrcVT = V.getValueType();
+    unsigned NumElems = VT.getVectorNumElements();
-  isLeft = true;
-  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
-  if (!NumZeros) {
-    isLeft = false;
-    NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
-    if (!NumZeros)
-      return false;
+    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
+      return SDValue();
+  }
+
+  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return (Index == 0) ? V.getOperand(0)
+                        : DAG.getUNDEF(VT.getVectorElementType());
+
+  if (V.getOpcode() == ISD::BUILD_VECTOR)
+    return V.getOperand(Index);
+
+  return SDValue();
+}
+
+/// getNumOfConsecutiveZeros - Return the number of elements of a vector
+/// shuffle operation which consecutively come from zero. The search can
+/// start in two different directions, from left or right.
+static
+unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
+                                  bool ZerosFromLeft, SelectionDAG &DAG) {
+  int i = 0;
+
+  while (i < NumElems) {
+    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
+    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
+    if (!(Elt.getNode() &&
+         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
+      break;
+    ++i;
   }
+
+  return i;
+}
+
+/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to
+/// MaskE correspond consecutively to elements from one of the vector operands,
+/// starting from its index OpIdx. Also tell OpNum which source vector operand.
+static
+bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
+                              int OpIdx, int NumElems, unsigned &OpNum) {
   bool SeenV1 = false;
   bool SeenV2 = false;
-  for (unsigned i = NumZeros; i < NumElems; ++i) {
-    unsigned Val = isLeft ? (i - NumZeros) : i;
-    int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
-    if (Idx_ < 0)
+
+  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
+    int Idx = SVOp->getMaskElt(i);
+    // Ignore undef indices
+    if (Idx < 0)
       continue;
-    unsigned Idx = (unsigned) Idx_;
+
     if (Idx < NumElems)
       SeenV1 = true;
-    else {
-      Idx -= NumElems;
+    else
       SeenV2 = true;
-    }
-    if (Idx != Val)
+
+    // Only accept consecutive elements from the same vector
+    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
       return false;
   }
-  if (SeenV1 && SeenV2)
+
+  OpNum = SeenV1 ? 0 : 1;
+  return true;
+}
+
+/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
+/// logical right shift of a vector.
+static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
+              false /* check zeros from right */, DAG);
+  unsigned OpSrc;
+
+  if (!NumZeros)
+    return false;
+
+  // Considering the elements in the mask that are not consecutive zeros,
+  // check if they consecutively come from only one of the source vectors.
+ // + // V1 = {X, A, B, C} 0 + // \ \ \ / + // vector_shuffle V1, V2 <1, 2, 3, X> + // + if (!isShuffleMaskConsecutive(SVOp, + 0, // Mask Start Index + NumElems-NumZeros-1, // Mask End Index + NumZeros, // Where to start looking in the src vector + NumElems, // Number of elements in vector + OpSrc)) // Which source operand ? + return false; + + isLeft = false; + ShAmt = NumZeros; + ShVal = SVOp->getOperand(OpSrc); + return true; +} + +/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a +/// logical left shift of a vector. +static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, + bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { + unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); + unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, + true /* check zeros from left */, DAG); + unsigned OpSrc; + + if (!NumZeros) + return false; + + // Considering the elements in the mask that are not consecutive zeros, + // check if they consecutively come from only one of the source vectors. + // + // 0 { A, B, X, X } = V2 + // / \ / / + // vector_shuffle V1, V2 <X, X, 4, 5> + // + if (!isShuffleMaskConsecutive(SVOp, + NumZeros, // Mask Start Index + NumElems-1, // Mask End Index + 0, // Where to start looking in the src vector + NumElems, // Number of elements in vector + OpSrc)) // Which source operand ? return false; - ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); + isLeft = true; ShAmt = NumZeros; + ShVal = SVOp->getOperand(OpSrc); return true; } +/// isVectorShift - Returns true if the shuffle can be implemented as a +/// logical left or right shift of a vector. +static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, + bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { + if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || + isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) + return true; + + return false; +} /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. /// @@ -3779,9 +4162,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - // All zero's are handled with pxor, all one's are handled with pcmpeqd. - if (ISD::isBuildVectorAllZeros(Op.getNode()) - || ISD::isBuildVectorAllOnes(Op.getNode())) { + // All zero's are handled with pxor in SSE2 and above, xorps in SSE1. + // All one's are handled with pcmpeqd. In AVX, zero's are handled with + // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd + // is present, so AllOnes is ignored. + if (ISD::isBuildVectorAllZeros(Op.getNode()) || + (Op.getValueType().getSizeInBits() != 256 && + ISD::isBuildVectorAllOnes(Op.getNode()))) { // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are // eliminated on x86-32 hosts. @@ -3819,10 +4206,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } } - if (NumNonZero == 0) { - // All undef vector. Return an UNDEF. All zero vectors were handled above. + // All undef vector. Return an UNDEF. All zero vectors were handled above. + if (NumNonZero == 0) return DAG.getUNDEF(VT); - } // Special case for single non-zero, non-undef, element. 
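Tying the two recognizers together before the single-element special case below: once a mask passes isVectorShiftLeft or isVectorShiftRight, the whole shuffle is one PSLLDQ/PSRLDQ whose byte count is NumZeros times the element size. A sanity check on plain arrays for the <zero, zero, 4, 5> example pictured above (illustrative; assumes a little-endian host, as on x86):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main() {
      uint32_t V2[4] = {10, 11, 12, 13};
      uint32_t Shuf[4] = {0, 0, 10, 11};  // vector_shuffle V1, V2 <X, X, 4, 5>
      uint8_t Bytes[16] = {0};            // PSLLDQ view: move bytes upward
      memcpy(Bytes + 8, V2, 8);           // shift left by 2 elts * 4 bytes = 8
      assert(memcmp(Bytes, Shuf, 16) == 0);
    }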
if (NumNonZero == 1) { @@ -3960,7 +4346,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (EVTBits == 16 && NumElems == 8) { SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - *this); + *this); if (V.getNode()) return V; } @@ -4014,28 +4400,51 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (LD.getNode()) return LD; - // For SSE 4.1, use inserts into undef. + // For SSE 4.1, use insertps to put the high elements into the low element. if (getSubtarget()->hasSSE41()) { - V[0] = DAG.getUNDEF(VT); - for (unsigned i = 0; i < NumElems; ++i) - if (Op.getOperand(i).getOpcode() != ISD::UNDEF) - V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], + SDValue Result; + if (Op.getOperand(0).getOpcode() != ISD::UNDEF) + Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); + else + Result = DAG.getUNDEF(VT); + + for (unsigned i = 1; i < NumElems; ++i) { + if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getIntPtrConstant(i)); - return V[0]; + } + return Result; } - // Otherwise, expand into a number of unpckl* - // e.g. for v4f32 + // Otherwise, expand into a number of unpckl*, start by extending each of + // our (non-undef) elements to the full vector width with the element in the + // bottom slot of the vector (which generates no code for SSE). + for (unsigned i = 0; i < NumElems; ++i) { + if (Op.getOperand(i).getOpcode() != ISD::UNDEF) + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); + else + V[i] = DAG.getUNDEF(VT); + } + + // Next, we iteratively mix elements, e.g. for v4f32: // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> - for (unsigned i = 0; i < NumElems; ++i) - V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); - NumElems >>= 1; - while (NumElems != 0) { - for (unsigned i = 0; i < NumElems; ++i) - V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); - NumElems >>= 1; + unsigned EltStride = NumElems >> 1; + while (EltStride != 0) { + for (unsigned i = 0; i < EltStride; ++i) { + // If V[i+EltStride] is undef and this is the first round of mixing, + // then it is safe to just drop this shuffle: V[i] is already in the + // right place, the one element (since it's the first round) being + // inserted as undef can be dropped. This isn't safe for successive + // rounds because they will permute elements within both vectors. + if (V[i+EltStride].getOpcode() == ISD::UNDEF && + EltStride == NumElems/2) + continue; + + V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); + } + EltStride >>= 1; } return V[0]; } @@ -4074,10 +4483,10 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { // 2. [ssse3] 1 x pshufb // 3. [ssse3] 2 x pshufb + 1 x por // 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) -static -SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG, - const X86TargetLowering &TLI) { +SDValue +X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, + SelectionDAG &DAG) const { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); @@ -4128,7 +4537,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, // quads, disable the next transformation since it does not help SSSE3. 
bool V1Used = InputQuads[0] || InputQuads[1]; bool V2Used = InputQuads[2] || InputQuads[3]; - if (TLI.getSubtarget()->hasSSSE3()) { + if (Subtarget->hasSSSE3()) { if (InputQuads.count() == 2 && V1Used && V2Used) { BestLoQuad = InputQuads.find_first(); BestHiQuad = InputQuads.find_next(BestLoQuad); @@ -4187,15 +4596,21 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, // If we've eliminated the use of V2, and the new mask is a pshuflw or // pshufhw, that's as cheap as it gets. Return the new shuffle. if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { - return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, + unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; + unsigned TargetMask = 0; + NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); + TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): + X86::getShufflePSHUFLWImmediate(NewV.getNode()); + V1 = NewV.getOperand(0); + return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); } } // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. - if (TLI.getSubtarget()->hasSSSE3()) { + if (Subtarget->hasSSSE3()) { SmallVector<SDValue,16> pshufbMask; // If we have elements from both input vectors, set the high bit of the @@ -4262,6 +4677,12 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, MaskV.push_back(i); NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); + + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) + NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, + NewV.getOperand(0), + X86::getShufflePSHUFLWImmediate(NewV.getNode()), + DAG); } // If BestHi >= 0, generate a pshufhw to put the high elements in order, @@ -4284,6 +4705,12 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, } NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); + + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) + NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, + NewV.getOperand(0), + X86::getShufflePSHUFHWImmediate(NewV.getNode()), + DAG); } // In case BestHi & BestLo were both -1, which means each quadword has a word @@ -4473,7 +4900,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, SDValue V2 = SVOp->getOperand(1); unsigned NumElems = VT.getVectorNumElements(); unsigned NewWidth = (NumElems == 4) ? 2 : 4; - EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); + EVT MaskVT = (NewWidth == 4) ? 
MVT::v4i16 : MVT::v2i32;
   EVT NewVT = MaskVT;
   switch (VT.getSimpleVT().SimpleTy) {
   default: assert(false && "Unexpected!");
@@ -4697,6 +5124,129 @@ LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
 }

+static bool MayFoldVectorLoad(SDValue V) {
+  if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT)
+    V = V.getOperand(0);
+  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    V = V.getOperand(0);
+  if (MayFoldLoad(V))
+    return true;
+  return false;
+}
+
+static
+SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
+                        bool HasSSE2) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+
+  assert(VT != MVT::v2i64 && "unsupported shuffle type");
+
+  if (HasSSE2 && VT == MVT::v2f64)
+    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
+
+  // v4f32 or v4i32
+  return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
+}
+
+static
+SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+
+  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
+         "unsupported shuffle type");
+
+  if (V2.getOpcode() == ISD::UNDEF)
+    V2 = V1;
+
+  // v4i32 or v4f32
+  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
+}
+
+static
+SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
+  // operand of these instructions is only memory, so check if there's a
+  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
+  // same masks.
+  bool CanFoldLoad = false;
+
+  // Trivial case, when V2 comes from a load.
+  if (MayFoldVectorLoad(V2))
+    CanFoldLoad = true;
+
+  // When V1 is a load, it can be folded later into a store in isel, example:
+  // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
+  //    turns into:
+  // (MOVLPSmr addr:$src1, VR128:$src2)
+  // So, recognize this potential and also use MOVLPS or MOVLPD
+  if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
+    CanFoldLoad = true;
+
+  if (CanFoldLoad) {
+    if (HasSSE2 && NumElems == 2)
+      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
+
+    if (NumElems == 4)
+      return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
+  }
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  // movl and movlp will both match v2i64, but v2i64 is never matched by
+  // movl earlier because we make it strict to avoid messing with the movlp load
+  // folding logic (see the code above getMOVLP call). Match it here then,
+  // this is horrible, but will stay like this until we move all shuffle
+  // matching to x86 specific nodes. Note that for the 1st condition all
+  // types are matched with movsd.
+  if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
+    return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
+  else if (HasSSE2)
+    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
+
+
+  assert(VT != MVT::v4i32 && "unsupported shuffle type");
+
+  // Invert the operand order and use SHUFPS to match it.
+  return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1,
+                              X86::getShuffleSHUFImmediate(SVOp), DAG);
+}
+
+static inline unsigned getUNPCKLOpcode(EVT VT) {
+  switch(VT.getSimpleVT().SimpleTy) {
+  case MVT::v4i32: return X86ISD::PUNPCKLDQ;
+  case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
+  case MVT::v4f32: return X86ISD::UNPCKLPS;
+  case MVT::v2f64: return X86ISD::UNPCKLPD;
+  case MVT::v16i8: return X86ISD::PUNPCKLBW;
+  case MVT::v8i16: return X86ISD::PUNPCKLWD;
+  default:
+    llvm_unreachable("Unknown type for unpckl");
+  }
+  return 0;
+}
+
+static inline unsigned getUNPCKHOpcode(EVT VT) {
+  switch(VT.getSimpleVT().SimpleTy) {
+  case MVT::v4i32: return X86ISD::PUNPCKHDQ;
+  case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
+  case MVT::v4f32: return X86ISD::UNPCKHPS;
+  case MVT::v2f64: return X86ISD::UNPCKHPD;
+  case MVT::v16i8: return X86ISD::PUNPCKHBW;
+  case MVT::v8i16: return X86ISD::PUNPCKHWD;
+  default:
+    llvm_unreachable("Unknown type for unpckh");
+  }
+  return 0;
+}
+
 SDValue
 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
@@ -4710,6 +5260,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
   bool V1IsSplat = false;
   bool V2IsSplat = false;
+  bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
+  bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);

   if (isZeroShuffle(SVOp))
     return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
@@ -4718,7 +5272,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   if (SVOp->isSplat()) {
     if (isMMX || NumElems < 4)
       return Op;
-    return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
+    return PromoteSplat(SVOp, DAG);
   }

   // If the shuffle can be profitably rewritten as a narrower shuffle, then
@@ -4746,8 +5300,35 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
     }
   }

-  if (X86::isPSHUFDMask(SVOp))
-    return Op;
+  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
+  // unpckh_undef). Only use pshufd if speed is more important than size.
+  if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp))
+    if (VT != MVT::v2i64 && VT != MVT::v2f64)
+      return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
+  if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
+    if (VT != MVT::v2i64 && VT != MVT::v2f64)
+      return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+
+  if (X86::isPSHUFDMask(SVOp)) {
+    // The actual implementation will match the mask in the if above and then
+    // during isel it can match several different instructions, not only pshufd
+    // as its name says, sad but true, emulate the behavior for now...
+    if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
+      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
+
+    unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
+
+    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
+      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
+
+    if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
+      return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1,
+                                  TargetMask, DAG);
+
+    if (VT == MVT::v4f32)
+      return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1,
+                                  TargetMask, DAG);
+  }

   // Check if this can be converted into a logical shift.
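Before the logical-shift check continues below, a note on the mask shape the unpck helpers above are selected for: UNPCKL interleaves the low halves of its two sources, so with N lanes the shuffle mask is <0, N, 1, N+1, ...>, and getUNPCKLOpcode merely picks the instruction flavor matching the element type. A sketch of the mask itself (not of the X86::isUNPCKLMask predicate):

    #include <cassert>
    #include <vector>

    // Result lane 2*i comes from V1[i]; lane 2*i+1 from V2[i] (index N+i).
    static std::vector<int> unpcklMask(int NumElts) {
      std::vector<int> M;
      for (int i = 0; i != NumElts / 2; ++i) {
        M.push_back(i);
        M.push_back(NumElts + i);
      }
      return M;
    }

    int main() {
      std::vector<int> M = unpcklMask(4);  // the v4f32/v4i32 case
      int Expect[4] = {0, 4, 1, 5};        // matched by UNPCKLPS / PUNPCKLDQ
      for (int i = 0; i != 4; ++i)
        assert(M[i] == Expect[i]);
    }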
bool isLeft = false; @@ -4768,17 +5349,32 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return V2; if (ISD::isBuildVectorAllZeros(V1.getNode())) return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); - if (!isMMX) - return Op; + if (!isMMX && !X86::isMOVLPMask(SVOp)) { + if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) + return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); + + if (VT == MVT::v4i32 || VT == MVT::v4f32) + return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); + } } // FIXME: fold these into legal mask. - if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || - X86::isMOVSLDUPMask(SVOp) || - X86::isMOVHLPSMask(SVOp) || - X86::isMOVLHPSMask(SVOp) || - X86::isMOVLPMask(SVOp))) - return Op; + if (!isMMX) { + if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) + return getMOVLowToHigh(Op, dl, DAG, HasSSE2); + + if (X86::isMOVHLPSMask(SVOp)) + return getMOVHighToLow(Op, dl, DAG); + + if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) + return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); + + if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) + return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); + + if (X86::isMOVLPMask(SVOp)) + return getMOVLP(Op, dl, DAG, HasSSE2); + } if (ShouldXformToMOVHLPS(SVOp) || ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) @@ -4818,11 +5414,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getMOVL(DAG, dl, VT, V2, V1); } - if (X86::isUNPCKL_v_undef_Mask(SVOp) || - X86::isUNPCKH_v_undef_Mask(SVOp) || - X86::isUNPCKLMask(SVOp) || - X86::isUNPCKHMask(SVOp)) - return Op; + if (X86::isUNPCKLMask(SVOp)) + return (isMMX) ? + Op : getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); + + if (X86::isUNPCKHMask(SVOp)) + return (isMMX) ? + Op : getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); if (V2IsSplat) { // Normalize mask so all entries that point to V2 points to its first @@ -4844,11 +5442,14 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // FIXME: this seems wrong. SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); - if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || - X86::isUNPCKH_v_undef_Mask(NewSVOp) || - X86::isUNPCKLMask(NewSVOp) || - X86::isUNPCKHMask(NewSVOp)) - return NewOp; + + if (X86::isUNPCKLMask(NewSVOp)) + return (isMMX) ? + NewOp : getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); + + if (X86::isUNPCKHMask(NewSVOp)) + return (isMMX) ? + NewOp : getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); } // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. @@ -4857,15 +5458,52 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) return CommuteVectorShuffle(SVOp, DAG); - // Check for legal shuffle and return? - SmallVector<int, 16> PermMask; - SVOp->getMask(PermMask); - if (isShuffleMaskLegal(PermMask, VT)) + // The checks below are all present in isShuffleMaskLegal, but they are + // inlined here right now to enable us to directly emit target-specific + // nodes, and remove them one by one until they no longer return Op. + SmallVector<int, 16> M; + SVOp->getMask(M); + + // Very little shuffling can be done for 64-bit vectors right now. + if (VT.getSizeInBits() == 64) + return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ?
Op : SDValue(); + + // FIXME: pshufb, blends, shifts. + if (VT.getVectorNumElements() == 2 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + isPALIGNRMask(M, VT, Subtarget->hasSSSE3())) return Op; + if (isPSHUFHWMask(M, VT)) + return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, + X86::getShufflePSHUFHWImmediate(SVOp), + DAG); + + if (isPSHUFLWMask(M, VT)) + return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, + X86::getShufflePSHUFLWImmediate(SVOp), + DAG); + + if (isSHUFPMask(M, VT)) { + unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); + if (VT == MVT::v4f32 || VT == MVT::v4i32) + return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, + TargetMask, DAG); + if (VT == MVT::v2f64 || VT == MVT::v2i64) + return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, + TargetMask, DAG); + } + + if (X86::isUNPCKL_v_undef_Mask(SVOp)) + if (VT != MVT::v2i64 && VT != MVT::v2f64) + return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + if (X86::isUNPCKH_v_undef_Mask(SVOp)) + if (VT != MVT::v2i64 && VT != MVT::v2f64) + return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + // Handle v8i16 specifically since SSE can do byte extraction and insertion. if (VT == MVT::v8i16) { - SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); + SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); if (NewOp.getNode()) return NewOp; } @@ -6922,24 +7560,58 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(X86CC, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - // ptest intrinsics. The intrinsic these come from are designed to return - // an integer value, not just an instruction so lower it to the ptest - // pattern and a setcc for the result. + // ptest and testp intrinsics. The intrinsics these come from are designed to + // return an integer value, not just an instruction, so lower it to the ptest + // or testp pattern and a setcc for the result.
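Before the case labels that follow, it helps to pin down what these test instructions compute: ptest (and the AVX testp forms, which look only at the packed sign bits) set just ZF and CF, with ZF = ((a AND b) == 0) and CF = ((NOT a AND b) == 0); the *z, *c and *nzc intrinsic variants then materialize COND_E, COND_B and COND_A respectively. A scalar model of the flag computation (plain C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    struct Flags { bool ZF, CF; };

    // Scalar model of the ptest flag semantics on two packed sources.
    static Flags ptest(uint64_t a, uint64_t b) {
      Flags F = { (a & b) == 0, (~a & b) == 0 };
      return F;
    }

    int main() {
      Flags F = ptest(0x0F, 0xF0);
      assert(F.ZF && !F.CF);    // testz taken: no bits in common
      F = ptest(0xFF, 0x0F);
      assert(!F.ZF && F.CF);    // testc taken: b's bits all within a
      F = ptest(0x0F, 0xFF);
      assert(!F.ZF && !F.CF);   // testnzc taken: ZF == 0 && CF == 0
    }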
case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_sse41_ptestc: - case Intrinsic::x86_sse41_ptestnzc:{ + case Intrinsic::x86_sse41_ptestnzc: + case Intrinsic::x86_avx_ptestz_256: + case Intrinsic::x86_avx_ptestc_256: + case Intrinsic::x86_avx_ptestnzc_256: + case Intrinsic::x86_avx_vtestz_ps: + case Intrinsic::x86_avx_vtestc_ps: + case Intrinsic::x86_avx_vtestnzc_ps: + case Intrinsic::x86_avx_vtestz_pd: + case Intrinsic::x86_avx_vtestc_pd: + case Intrinsic::x86_avx_vtestnzc_pd: + case Intrinsic::x86_avx_vtestz_ps_256: + case Intrinsic::x86_avx_vtestc_ps_256: + case Intrinsic::x86_avx_vtestnzc_ps_256: + case Intrinsic::x86_avx_vtestz_pd_256: + case Intrinsic::x86_avx_vtestc_pd_256: + case Intrinsic::x86_avx_vtestnzc_pd_256: { + bool IsTestPacked = false; unsigned X86CC = 0; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); + case Intrinsic::x86_avx_vtestz_ps: + case Intrinsic::x86_avx_vtestz_pd: + case Intrinsic::x86_avx_vtestz_ps_256: + case Intrinsic::x86_avx_vtestz_pd_256: + IsTestPacked = true; // Fallthrough case Intrinsic::x86_sse41_ptestz: + case Intrinsic::x86_avx_ptestz_256: // ZF = 1 X86CC = X86::COND_E; break; + case Intrinsic::x86_avx_vtestc_ps: + case Intrinsic::x86_avx_vtestc_pd: + case Intrinsic::x86_avx_vtestc_ps_256: + case Intrinsic::x86_avx_vtestc_pd_256: + IsTestPacked = true; // Fallthrough case Intrinsic::x86_sse41_ptestc: + case Intrinsic::x86_avx_ptestc_256: // CF = 1 X86CC = X86::COND_B; break; + case Intrinsic::x86_avx_vtestnzc_ps: + case Intrinsic::x86_avx_vtestnzc_pd: + case Intrinsic::x86_avx_vtestnzc_ps_256: + case Intrinsic::x86_avx_vtestnzc_pd_256: + IsTestPacked = true; // Fallthrough case Intrinsic::x86_sse41_ptestnzc: + case Intrinsic::x86_avx_ptestnzc_256: // ZF and CF = 0 X86CC = X86::COND_A; break; @@ -6947,7 +7619,8 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS); + unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; + SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); SDValue CC = DAG.getConstant(X86CC, MVT::i8); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); @@ -7110,12 +7783,13 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Handler = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); - SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP, - getPointerTy()); + SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, + Subtarget->is64Bit() ? X86::RBP : X86::EBP, + getPointerTy()); unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); - SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, - DAG.getIntPtrConstant(-TD->getPointerSize())); + SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, + DAG.getIntPtrConstant(TD->getPointerSize())); StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); @@ -7218,7 +7892,8 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; if (InRegCount > 2) { - report_fatal_error("Nest register in use - reduce number of inreg parameters!"); + report_fatal_error("Nest register in use - reduce number of inreg" + " parameters!"); } } break; @@ -7439,6 +8114,86 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { return Res; } +SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue R = Op.getOperand(0); + + LLVMContext *Context = DAG.getContext(); + + assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); + + if (VT == MVT::v4i32) { + Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), + Op.getOperand(1), DAG.getConstant(23, MVT::i32)); + + ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); + + std::vector<Constant*> CV(4, CI); + Constant *C = ConstantVector::get(CV); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + PseudoSourceValue::getConstantPool(), 0, + false, false, 16); + + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); + Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); + Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); + return DAG.getNode(ISD::MUL, dl, VT, Op, R); + } + if (VT == MVT::v16i8) { + // a = a << 5; + Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), + Op.getOperand(1), DAG.getConstant(5, MVT::i32)); + + ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); + ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); + + std::vector<Constant*> CVM1(16, CM1); + std::vector<Constant*> CVM2(16, CM2); + Constant *C = ConstantVector::get(CVM1); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + PseudoSourceValue::getConstantPool(), 0, + false, false, 16); + + // r = pblendv(r, psllw(r & (char16)15, 4), a); + M = DAG.getNode(ISD::AND, dl, VT, R, M); + M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, + DAG.getConstant(4, MVT::i32)); + R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), + R, M, Op); + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + + C = ConstantVector::get(CVM2); + CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + PseudoSourceValue::getConstantPool(), 0, false, false, 16); + + // r = pblendv(r, psllw(r & (char16)63, 2), a); + M = DAG.getNode(ISD::AND, dl, VT, R, M); + M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, + DAG.getConstant(2, MVT::i32)); + R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + 
DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), + R, M, Op); + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + + // return pblendv(r, r+r, a); + R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), + R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); + return R; + } + return SDValue(); +} SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus @@ -7508,6 +8263,50 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { return Sum; } +SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ + DebugLoc dl = Op.getDebugLoc(); + + if (!Subtarget->hasSSE2()) { + SDValue Chain = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, + Subtarget->is64Bit() ? MVT::i64 : MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::ESP, MVT::i32), // Base + DAG.getTargetConstant(1, MVT::i8), // Scale + DAG.getRegister(0, MVT::i32), // Index + DAG.getTargetConstant(0, MVT::i32), // Disp + DAG.getRegister(0, MVT::i32), // Segment. + Zero, + Chain + }; + SDNode *Res = + DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, + array_lengthof(Ops)); + return SDValue(Res, 0); + } + + unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); + if (!isDev) + return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); + + unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); + + // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; + if (!Op1 && !Op2 && !Op3 && Op4) + return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); + + // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; + if (Op1 && !Op2 && !Op3 && !Op4) + return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); + + // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), + // (MFENCE)>; + return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); +} + SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { EVT T = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); @@ -7597,6 +8396,7 @@ SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); + case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); @@ -7640,6 +8440,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ: return LowerCTLZ(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL_V2I64(Op, DAG); + case ISD::SHL: return LowerSHL(Op, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: @@ -7852,6 +8653,40 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::AND: return "X86ISD::AND"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; + case X86ISD::TESTP: return "X86ISD::TESTP"; + case 
X86ISD::PALIGN: return "X86ISD::PALIGN"; + case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; + case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; + case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; + case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; + case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; + case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; + case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; + case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; + case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; + case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; + case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; + case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; + case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; + case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; + case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; + case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; + case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; + case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; + case X86ISD::MOVSD: return "X86ISD::MOVSD"; + case X86ISD::MOVSS: return "X86ISD::MOVSS"; + case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; + case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; + case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; + case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; + case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; + case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; + case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; + case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; + case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; + case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; + case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; + case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; } @@ -7863,6 +8698,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, const Type *Ty) const { // X86 supports extremely general addressing modes. CodeModel::Model M = getTargetMachine().getCodeModel(); + Reloc::Model R = getTargetMachine().getRelocationModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) @@ -7882,7 +8718,8 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, return false; // If lower 4G is not available, then we must use rip-relative addressing. - if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) + if ((M != CodeModel::Small || R != Reloc::Static) && + Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) return false; } @@ -8368,19 +9205,31 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, } // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 -// all of this code can be replaced with that in the .td file. +// or XMM0_V32I8 in AVX all of this code can be replaced with that +// in the .td file. MachineBasicBlock * X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, unsigned numArgs, bool memArg) const { + assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && + "Target must have SSE4.2 or AVX features enabled"); + DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); unsigned Opc; - if (memArg) - Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; - else - Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; + + if (!Subtarget->hasAVX()) { + if (memArg) + Opc = numArgs == 3 ? 
X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; + else + Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; + } else { + if (memArg) + Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; + else + Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; + } MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); @@ -8562,7 +9411,8 @@ X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, .addReg(X86::EAX, RegState::Implicit) .addReg(X86::ESP, RegState::Implicit) .addReg(X86::EAX, RegState::Define | RegState::Implicit) - .addReg(X86::ESP, RegState::Define | RegState::Implicit); + .addReg(X86::ESP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; @@ -8579,6 +9429,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); DebugLoc DL = MI->getDebugLoc(); MachineFunction *F = BB->getParent(); + bool IsWin64 = Subtarget->isTargetWin64(); assert(MI->getOperand(3).isGlobal() && "This should be a global"); @@ -8590,7 +9441,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, MI->getOperand(3).getTargetFlags()) .addReg(0); - MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); + MIB = BuildMI(*BB, MI, DL, TII->get(IsWin64 ? X86::WINCALL64m : X86::CALL64m)); addDirectMem(MIB, X86::RDI); } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, @@ -8727,12 +9578,16 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, } // String/text processing lowering. case X86::PCMPISTRM128REG: + case X86::VPCMPISTRM128REG: return EmitPCMP(MI, BB, 3, false /* in-mem */); case X86::PCMPISTRM128MEM: + case X86::VPCMPISTRM128MEM: return EmitPCMP(MI, BB, 3, true /* in-mem */); case X86::PCMPESTRM128REG: + case X86::VPCMPESTRM128REG: return EmitPCMP(MI, BB, 5, false /* in mem */); case X86::PCMPESTRM128MEM: + case X86::VPCMPESTRM128MEM: return EmitPCMP(MI, BB, 5, true /* in mem */); // Atomic Lowering. @@ -8966,21 +9821,20 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI) { DebugLoc dl = N->getDebugLoc(); EVT VT = N->getValueType(0); - ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); if (VT.getSizeInBits() != 128) return SDValue(); SmallVector<SDValue, 16> Elts; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) - Elts.push_back(DAG.getShuffleScalarElt(SVN, i)); - + Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); + return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); } -/// PerformShuffleCombine - Detect vector gather/scatter index generation -/// and convert it from being a bunch of shuffles and extracts to a simple -/// store and scalar loads to extract the elements. +/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index +/// generation and convert it from being a bunch of shuffles and extracts +/// to a simple store and scalar loads to extract the elements. static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI) { SDValue InputVector = N->getOperand(0); @@ -9030,8 +9884,8 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, // Store the value to a temporary stack slot. 
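For the extract-element combine in progress here: rather than shuffling and re-extracting each lane, the vector is spilled once to a stack temporary and every use becomes a cheap scalar load at offset EltSize * Idx. A self-contained model of that addressing, with memcpy standing in for the store and the loads (plain C++, not the DAG code):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // "Vector" value and a stack temporary, as in the combine below.
      uint32_t vec[4] = {11, 22, 33, 44};
      unsigned char slot[sizeof vec];
      std::memcpy(slot, vec, sizeof vec);       // one store of the vector

      // Each extract becomes a scalar load at offset = EltSize * Idx.
      for (unsigned idx = 0; idx < 4; ++idx) {
        uint32_t elt;
        std::memcpy(&elt, slot + sizeof(uint32_t) * idx, sizeof elt);
        assert(elt == vec[idx]);
      }
    }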
SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); - SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 0, - false, false, 0); + SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, + 0, false, false, 0); // Replace each use (extract) with a load of the appropriate element. for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), @@ -9045,11 +9899,12 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); - SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), OffsetVal, StackPtr); + SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), + OffsetVal, StackPtr); // Load the scalar. - SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, ScalarAddr, - NULL, 0, false, false, 0); + SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, + ScalarAddr, NULL, 0, false, false, 0); // Replace the extract with the load. DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); @@ -9087,8 +9942,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. - if (!FiniteOnlyFPMath() && - (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) { + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; @@ -9126,8 +9980,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. - if (!FiniteOnlyFPMath() && - (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) { + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; @@ -9156,8 +10009,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // cause it to handle NaNs incorrectly. if (!UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { - if (!FiniteOnlyFPMath() && - (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } @@ -9182,8 +10034,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETULT: // Converting this to a max would handle NaNs incorrectly. - if (!FiniteOnlyFPMath() && - (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; Opcode = X86ISD::FMAX; break; @@ -9193,8 +10044,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // cause it to handle NaNs incorrectly.
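The NaN and signed-zero caveats repeated through this combine follow from the SSE semantics: minss/minps compute "a < b ? a : b", so any comparison involving NaN is false and the second operand comes back, and min(+0, -0) differs from min(-0, +0). A scalar demonstration of why the operands cannot be swapped freely (the ternary below is exactly the documented minss behavior):

    #include <cassert>
    #include <cmath>
    #include <limits>

    // minss(a, b) = a < b ? a : b.  The comparison is false for NaN and
    // for +0.0 < -0.0 (they compare equal), so b is returned in both cases.
    static float minss(float a, float b) { return a < b ? a : b; }

    int main() {
      const float nan = std::numeric_limits<float>::quiet_NaN();

      assert(minss(nan, 1.0f) == 1.0f);            // NaN in a: returns b
      assert(std::isnan(minss(1.0f, nan)));        // NaN in b: returns NaN
      assert(std::signbit(minss(+0.0f, -0.0f)));   // returns -0.0
      assert(!std::signbit(minss(-0.0f, +0.0f)));  // returns +0.0
    }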
if (!UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { - if (!FiniteOnlyFPMath() && - (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } @@ -9905,7 +10755,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; - case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); case ISD::EXTRACT_VECTOR_ELT: return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); @@ -9922,6 +10771,28 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); + case X86ISD::SHUFPS: // Handle all target specific shuffles + case X86ISD::SHUFPD: + case X86ISD::PUNPCKHBW: + case X86ISD::PUNPCKHWD: + case X86ISD::PUNPCKHDQ: + case X86ISD::PUNPCKHQDQ: + case X86ISD::UNPCKHPS: + case X86ISD::UNPCKHPD: + case X86ISD::PUNPCKLBW: + case X86ISD::PUNPCKLWD: + case X86ISD::PUNPCKLDQ: + case X86ISD::PUNPCKLQDQ: + case X86ISD::UNPCKLPS: + case X86ISD::UNPCKLPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLHPS: + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); } return SDValue(); @@ -9956,14 +10827,6 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { } } -static bool MayFoldLoad(SDValue Op) { - return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); -} - -static bool MayFoldIntoStore(SDValue Op) { - return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); -} - /// IsDesirableToPromoteOp - This method query the target whether it is /// beneficial for dag combiner to promote the specified node. If true, it /// should return the desired promotion type by reference. diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 4e4daa4bc5ca9..d2d9b28a03967 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -248,6 +248,44 @@ namespace llvm { // PTEST - Vector bitwise comparisons PTEST, + // TESTP - Vector packed fp sign bitwise comparisons + TESTP, + + // Several flavors of instructions with vector shuffle behaviors. + PALIGN, + PSHUFD, + PSHUFHW, + PSHUFLW, + PSHUFHW_LD, + PSHUFLW_LD, + SHUFPD, + SHUFPS, + MOVDDUP, + MOVSHDUP, + MOVSLDUP, + MOVSHDUP_LD, + MOVSLDUP_LD, + MOVLHPS, + MOVLHPD, + MOVHLPS, + MOVHLPD, + MOVLPS, + MOVLPD, + MOVSD, + MOVSS, + UNPCKLPS, + UNPCKLPD, + UNPCKHPS, + UNPCKHPD, + PUNPCKLBW, + PUNPCKLWD, + PUNPCKLDQ, + PUNPCKLQDQ, + PUNPCKHBW, + PUNPCKHWD, + PUNPCKHDQ, + PUNPCKHQDQ, + // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack, // according to %al. An operator is needed so that this can be expanded // with control flow. @@ -265,7 +303,13 @@ namespace llvm { ATOMXOR64_DAG, ATOMAND64_DAG, ATOMNAND64_DAG, - ATOMSWAP64_DAG + ATOMSWAP64_DAG, + + // Memory barrier + MEMBARRIER, + MFENCE, + SFENCE, + LFENCE // WARNING: Do not add anything in the end unless you want the node to // have memop! In fact, starting from ATOMADD64_DAG all opcodes will be @@ -584,12 +628,19 @@ namespace llvm { /// getFunctionAlignment - Return the Log2 alignment of this function. 
virtual unsigned getFunctionAlignment(const Function *F) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const; + /// getStackCookieLocation - Return true if the target stores stack /// protector cookies at a fixed offset in some non-standard address /// space, and populates the address space and offset as /// appropriate. virtual bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const; + protected: + std::pair<const TargetRegisterClass*, uint8_t> + findRepresentativeClass(EVT VT) const; + private: /// Subtarget - Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. @@ -710,11 +761,16 @@ namespace llvm { SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSHL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const; SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const; + + // Utility functions to help LowerVECTOR_SHUFFLE + SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, SelectionDAG &DAG) const; virtual SDValue LowerFormalArguments(SDValue Chain, diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td index 42d0e7f9778ab..0884b61425e9e 100644 --- a/lib/Target/X86/X86Instr64bit.td +++ b/lib/Target/X86/X86Instr64bit.td @@ -73,11 +73,7 @@ def GetLo32XForm : SDNodeXForm<imm, [{ return getI32Imm((unsigned)N->getZExtValue()); }]>; -def i64immSExt32 : PatLeaf<(i64 imm), [{ - // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit - // sign extended field. - return (int64_t)N->getZExtValue() == (int32_t)N->getZExtValue(); -}]>; +def i64immSExt32 : PatLeaf<(i64 imm), [{ return i64immSExt32(N); }]>; def i64immZExt32 : PatLeaf<(i64 imm), [{ @@ -158,7 +154,7 @@ let isCall = 1 in // FIXME: We need to teach codegen about single list of call-clobbered // registers. -let isCall = 1 in +let isCall = 1, isCodeGenOnly = 1 in // All calls clobber the non-callee saved registers. RSP is marked as // a use to prevent stack-pointer assignments that appear immediately // before calls from potentially appearing dead. 
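On the i64immSExt32 change above (the predicate body now forwards to a C++ helper of the same name): it accepts exactly the immediates that survive truncation to i32 followed by sign extension, i.e. those encodable in x86-64's sign-extended 32-bit immediate fields. A standalone check of that round trip (plain C++):

    #include <cassert>
    #include <cstdint>

    // True iff the immediate fits x86-64's sign-extended 32-bit field:
    // truncating to i32 and sign-extending back must be lossless.
    static bool fitsSExt32(int64_t Imm) {
      return (int64_t)(int32_t)Imm == Imm;
    }

    int main() {
      assert(fitsSExt32(-1));            // all-ones sign-extends fine
      assert(fitsSExt32(INT32_MIN));
      assert(!fitsSExt32(1LL << 31));    // 0x80000000 needs zero-extension
      assert(!fitsSExt32(1LL << 40));
    }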
Uses for argument @@ -168,7 +164,7 @@ let isCall = 1 in MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, EFLAGS], Uses = [RSP] in { - def WINCALL64pcrel32 : I<0xE8, RawFrm, + def WINCALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, (outs), (ins i64i32imm_pcrel:$dst, variable_ops), "call\t$dst", []>, Requires<[IsWin64]>; @@ -182,7 +178,8 @@ let isCall = 1 in } -let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + isCodeGenOnly = 1 in let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, @@ -216,9 +213,9 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst), "jmp{q}\t$dst", []>; def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst", - [(brind GR64:$dst)]>; + [(brind GR64:$dst)]>, Requires<[In64BitMode]>; def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst", - [(brind (loadi64 addr:$dst))]>; + [(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>; def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst), "ljmp{q}\t{*}$dst", []>; } @@ -246,7 +243,7 @@ def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in def LEAVE64 : I<0xC9, RawFrm, - (outs), (ins), "leave", []>; + (outs), (ins), "leave", []>, Requires<[In64BitMode]>; let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in { let mayLoad = 1 in { def POP64r : I<0x58, AddRegFrm, @@ -330,7 +327,7 @@ def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>; // Fast system-call instructions def SYSEXIT64 : RI<0x35, RawFrm, - (outs), (ins), "sysexit", []>, TB; + (outs), (ins), "sysexit", []>, TB, Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// // Move Instructions... @@ -374,6 +371,7 @@ def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), [(store i64immSExt32:$src, addr:$dst)]>; /// Versions of MOV64rr, MOV64rm, and MOV64mr for i64mem_TC and GR64_TC. +let isCodeGenOnly = 1 in { let neverHasSideEffects = 1 in def MOV64rr_TC : RI<0x89, MRMDestReg, (outs GR64_TC:$dst), (ins GR64_TC:$src), "mov{q}\t{$src, $dst|$dst, $src}", []>; @@ -388,7 +386,13 @@ let mayStore = 1 in def MOV64mr_TC : RI<0x89, MRMDestMem, (outs), (ins i64mem_TC:$dst, GR64_TC:$src), "mov{q}\t{$src, $dst|$dst, $src}", []>; +} +// FIXME: These definitions are utterly broken +// Just leave them commented out for now because they're useless outside +// of the large code model, and most compilers won't generate the instructions +// in question. 
+/* def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src), "mov{q}\t{$src, %rax|%rax, $src}", []>; def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src), @@ -397,6 +401,7 @@ def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins), "mov{q}\t{%rax, $dst|$dst, %rax}", []>; def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins), "mov{q}\t{%rax, $dst|$dst, %rax}", []>; +*/ // Moves to and from segment registers def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src), @@ -1316,14 +1321,13 @@ def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), [] >, TB; -def BT64ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), +def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB, - REX_W; + [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB; // Note that these instructions don't need FastBTMem because that // only applies when the other operand is in a register. When it's // an immediate, bt is still fast. -def BT64mi8 : Ii8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), +def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi64 addr:$src1), i64immSExt8:$src2))]>, TB; @@ -1537,116 +1541,6 @@ def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C64r)>; //===----------------------------------------------------------------------===// -// Conversion Instructions... -// - -// f64 -> signed i64 -def CVTSD2SI64rr: RSDI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src), - "cvtsd2si{q}\t{$src, $dst|$dst, $src}", []>; -def CVTSD2SI64rm: RSDI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f64mem:$src), - "cvtsd2si{q}\t{$src, $dst|$dst, $src}", []>; -def Int_CVTSD2SI64rr: RSDI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "cvtsd2si{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, - (int_x86_sse2_cvtsd2si64 VR128:$src))]>; -def Int_CVTSD2SI64rm: RSDI<0x2D, MRMSrcMem, (outs GR64:$dst), - (ins f128mem:$src), - "cvtsd2si{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (int_x86_sse2_cvtsd2si64 - (load addr:$src)))]>; -def CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src), - "cvttsd2si{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (fp_to_sint FR64:$src))]>; -def CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f64mem:$src), - "cvttsd2si{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (fp_to_sint (loadf64 addr:$src)))]>; -def Int_CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "cvttsd2si{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, - (int_x86_sse2_cvttsd2si64 VR128:$src))]>; -def Int_CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (outs GR64:$dst), - (ins f128mem:$src), - "cvttsd2si{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, - (int_x86_sse2_cvttsd2si64 - (load addr:$src)))]>; - -// Signed i64 -> f64 -def CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), - "cvtsi2sd{q}\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (sint_to_fp GR64:$src))]>; -def CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), - "cvtsi2sd{q}\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (sint_to_fp (loadi64 addr:$src)))]>; - -let Constraints = "$src1 = $dst" in { -def Int_CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, GR64:$src2), - "cvtsi2sd{q}\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - 
(int_x86_sse2_cvtsi642sd VR128:$src1, - GR64:$src2))]>; -def Int_CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2), - "cvtsi2sd{q}\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (int_x86_sse2_cvtsi642sd VR128:$src1, - (loadi64 addr:$src2)))]>; -} // Constraints = "$src1 = $dst" - -// Signed i64 -> f32 -def CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR64:$src), - "cvtsi2ss{q}\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (sint_to_fp GR64:$src))]>; -def CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i64mem:$src), - "cvtsi2ss{q}\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (sint_to_fp (loadi64 addr:$src)))]>; - -let Constraints = "$src1 = $dst" in { - def Int_CVTSI2SS64rr : RSSI<0x2A, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, GR64:$src2), - "cvtsi2ss{q}\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (int_x86_sse_cvtsi642ss VR128:$src1, - GR64:$src2))]>; - def Int_CVTSI2SS64rm : RSSI<0x2A, MRMSrcMem, - (outs VR128:$dst), - (ins VR128:$src1, i64mem:$src2), - "cvtsi2ss{q}\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (int_x86_sse_cvtsi642ss VR128:$src1, - (loadi64 addr:$src2)))]>; -} // Constraints = "$src1 = $dst" - -// f32 -> signed i64 -def CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src), - "cvtss2si{q}\t{$src, $dst|$dst, $src}", []>; -def CVTSS2SI64rm: RSSI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src), - "cvtss2si{q}\t{$src, $dst|$dst, $src}", []>; -def Int_CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "cvtss2si{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, - (int_x86_sse_cvtss2si64 VR128:$src))]>; -def Int_CVTSS2SI64rm: RSSI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src), - "cvtss2si{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (int_x86_sse_cvtss2si64 - (load addr:$src)))]>; -def CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src), - "cvttss2si{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (fp_to_sint FR32:$src))]>; -def CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src), - "cvttss2si{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (fp_to_sint (loadf32 addr:$src)))]>; -def Int_CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "cvttss2si{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, - (int_x86_sse_cvttss2si64 VR128:$src))]>; -def Int_CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (outs GR64:$dst), - (ins f32mem:$src), - "cvttss2si{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, - (int_x86_sse_cvttss2si64 (load addr:$src)))]>; - // Descriptor-table support instructions // LLDT is not interpreted specially in 64-bit mode because there is no sign @@ -1726,6 +1620,14 @@ def MOV64FSrm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), // Atomic Instructions //===----------------------------------------------------------------------===// +// TODO: Get this to fold the constant into the instruction. +let hasSideEffects = 1, Defs = [ESP] in +def Int_MemBarrierNoSSE64 : RI<0x09, MRM1r, (outs), (ins GR64:$zero), + "lock\n\t" + "or{q}\t{$zero, (%rsp)|(%rsp), $zero}", + [(X86MemBarrierNoSSE GR64:$zero)]>, + Requires<[In64BitMode]>, LOCK; + let Defs = [RAX, EFLAGS], Uses = [RAX] in { def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap), "lock\n\t" @@ -1772,7 +1674,7 @@ def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src), // Optimized codegen when the non-memory output is not used. 
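The Int_MemBarrierNoSSE64 definition above is the 64-bit counterpart of the no-SSE2 path in LowerMEMBARRIER earlier in this change: a locked or of zero into the top of the stack serves as a full barrier when no fence instruction is available. When fences are available, the lowering picks one from the membarrier operands; a sketch of that selection table, naming the four ordering operands only by position as the def : Pat comments do (their precise load/store meanings are an assumption here):

    #include <cassert>

    enum Fence { SFENCE, LFENCE, MFENCE };

    // Mirrors the device-barrier selection in LowerMEMBARRIER: only the
    // fourth operand set -> sfence, only the first -> lfence, anything
    // else -> the full mfence.
    static Fence pickFence(bool Op1, bool Op2, bool Op3, bool Op4) {
      if (!Op1 && !Op2 && !Op3 && Op4) return SFENCE;
      if (Op1 && !Op2 && !Op3 && !Op4) return LFENCE;
      return MFENCE;
    }

    int main() {
      assert(pickFence(false, false, false, true) == SFENCE);
      assert(pickFence(true, false, false, false) == LFENCE);
      assert(pickFence(true, true, true, true) == MFENCE);
    }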
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1 in { // FIXME: Use normal add / sub instructions and add lock prefix dynamically. -def LOCK_ADD64mr : RI<0x03, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), +def LOCK_ADD64mr : RI<0x01, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), "lock\n\t" "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; def LOCK_ADD64mi8 : RIi8<0x83, MRM0m, (outs), diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td new file mode 100644 index 0000000000000..d868773d2d690 --- /dev/null +++ b/lib/Target/X86/X86InstrFMA.td @@ -0,0 +1,60 @@ +//====- X86InstrFMA.td - Describe the X86 Instruction Set --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes FMA (Fused Multiply-Add) instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FMA3 - Intel 3 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + +multiclass fma_rm<bits<8> opc, string OpcodeStr> { + def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; + def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; + def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; + def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; +} + +multiclass fma_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpcodeStr, string PackTy> { + defm r132 : fma_rm<opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>; + defm r213 : fma_rm<opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>; + defm r231 : fma_rm<opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>; +} + +let isAsmParserOnly = 1 in { + // Fused Multiply-Add + defm VFMADDPS : fma_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps">; + defm VFMADDPD : fma_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd">, VEX_W; + defm VFMADDSUBPS : fma_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps">; + defm VFMADDSUBPD : fma_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd">, VEX_W; + defm VFMSUBADDPS : fma_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps">; + defm VFMSUBADDPD : fma_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd">, VEX_W; + defm VFMSUBPS : fma_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps">; + defm VFMSUBPD : fma_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd">, VEX_W; + + // Fused Negative Multiply-Add + defm VFNMADDPS : fma_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps">; + defm VFNMADDPD : fma_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd">, VEX_W; + defm VFNMSUBPS : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">; + defm VFNMSUBPD : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W; +} diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index da93de988d506..9c9bcc7d0b6a0 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -108,10 +108,6 @@ let usesCustomInserter = 1 in { // Expanded 
after instruction selection. [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>; } -let isTerminator = 1 in - let Defs = [FP0, FP1, FP2, FP3, FP4, FP5, FP6] in - def FP_REG_KILL : I<0, Pseudo, (outs), (ins), "##FP_REG_KILL", []>; - // All FP Stack operations are represented with four instructions here. The // first three instructions, generated by the instruction selector, use "RFP32" // "RFP64" or "RFP80" registers: traditional register files to reference 32-bit, @@ -157,7 +153,7 @@ def FpSET_ST1_64 : FpI_<(outs), (ins RFP64:$src), SpecialFP, []>; // ST(1) = FPR def FpSET_ST1_80 : FpI_<(outs), (ins RFP80:$src), SpecialFP, []>; // ST(1) = FPR } -// FpIf32, FpIf64 - Floating Point Psuedo Instruction template. +// FpIf32, FpIf64 - Floating Point Pseudo Instruction template. // f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1. // f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2. // f80 instructions cannot use SSE and use neither of these. diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index cc3fdf1efd7b6..79187e9a76d70 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -39,6 +39,7 @@ def MRM_E8 : Format<39>; def MRM_F0 : Format<40>; def MRM_F8 : Format<41>; def MRM_F9 : Format<42>; +def RawFrmImm16 : Format<43>; // ImmType - This specifies the immediate type used by an instruction. This is // part of the ad-hoc solution used to emit machine instruction encodings by our @@ -210,7 +211,7 @@ class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, class FPI<bits<8> o, Format F, dag outs, dag ins, string asm> : I<o, F, outs, ins, asm, []> {} -// FpI_ - Floating Point Psuedo Instruction template. Not Predicated. +// FpI_ - Floating Point Pseudo Instruction template. Not Predicated. class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern> : X86Inst<0, Pseudo, NoImm, outs, ins, ""> { let FPForm = fp; @@ -224,13 +225,13 @@ class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern> // Iseg32 - 16-bit segment selector, 32-bit offset class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern> : X86Inst<o, f, NoImm, outs, ins, asm> { + list<dag> pattern> : X86Inst<o, f, Imm16, outs, ins, asm> { let Pattern = pattern; let CodeSize = 3; } class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern> : X86Inst<o, f, NoImm, outs, ins, asm> { + list<dag> pattern> : X86Inst<o, f, Imm32, outs, ins, asm> { let Pattern = pattern; let CodeSize = 3; } @@ -411,6 +412,20 @@ class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, Requires<[HasSSE42]>; +// AVX Instruction Templates: +// Instructions introduced in AVX (no SSE equivalent forms) +// +// AVX8I - AVX instructions with T8 and OpSize prefix. +// AVXAIi8 - AVX instructions with TA, OpSize prefix and ImmT = Imm8. 
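Looking back at the FMA3 multiclasses in the new X86InstrFMA.td above: the 132/213/231 digits encode which sources are multiplied and which is added, with operand 1 always the destination (vfmadd132: dst = dst*src3 + src2; vfmadd213: dst = src2*dst + src3; vfmadd231: dst = src2*src3 + dst). The defs are assembler-only for now (isAsmParserOnly = 1, no patterns), so the semantics stated here are the commonly documented ones rather than anything in this patch; a scalar check of the three orderings:

    #include <cassert>
    #include <cmath>

    int main() {
      double a = 2, b = 3, c = 5;        // a = op1 (dst), b = op2, c = op3
      assert(std::fma(a, c, b) == 13.0); // 132 form: a*c + b
      assert(std::fma(b, a, c) == 11.0); // 213 form: b*a + c
      assert(std::fma(b, c, a) == 17.0); // 231 form: b*c + a
    }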
+class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, OpSize, + Requires<[HasAVX]>; +class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, OpSize, + Requires<[HasAVX]>; + // AES Instruction Templates: // // AES8I @@ -425,6 +440,18 @@ class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm, : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, Requires<[HasAES]>; +// CLMUL Instruction Templates +class CLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + OpSize, VEX_4V, Requires<[HasAVX, HasCLMUL]>; + +// FMA3 Instruction Templates +class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + OpSize, VEX_4V, Requires<[HasFMA3]>; + // X86-64 Instruction templates... // diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 71c4e8bc147fa..01149b699213d 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -117,9 +117,67 @@ def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>; def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>; def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, - SDTCisVT<1, v4f32>, - SDTCisVT<2, v4f32>]>; + SDTCisVec<1>, + SDTCisSameAs<2, 1>]>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; +def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; + +// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get +// translated into one of the target nodes below during lowering. +// Note: this is a work in progress... 
+def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; + +def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>]>; +def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisInt<3>]>; + +def SDTShuff2OpLdI : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisPtrTy<1>, + SDTCisInt<2>]>; + +def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>; + +def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; +def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; +def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>; + +def X86PShufhwLd : SDNode<"X86ISD::PSHUFHW_LD", SDTShuff2OpLdI>; +def X86PShuflwLd : SDNode<"X86ISD::PSHUFLW_LD", SDTShuff2OpLdI>; + +def X86Shufpd : SDNode<"X86ISD::SHUFPD", SDTShuff3OpI>; +def X86Shufps : SDNode<"X86ISD::SHUFPS", SDTShuff3OpI>; + +def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>; +def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>; +def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>; + +def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>; +def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>; + +def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>; +def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>; +def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; +def X86Movhlpd : SDNode<"X86ISD::MOVHLPD", SDTShuff2Op>; + +def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; +def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; + +def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>; +def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>; +def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>; +def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>; + +def X86Punpcklbw : SDNode<"X86ISD::PUNPCKLBW", SDTShuff2Op>; +def X86Punpcklwd : SDNode<"X86ISD::PUNPCKLWD", SDTShuff2Op>; +def X86Punpckldq : SDNode<"X86ISD::PUNPCKLDQ", SDTShuff2Op>; +def X86Punpcklqdq : SDNode<"X86ISD::PUNPCKLQDQ", SDTShuff2Op>; + +def X86Punpckhbw : SDNode<"X86ISD::PUNPCKHBW", SDTShuff2Op>; +def X86Punpckhwd : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>; +def X86Punpckhdq : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>; +def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>; //===----------------------------------------------------------------------===// // SSE Complex Patterns @@ -148,12 +206,13 @@ def sdmem : Operand<v2f64> { // SSE pattern fragments //===----------------------------------------------------------------------===// +// 128-bit load pattern fragments def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>; def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; -// FIXME: move this to a more appropriate place after all AVX is done. 
+// 256-bit load pattern fragments def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; def loadv8i32 : PatFrag<(ops node:$ptr), (v8i32 (load node:$ptr))>; @@ -174,6 +233,8 @@ def alignedloadfsf32 : PatFrag<(ops node:$ptr), (f32 (alignedload node:$ptr))>; def alignedloadfsf64 : PatFrag<(ops node:$ptr), (f64 (alignedload node:$ptr))>; + +// 128-bit aligned load pattern fragments def alignedloadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (alignedload node:$ptr))>; def alignedloadv2f64 : PatFrag<(ops node:$ptr), @@ -183,7 +244,7 @@ def alignedloadv4i32 : PatFrag<(ops node:$ptr), def alignedloadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (alignedload node:$ptr))>; -// FIXME: move this to a more appropriate place after all AVX is done. +// 256-bit aligned load pattern fragments def alignedloadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (alignedload node:$ptr))>; def alignedloadv4f64 : PatFrag<(ops node:$ptr), @@ -206,15 +267,20 @@ def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{ def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>; def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>; + +// 128-bit memop pattern fragments def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>; def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>; -// FIXME: move this to a more appropriate place after all AVX is done. +// 256-bit memop pattern fragments +def memopv32i8 : PatFrag<(ops node:$ptr), (v32i8 (memop node:$ptr))>; def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>; def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>; +def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>; +def memopv8i32 : PatFrag<(ops node:$ptr), (v8i32 (memop node:$ptr))>; // SSSE3 uses MMX registers for some instructions. They aren't aligned on a // 16-byte boundary. 
@@ -254,6 +320,7 @@ def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), return false; }]>; +// 128-bit bitconvert pattern fragments def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>; @@ -261,6 +328,9 @@ def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>; def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>; def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>; +// 256-bit bitconvert pattern fragments +def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>; + def vzmovl_v2i64 : PatFrag<(ops node:$src), (bitconvert (v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index ce471eadd78b2..5280940cf437b 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -235,6 +235,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::BT64ri8, X86::BT64mi8, 1, 0 }, { X86::CALL32r, X86::CALL32m, 1, 0 }, { X86::CALL64r, X86::CALL64m, 1, 0 }, + { X86::WINCALL64r, X86::WINCALL64m, 1, 0 }, { X86::CMP16ri, X86::CMP16mi, 1, 0 }, { X86::CMP16ri8, X86::CMP16mi8, 1, 0 }, { X86::CMP16rr, X86::CMP16mr, 1, 0 }, @@ -667,46 +668,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(AmbEntries.empty() && "Duplicated entries in unfolding maps?"); } -bool X86InstrInfo::isMoveInstr(const MachineInstr& MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const { - switch (MI.getOpcode()) { - default: - return false; - case X86::MOV8rr: - case X86::MOV8rr_NOREX: - case X86::MOV16rr: - case X86::MOV32rr: - case X86::MOV64rr: - case X86::MOV32rr_TC: - case X86::MOV64rr_TC: - - // FP Stack register class copies - case X86::MOV_Fp3232: case X86::MOV_Fp6464: case X86::MOV_Fp8080: - case X86::MOV_Fp3264: case X86::MOV_Fp3280: - case X86::MOV_Fp6432: case X86::MOV_Fp8032: - - // Note that MOVSSrr and MOVSDrr are not considered copies. FR32 and FR64 - // copies are done with FsMOVAPSrr and FsMOVAPDrr. 
- - case X86::FsMOVAPSrr: - case X86::FsMOVAPDrr: - case X86::MOVAPSrr: - case X86::MOVAPDrr: - case X86::MOVDQArr: - case X86::MMX_MOVQ64rr: - assert(MI.getNumOperands() >= 2 && - MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - "invalid register-register move instruction"); - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - SrcSubIdx = MI.getOperand(1).getSubReg(); - DstSubIdx = MI.getOperand(0).getSubReg(); - return true; - } -} - bool X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, @@ -827,7 +788,7 @@ static bool isFrameStoreOpcode(int Opcode) { unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { if (isFrameLoadOpcode(MI->getOpcode())) - if (isFrameOperand(MI, 1, FrameIndex)) + if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex)) return MI->getOperand(0).getReg(); return 0; } @@ -866,7 +827,8 @@ bool X86InstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const { if (isFrameStoreOpcode(MI->getOpcode())) - if (isFrameOperand(MI, 0, FrameIndex)) + if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 && + isFrameOperand(MI, 0, FrameIndex)) return MI->getOperand(X86::AddrNumOperands).getReg(); return 0; } @@ -1664,14 +1626,6 @@ bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { return !isPredicated(MI); } -// For purposes of branch analysis do not count FP_REG_KILL as a terminator. -static bool isBrAnalysisUnpredicatedTerminator(const MachineInstr *MI, - const X86InstrInfo &TII) { - if (MI->getOpcode() == X86::FP_REG_KILL) - return false; - return TII.isUnpredicatedTerminator(MI); -} - bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, @@ -1688,7 +1642,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // Working from the bottom, when we see a non-terminator instruction, we're // done. - if (!isBrAnalysisUnpredicatedTerminator(I, *this)) + if (!isUnpredicatedTerminator(I)) break; // A terminator that isn't a branch can't easily be handled by this @@ -1891,6 +1845,33 @@ static bool isHReg(unsigned Reg) { return X86::GR8_ABCD_HRegClass.contains(Reg); } +// Try and copy between VR128/VR64 and GR64 registers. +static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg) { + // SrcReg(VR128) -> DestReg(GR64) + // SrcReg(VR64) -> DestReg(GR64) + // SrcReg(GR64) -> DestReg(VR128) + // SrcReg(GR64) -> DestReg(VR64) + + if (X86::GR64RegClass.contains(DestReg)) { + if (X86::VR128RegClass.contains(SrcReg)) { + // Copy from a VR128 register to a GR64 register. + return X86::MOVPQIto64rr; + } else if (X86::VR64RegClass.contains(SrcReg)) { + // Copy from a VR64 register to a GR64 register. + return X86::MOVSDto64rr; + } + } else if (X86::GR64RegClass.contains(SrcReg)) { + // Copy from a GR64 register to a VR128 register. + if (X86::VR128RegClass.contains(DestReg)) + return X86::MOV64toPQIrr; + // Copy from a GR64 register to a VR64 register. 
+ else if (X86::VR64RegClass.contains(DestReg)) + return X86::MOV64toSDrr; + } + + return 0; +} + void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, unsigned DestReg, unsigned SrcReg, @@ -1915,6 +1896,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opc = X86::MOVAPSrr; else if (X86::VR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MMX_MOVQ64rr; + else + Opc = CopyToFromAsymmetricReg(DestReg, SrcReg); if (Opc) { BuildMI(MBB, MI, DL, get(Opc), DestReg) @@ -2046,6 +2029,8 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); + assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && + "Stack slot too small for store"); bool isAligned = (RI.getStackAlignment() >= 16) || RI.canRealignStack(MF); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); DebugLoc DL = MBB.findDebugLoc(MI); @@ -2130,8 +2115,9 @@ bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, CalleeFrameSize += SlotSize; BuildMI(MBB, MI, DL, get(Opc)).addReg(Reg, RegState::Kill); } else { + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), - &X86::VR128RegClass, &RI); + RC, &RI); } } @@ -2161,8 +2147,9 @@ bool X86InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (!X86::VR128RegClass.contains(Reg) && !isWin64) { BuildMI(MBB, MI, DL, get(Opc), Reg); } else { + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), - &X86::VR128RegClass, &RI); + RC, &RI); } } return true; @@ -2423,10 +2410,17 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = (*LoadMI->memoperands_begin())->getAlignment(); else switch (LoadMI->getOpcode()) { + case X86::AVX_SET0PSY: + case X86::AVX_SET0PDY: + Alignment = 32; + break; case X86::V_SET0PS: case X86::V_SET0PD: case X86::V_SET0PI: case X86::V_SETALLONES: + case X86::AVX_SET0PS: + case X86::AVX_SET0PD: + case X86::AVX_SET0PI: Alignment = 16; break; case X86::FsFLD0SD: @@ -2453,12 +2447,22 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, } else if (Ops.size() != 1) return NULL; + // Make sure the subregisters match. + // Otherwise we risk changing the size of the load. + if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg()) + return NULL; + SmallVector<MachineOperand,X86::AddrNumOperands> MOs; switch (LoadMI->getOpcode()) { case X86::V_SET0PS: case X86::V_SET0PD: case X86::V_SET0PI: case X86::V_SETALLONES: + case X86::AVX_SET0PS: + case X86::AVX_SET0PD: + case X86::AVX_SET0PI: + case X86::AVX_SET0PSY: + case X86::AVX_SET0PDY: case X86::FsFLD0SD: case X86::FsFLD0SS: { // Folding a V_SET0P? or V_SETALLONES as a load, to ease register pressure. @@ -2485,10 +2489,13 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Create a constant-pool entry. 
MachineConstantPool &MCP = *MF.getConstantPool(); const Type *Ty; - if (LoadMI->getOpcode() == X86::FsFLD0SS) + unsigned Opc = LoadMI->getOpcode(); + if (Opc == X86::FsFLD0SS) Ty = Type::getFloatTy(MF.getFunction()->getContext()); - else if (LoadMI->getOpcode() == X86::FsFLD0SD) + else if (Opc == X86::FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction()->getContext()); + else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY) + Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8); else Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); const Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ? @@ -2991,561 +2998,6 @@ bool X86InstrInfo::isX86_64ExtendedReg(unsigned RegNo) { return false; } - -/// determineREX - Determine if the MachineInstr has to be encoded with a X86-64 -/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand -/// size, and 3) use of X86-64 extended registers. -unsigned X86InstrInfo::determineREX(const MachineInstr &MI) { - unsigned REX = 0; - const TargetInstrDesc &Desc = MI.getDesc(); - - // Pseudo instructions do not need REX prefix byte. - if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo) - return 0; - if (Desc.TSFlags & X86II::REX_W) - REX |= 1 << 3; - - unsigned NumOps = Desc.getNumOperands(); - if (NumOps) { - bool isTwoAddr = NumOps > 1 && - Desc.getOperandConstraint(1, TOI::TIED_TO) != -1; - - // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. - unsigned i = isTwoAddr ? 1 : 0; - for (unsigned e = NumOps; i != e; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (MO.isReg()) { - unsigned Reg = MO.getReg(); - if (isX86_64NonExtLowByteReg(Reg)) - REX |= 0x40; - } - } - - switch (Desc.TSFlags & X86II::FormMask) { - case X86II::MRMInitReg: - if (isX86_64ExtendedReg(MI.getOperand(0))) - REX |= (1 << 0) | (1 << 2); - break; - case X86II::MRMSrcReg: { - if (isX86_64ExtendedReg(MI.getOperand(0))) - REX |= 1 << 2; - i = isTwoAddr ? 2 : 1; - for (unsigned e = NumOps; i != e; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (isX86_64ExtendedReg(MO)) - REX |= 1 << 0; - } - break; - } - case X86II::MRMSrcMem: { - if (isX86_64ExtendedReg(MI.getOperand(0))) - REX |= 1 << 2; - unsigned Bit = 0; - i = isTwoAddr ? 2 : 1; - for (; i != NumOps; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (MO.isReg()) { - if (isX86_64ExtendedReg(MO)) - REX |= 1 << Bit; - Bit++; - } - } - break; - } - case X86II::MRM0m: case X86II::MRM1m: - case X86II::MRM2m: case X86II::MRM3m: - case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: - case X86II::MRMDestMem: { - unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands); - i = isTwoAddr ? 1 : 0; - if (NumOps > e && isX86_64ExtendedReg(MI.getOperand(e))) - REX |= 1 << 2; - unsigned Bit = 0; - for (; i != e; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (MO.isReg()) { - if (isX86_64ExtendedReg(MO)) - REX |= 1 << Bit; - Bit++; - } - } - break; - } - default: { - if (isX86_64ExtendedReg(MI.getOperand(0))) - REX |= 1 << 0; - i = isTwoAddr ? 
2 : 1; - for (unsigned e = NumOps; i != e; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (isX86_64ExtendedReg(MO)) - REX |= 1 << 2; - } - break; - } - } - } - return REX; -} - -/// sizePCRelativeBlockAddress - This method returns the size of a PC -/// relative block address instruction -/// -static unsigned sizePCRelativeBlockAddress() { - return 4; -} - -/// sizeGlobalAddress - Give the size of the emission of this global address -/// -static unsigned sizeGlobalAddress(bool dword) { - return dword ? 8 : 4; -} - -/// sizeConstPoolAddress - Give the size of the emission of this constant -/// pool address -/// -static unsigned sizeConstPoolAddress(bool dword) { - return dword ? 8 : 4; -} - -/// sizeExternalSymbolAddress - Give the size of the emission of this external -/// symbol -/// -static unsigned sizeExternalSymbolAddress(bool dword) { - return dword ? 8 : 4; -} - -/// sizeJumpTableAddress - Give the size of the emission of this jump -/// table address -/// -static unsigned sizeJumpTableAddress(bool dword) { - return dword ? 8 : 4; -} - -static unsigned sizeConstant(unsigned Size) { - return Size; -} - -static unsigned sizeRegModRMByte(){ - return 1; -} - -static unsigned sizeSIBByte(){ - return 1; -} - -static unsigned getDisplacementFieldSize(const MachineOperand *RelocOp) { - unsigned FinalSize = 0; - // If this is a simple integer displacement that doesn't require a relocation. - if (!RelocOp) { - FinalSize += sizeConstant(4); - return FinalSize; - } - - // Otherwise, this is something that requires a relocation. - if (RelocOp->isGlobal()) { - FinalSize += sizeGlobalAddress(false); - } else if (RelocOp->isCPI()) { - FinalSize += sizeConstPoolAddress(false); - } else if (RelocOp->isJTI()) { - FinalSize += sizeJumpTableAddress(false); - } else { - llvm_unreachable("Unknown value to relocate!"); - } - return FinalSize; -} - -static unsigned getMemModRMByteSize(const MachineInstr &MI, unsigned Op, - bool IsPIC, bool Is64BitMode) { - const MachineOperand &Op3 = MI.getOperand(Op+3); - int DispVal = 0; - const MachineOperand *DispForReloc = 0; - unsigned FinalSize = 0; - - // Figure out what sort of displacement we have to handle here. - if (Op3.isGlobal()) { - DispForReloc = &Op3; - } else if (Op3.isCPI()) { - if (Is64BitMode || IsPIC) { - DispForReloc = &Op3; - } else { - DispVal = 1; - } - } else if (Op3.isJTI()) { - if (Is64BitMode || IsPIC) { - DispForReloc = &Op3; - } else { - DispVal = 1; - } - } else { - DispVal = 1; - } - - const MachineOperand &Base = MI.getOperand(Op); - const MachineOperand &IndexReg = MI.getOperand(Op+2); - - unsigned BaseReg = Base.getReg(); - - // Is a SIB byte needed? - if ((!Is64BitMode || DispForReloc || BaseReg != 0) && - IndexReg.getReg() == 0 && - (BaseReg == 0 || X86RegisterInfo::getX86RegNum(BaseReg) != N86::ESP)) { - if (BaseReg == 0) { // Just a displacement? - // Emit special case [disp32] encoding - ++FinalSize; - FinalSize += getDisplacementFieldSize(DispForReloc); - } else { - unsigned BaseRegNo = X86RegisterInfo::getX86RegNum(BaseReg); - if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) { - // Emit simple indirect register encoding... [EAX] f.e. 
- ++FinalSize; - // Be pessimistic and assume it's a disp32, not a disp8 - } else { - // Emit the most general non-SIB encoding: [REG+disp32] - ++FinalSize; - FinalSize += getDisplacementFieldSize(DispForReloc); - } - } - - } else { // We need a SIB byte, so start by outputting the ModR/M byte first - assert(IndexReg.getReg() != X86::ESP && - IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); - - bool ForceDisp32 = false; - if (BaseReg == 0 || DispForReloc) { - // Emit the normal disp32 encoding. - ++FinalSize; - ForceDisp32 = true; - } else { - ++FinalSize; - } - - FinalSize += sizeSIBByte(); - - // Do we need to output a displacement? - if (DispVal != 0 || ForceDisp32) { - FinalSize += getDisplacementFieldSize(DispForReloc); - } - } - return FinalSize; -} - - -static unsigned GetInstSizeWithDesc(const MachineInstr &MI, - const TargetInstrDesc *Desc, - bool IsPIC, bool Is64BitMode) { - - unsigned Opcode = Desc->Opcode; - unsigned FinalSize = 0; - - // Emit the lock opcode prefix as needed. - if (Desc->TSFlags & X86II::LOCK) ++FinalSize; - - // Emit segment override opcode prefix as needed. - switch (Desc->TSFlags & X86II::SegOvrMask) { - case X86II::FS: - case X86II::GS: - ++FinalSize; - break; - default: llvm_unreachable("Invalid segment!"); - case 0: break; // No segment override! - } - - // Emit the repeat opcode prefix as needed. - if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) ++FinalSize; - - // Emit the operand size opcode prefix as needed. - if (Desc->TSFlags & X86II::OpSize) ++FinalSize; - - // Emit the address size opcode prefix as needed. - if (Desc->TSFlags & X86II::AdSize) ++FinalSize; - - bool Need0FPrefix = false; - switch (Desc->TSFlags & X86II::Op0Mask) { - case X86II::TB: // Two-byte opcode prefix - case X86II::T8: // 0F 38 - case X86II::TA: // 0F 3A - Need0FPrefix = true; - break; - case X86II::TF: // F2 0F 38 - ++FinalSize; - Need0FPrefix = true; - break; - case X86II::REP: break; // already handled. - case X86II::XS: // F3 0F - ++FinalSize; - Need0FPrefix = true; - break; - case X86II::XD: // F2 0F - ++FinalSize; - Need0FPrefix = true; - break; - case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB: - case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF: - ++FinalSize; - break; // Two-byte opcode prefix - default: llvm_unreachable("Invalid prefix!"); - case 0: break; // No prefix! - } - - if (Is64BitMode) { - // REX prefix - unsigned REX = X86InstrInfo::determineREX(MI); - if (REX) - ++FinalSize; - } - - // 0x0F escape code must be emitted just before the opcode. - if (Need0FPrefix) - ++FinalSize; - - switch (Desc->TSFlags & X86II::Op0Mask) { - case X86II::T8: // 0F 38 - ++FinalSize; - break; - case X86II::TA: // 0F 3A - ++FinalSize; - break; - case X86II::TF: // F2 0F 38 - ++FinalSize; - break; - } - - // If this is a two-address instruction, skip one of the register operands. - unsigned NumOps = Desc->getNumOperands(); - unsigned CurOp = 0; - if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1) - CurOp++; - else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0) - // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32 - --NumOps; - - switch (Desc->TSFlags & X86II::FormMask) { - default: llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!"); - case X86II::Pseudo: - // Remember the current PC offset, this is the PIC relocation - // base address. 
- switch (Opcode) { - default: - break; - case TargetOpcode::INLINEASM: { - const MachineFunction *MF = MI.getParent()->getParent(); - const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo(); - FinalSize += TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(), - *MF->getTarget().getMCAsmInfo()); - break; - } - case TargetOpcode::DBG_LABEL: - case TargetOpcode::EH_LABEL: - case TargetOpcode::DBG_VALUE: - break; - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - case X86::FP_REG_KILL: - break; - case X86::MOVPC32r: { - // This emits the "call" portion of this pseudo instruction. - ++FinalSize; - FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags)); - break; - } - } - CurOp = NumOps; - break; - case X86II::RawFrm: - ++FinalSize; - - if (CurOp != NumOps) { - const MachineOperand &MO = MI.getOperand(CurOp++); - if (MO.isMBB()) { - FinalSize += sizePCRelativeBlockAddress(); - } else if (MO.isGlobal()) { - FinalSize += sizeGlobalAddress(false); - } else if (MO.isSymbol()) { - FinalSize += sizeExternalSymbolAddress(false); - } else if (MO.isImm()) { - FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags)); - } else { - llvm_unreachable("Unknown RawFrm operand!"); - } - } - break; - - case X86II::AddRegFrm: - ++FinalSize; - ++CurOp; - - if (CurOp != NumOps) { - const MachineOperand &MO1 = MI.getOperand(CurOp++); - unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); - if (MO1.isImm()) - FinalSize += sizeConstant(Size); - else { - bool dword = false; - if (Opcode == X86::MOV64ri) - dword = true; - if (MO1.isGlobal()) { - FinalSize += sizeGlobalAddress(dword); - } else if (MO1.isSymbol()) - FinalSize += sizeExternalSymbolAddress(dword); - else if (MO1.isCPI()) - FinalSize += sizeConstPoolAddress(dword); - else if (MO1.isJTI()) - FinalSize += sizeJumpTableAddress(dword); - } - } - break; - - case X86II::MRMDestReg: { - ++FinalSize; - FinalSize += sizeRegModRMByte(); - CurOp += 2; - if (CurOp != NumOps) { - ++CurOp; - FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags)); - } - break; - } - case X86II::MRMDestMem: { - ++FinalSize; - FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode); - CurOp += X86::AddrNumOperands + 1; - if (CurOp != NumOps) { - ++CurOp; - FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags)); - } - break; - } - - case X86II::MRMSrcReg: - ++FinalSize; - FinalSize += sizeRegModRMByte(); - CurOp += 2; - if (CurOp != NumOps) { - ++CurOp; - FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags)); - } - break; - - case X86II::MRMSrcMem: { - ++FinalSize; - FinalSize += getMemModRMByteSize(MI, CurOp+1, IsPIC, Is64BitMode); - CurOp += X86::AddrNumOperands + 1; - if (CurOp != NumOps) { - ++CurOp; - FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags)); - } - break; - } - - case X86II::MRM0r: case X86II::MRM1r: - case X86II::MRM2r: case X86II::MRM3r: - case X86II::MRM4r: case X86II::MRM5r: - case X86II::MRM6r: case X86II::MRM7r: - ++FinalSize; - if (Desc->getOpcode() == X86::LFENCE || - Desc->getOpcode() == X86::MFENCE) { - // Special handling of lfence and mfence; - FinalSize += sizeRegModRMByte(); - } else if (Desc->getOpcode() == X86::MONITOR || - Desc->getOpcode() == X86::MWAIT) { - // Special handling of monitor and mwait. - FinalSize += sizeRegModRMByte() + 1; // +1 for the opcode. 
- } else { - ++CurOp; - FinalSize += sizeRegModRMByte(); - } - - if (CurOp != NumOps) { - const MachineOperand &MO1 = MI.getOperand(CurOp++); - unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); - if (MO1.isImm()) - FinalSize += sizeConstant(Size); - else { - bool dword = false; - if (Opcode == X86::MOV64ri32) - dword = true; - if (MO1.isGlobal()) { - FinalSize += sizeGlobalAddress(dword); - } else if (MO1.isSymbol()) - FinalSize += sizeExternalSymbolAddress(dword); - else if (MO1.isCPI()) - FinalSize += sizeConstPoolAddress(dword); - else if (MO1.isJTI()) - FinalSize += sizeJumpTableAddress(dword); - } - } - break; - - case X86II::MRM0m: case X86II::MRM1m: - case X86II::MRM2m: case X86II::MRM3m: - case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: { - - ++FinalSize; - FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode); - CurOp += X86::AddrNumOperands; - - if (CurOp != NumOps) { - const MachineOperand &MO = MI.getOperand(CurOp++); - unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); - if (MO.isImm()) - FinalSize += sizeConstant(Size); - else { - bool dword = false; - if (Opcode == X86::MOV64mi32) - dword = true; - if (MO.isGlobal()) { - FinalSize += sizeGlobalAddress(dword); - } else if (MO.isSymbol()) - FinalSize += sizeExternalSymbolAddress(dword); - else if (MO.isCPI()) - FinalSize += sizeConstPoolAddress(dword); - else if (MO.isJTI()) - FinalSize += sizeJumpTableAddress(dword); - } - } - break; - - case X86II::MRM_C1: - case X86II::MRM_C8: - case X86II::MRM_C9: - case X86II::MRM_E8: - case X86II::MRM_F0: - FinalSize += 2; - break; - } - - case X86II::MRMInitReg: - ++FinalSize; - // Duplicate register, used by things like MOV8r0 (aka xor reg,reg). - FinalSize += sizeRegModRMByte(); - ++CurOp; - break; - } - - if (!Desc->isVariadic() && CurOp != NumOps) { - std::string msg; - raw_string_ostream Msg(msg); - Msg << "Cannot determine size: " << MI; - report_fatal_error(Msg.str()); - } - - - return FinalSize; -} - - -unsigned X86InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const TargetInstrDesc &Desc = MI->getDesc(); - bool IsPIC = TM.getRelocationModel() == Reloc::PIC_; - bool Is64BitMode = TM.getSubtargetImpl()->is64Bit(); - unsigned Size = GetInstSizeWithDesc(*MI, &Desc, IsPIC, Is64BitMode); - if (Desc.getOpcode() == X86::MOVPC32r) - Size += GetInstSizeWithDesc(*MI, &get(X86::POP32r), IsPIC, Is64BitMode); - return Size; -} - /// getGlobalBaseReg - Return a virtual register initialized with the /// the global base register value. Output instructions required to /// initialize the register in the function entry block, if necessary. @@ -3573,7 +3025,7 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { // that we don't include here. We don't want to replace instructions selected // by intrinsics. 
static const unsigned ReplaceableInstrs[][3] = { - //PackedInt PackedSingle PackedDouble + //PackedSingle PackedDouble PackedInt { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr }, { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm }, { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, @@ -3589,6 +3041,22 @@ static const unsigned ReplaceableInstrs[][3] = { { X86::V_SET0PS, X86::V_SET0PD, X86::V_SET0PI }, { X86::XORPSrm, X86::XORPDrm, X86::PXORrm }, { X86::XORPSrr, X86::XORPDrr, X86::PXORrr }, + // AVX 128-bit support + { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr }, + { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm }, + { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr }, + { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr }, + { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm }, + { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, + { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, + { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, + { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm }, + { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr }, + { X86::VORPSrm, X86::VORPDrm, X86::VPORrm }, + { X86::VORPSrr, X86::VORPDrr, X86::VPORrr }, + { X86::AVX_SET0PS, X86::AVX_SET0PD, X86::AVX_SET0PI }, + { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm }, + { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr }, }; // FIXME: Some shuffle and unpack instructions have equivalents in different @@ -3627,7 +3095,7 @@ namespace { /// global base register for x86-32. struct CGBR : public MachineFunctionPass { static char ID; - CGBR() : MachineFunctionPass(&ID) {} + CGBR() : MachineFunctionPass(ID) {} virtual bool runOnMachineFunction(MachineFunction &MF) { const X86TargetMachine *TM = diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index ad0217adb4758..f33620641e88c 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -311,6 +311,12 @@ namespace X86II { MRM_F0 = 40, MRM_F8 = 41, MRM_F9 = 42, + + /// RawFrmImm16 - This is used for CALL FAR instructions, which have two + /// immediates, the first of which is a 16 or 32-bit immediate (specified by + /// the imm encoding) and the second is a 16-bit fixed value. In the AMD + /// manual, this operand is described as pntr16:32 and pntr16:16 + RawFrmImm16 = 43, FormMask = 63, @@ -439,27 +445,27 @@ namespace X86II { //===------------------------------------------------------------------===// // VEX - The opcode prefix used by AVX instructions - VEX = 1ULL << 32, + VEX = 1U << 0, // VEX_W - Has a opcode specific functionality, but is used in the same // way as REX_W is for regular SSE instructions. - VEX_W = 1ULL << 33, + VEX_W = 1U << 1, // VEX_4V - Used to specify an additional AVX/SSE register. Several 2 // address instructions in SSE are represented as 3 address ones in AVX // and the additional register is encoded in VEX_VVVV prefix. - VEX_4V = 1ULL << 34, + VEX_4V = 1U << 2, // VEX_I8IMM - Specifies that the last register used in a AVX instruction, // must be encoded in the i8 immediate field. This usually happens in // instructions with 4 operands. - VEX_I8IMM = 1ULL << 35, + VEX_I8IMM = 1U << 3, // VEX_L - Stands for a bit in the VEX opcode prefix meaning the current // instruction uses 256-bit wide registers. This is usually auto detected if // a VR256 register is used, but some AVX instructions also have this field // marked when using a f256 memory references. 
- VEX_L = 1ULL << 36 + VEX_L = 1U << 4 }; // getBaseOpcodeFor - This function returns the "base" X86 opcode for the @@ -522,11 +528,12 @@ namespace X86II { case X86II::AddRegFrm: case X86II::MRMDestReg: case X86II::MRMSrcReg: + case X86II::RawFrmImm16: return -1; case X86II::MRMDestMem: return 0; case X86II::MRMSrcMem: { - bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasVEX_4V = (TSFlags >> 32) & X86II::VEX_4V; unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). @@ -610,12 +617,6 @@ public: /// virtual const X86RegisterInfo &getRegisterInfo() const { return RI; } - /// Return true if the instruction is a register to register move and return - /// the source and dest operands and their sub-register indices by reference. - virtual bool isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - /// isCoalescableExtInstr - Return true if the instruction is a "coalescable" /// extension instruction. That is, it's like a copy where it's legal for the /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns @@ -826,16 +827,11 @@ public: if (!MO.isReg()) return false; return isX86_64ExtendedReg(MO.getReg()); } - static unsigned determineREX(const MachineInstr &MI); /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or /// higher) register? e.g. r8, xmm8, xmm13, etc. static bool isX86_64ExtendedReg(unsigned RegNo); - /// GetInstSize - Returns the size of the specified MachineInstr. - /// - virtual unsigned GetInstSizeInBytes(const MachineInstr *MI) const; - /// getGlobalBaseReg - Return a virtual register initialized with the /// the global base register value. Output instructions required to /// initialize the register in the function entry block, if necessary. diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 1efef5a80b1ba..09b7721a621d3 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -80,6 +80,21 @@ def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; +def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; +def SDT_X86MEMBARRIERNoSSE : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, + [SDNPHasChain]>; +def X86MemBarrierNoSSE : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIERNoSSE, + [SDNPHasChain]>; +def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; +def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; +def X86LFence : SDNode<"X86ISD::LFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; + + def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; @@ -222,7 +237,7 @@ def i16mem : X86MemOperand<"printi16mem">; def i32mem : X86MemOperand<"printi32mem">; def i64mem : X86MemOperand<"printi64mem">; def i128mem : X86MemOperand<"printi128mem">; -//def i256mem : X86MemOperand<"printi256mem">; +def i256mem : X86MemOperand<"printi256mem">; def f32mem : X86MemOperand<"printf32mem">; def f64mem : X86MemOperand<"printf64mem">; def f80mem : X86MemOperand<"printf80mem">; @@ -333,15 +348,21 @@ def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", // X86 Instruction Predicate Definitions. 
def HasCMov    : Predicate<"Subtarget->hasCMov()">;
def NoCMov     : Predicate<"!Subtarget->hasCMov()">;
-def HasMMX : Predicate<"Subtarget->hasMMX()">;
-def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
-def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
-def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
-def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
-def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
-def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
-def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
+
+// FIXME: temporary hack to let codegen assert or generate poor code in case
+// no AVX version of the desired instructions is present; this is better for
+// incremental dev (without fallbacks it's easier to spot what's missing)
+def HasMMX : Predicate<"Subtarget->hasMMX() && !Subtarget->hasAVX()">;
+def HasSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
+def HasSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
+def HasSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
+def HasSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
+def HasSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
+def HasSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
+def HasSSE4A : Predicate<"Subtarget->hasSSE4A() && !Subtarget->hasAVX()">;
+
def HasAVX : Predicate<"Subtarget->hasAVX()">;
+def HasCLMUL : Predicate<"Subtarget->hasCLMUL()">;
def HasFMA3 : Predicate<"Subtarget->hasFMA3()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
@@ -393,9 +414,7 @@ def X86_COND_O : PatLeaf<(i8 13)>;
def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
def X86_COND_S : PatLeaf<(i8 15)>;

-def immSext8 : PatLeaf<(imm), [{
-  return N->getSExtValue() == (int8_t)N->getSExtValue();
-}]>;
+def immSext8 : PatLeaf<(imm), [{ return immSext8(N); }]>;

def i16immSExt8 : PatLeaf<(i16 immSext8)>;
def i32immSExt8 : PatLeaf<(i32 immSext8)>;
@@ -559,9 +578,10 @@ def VASTART_SAVE_XMM_REGS : I<0, Pseudo,

// The main point of having separate instructions is the extra unmodelled
// effects (compared to ordinary calls), like the stack pointer change.
-def MINGW_ALLOCA : I<0, Pseudo, (outs), (ins), - "# dynamic stack allocation", - [(X86MingwAlloca)]>; +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in + def MINGW_ALLOCA : I<0, Pseudo, (outs), (ins), + "# dynamic stack allocation", + [(X86MingwAlloca)]>; } // Nop @@ -574,10 +594,14 @@ let neverHasSideEffects = 1 in { } // Trap -def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; -def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", []>; +let Uses = [EFLAGS] in { + def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; +} +def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", + [(int_x86_int (i8 3))]>; // FIXME: need to make sure that "int $3" matches int3 -def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", []>; +def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", + [(int_x86_int imm:$trap)]>; def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", []>, OpSize; def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l}", []>; @@ -650,16 +674,16 @@ let Uses = [ECX], isBranch = 1, isTerminator = 1 in // Indirect branches let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst", - [(brind GR32:$dst)]>; + [(brind GR32:$dst)]>, Requires<[In32BitMode]>; def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst", - [(brind (loadi32 addr:$dst))]>; + [(brind (loadi32 addr:$dst))]>, Requires<[In32BitMode]>; - def FARJMP16i : Iseg16<0xEA, RawFrm, (outs), - (ins i16imm:$seg, i16imm:$off), - "ljmp{w}\t$seg, $off", []>, OpSize; - def FARJMP32i : Iseg32<0xEA, RawFrm, (outs), - (ins i16imm:$seg, i32imm:$off), - "ljmp{l}\t$seg, $off", []>; + def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "ljmp{w}\t{$seg, $off|$off, $seg}", []>, OpSize; + def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "ljmp{l}\t{$seg, $off|$off, $seg}", []>; def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst), "ljmp{w}\t{*}$dst", []>, OpSize; @@ -670,9 +694,9 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { // Loop instructions -def LOOP : I<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>; -def LOOPE : I<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>; -def LOOPNE : I<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>; +def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>; +def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>; +def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>; //===----------------------------------------------------------------------===// // Call Instructions... 
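The ljmp rework above (and the lcall rework in the next hunk) reorders the ins list to match the hardware encoding that the new RawFrmImm16 form models: opcode, then the offset immediate, then a trailing fixed 16-bit segment selector. A standalone sketch of the ptr16:32 byte layout; this is illustrative only, not LLVM's emitter, and encodeFarJmp32 is a made-up helper:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Raw encoding of "ljmp $seg, $off" (ptr16:32 form, 32-bit mode):
    // opcode 0xEA, then the 32-bit offset, then the 16-bit selector.
    static std::vector<uint8_t> encodeFarJmp32(uint32_t Off, uint16_t Seg) {
      std::vector<uint8_t> Bytes;
      Bytes.push_back(0xEA);                       // JMP ptr16:32 opcode
      for (int i = 0; i < 4; ++i)                  // imm32 offset, little-endian
        Bytes.push_back(uint8_t(Off >> (8 * i)));
      for (int i = 0; i < 2; ++i)                  // trailing fixed imm16 segment
        Bytes.push_back(uint8_t(Seg >> (8 * i)));
      return Bytes;
    }

    int main() {
      for (uint8_t B : encodeFarJmp32(0x11223344, 0x0008))
        std::printf("%02x ", B);                   // ea 44 33 22 11 08 00
      std::printf("\n");
      return 0;
    }

The offset bytes come before the selector bytes, which is why the TableGen operand lists now put $off before $seg.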
@@ -695,12 +719,12 @@ let isCall = 1 in def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops), "call\t{*}$dst", [(X86call (loadi32 addr:$dst))]>; - def FARCALL16i : Iseg16<0x9A, RawFrm, (outs), - (ins i16imm:$seg, i16imm:$off), - "lcall{w}\t$seg, $off", []>, OpSize; - def FARCALL32i : Iseg32<0x9A, RawFrm, (outs), - (ins i16imm:$seg, i32imm:$off), - "lcall{l}\t$seg, $off", []>; + def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "lcall{w}\t{$seg, $off|$off, $seg}", []>, OpSize; + def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "lcall{l}\t{$seg, $off|$off, $seg}", []>; def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst), "lcall{w}\t{*}$dst", []>, OpSize; @@ -721,7 +745,8 @@ def ENTER : I<0xC8, RawFrm, (outs), (ins i16imm:$len, i8imm:$lvl), // Tail call stuff. -let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + isCodeGenOnly = 1 in let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, @@ -756,7 +781,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in // let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in def LEAVE : I<0xC9, RawFrm, - (outs), (ins), "leave", []>; + (outs), (ins), "leave", []>, Requires<[In32BitMode]>; def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "popcnt{w}\t{$src, $dst|$dst, $src}", []>, OpSize, XS; @@ -934,7 +959,7 @@ def SYSRET : I<0x07, RawFrm, def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", []>, TB; def SYSEXIT : I<0x35, RawFrm, - (outs), (ins), "sysexit", []>, TB; + (outs), (ins), "sysexit", []>, TB, Requires<[In32BitMode]>; def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>; @@ -1025,17 +1050,23 @@ def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), /// moffs8, moffs16 and moffs32 versions of moves. The immediate is a /// 32-bit offset from the PC. These are only valid in x86-32 mode. def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src), - "mov{b}\t{$src, %al|%al, $src}", []>; + "mov{b}\t{$src, %al|%al, $src}", []>, + Requires<[In32BitMode]>; def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src), - "mov{w}\t{$src, %ax|%ax, $src}", []>, OpSize; + "mov{w}\t{$src, %ax|%ax, $src}", []>, OpSize, + Requires<[In32BitMode]>; def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src), - "mov{l}\t{$src, %eax|%eax, $src}", []>; + "mov{l}\t{$src, %eax|%eax, $src}", []>, + Requires<[In32BitMode]>; def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins), - "mov{b}\t{%al, $dst|$dst, %al}", []>; + "mov{b}\t{%al, $dst|$dst, %al}", []>, + Requires<[In32BitMode]>; def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins), - "mov{w}\t{%ax, $dst|$dst, %ax}", []>, OpSize; + "mov{w}\t{%ax, $dst|$dst, %ax}", []>, OpSize, + Requires<[In32BitMode]>; def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins), - "mov{l}\t{%eax, $dst|$dst, %eax}", []>; + "mov{l}\t{%eax, $dst|$dst, %eax}", []>, + Requires<[In32BitMode]>; // Moves to and from segment registers def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src), @@ -1087,6 +1118,7 @@ def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), [(store GR32:$src, addr:$dst)]>; /// Versions of MOV32rr, MOV32rm, and MOV32mr for i32mem_TC and GR32_TC. 
+let isCodeGenOnly = 1 in { let neverHasSideEffects = 1 in def MOV32rr_TC : I<0x89, MRMDestReg, (outs GR32_TC:$dst), (ins GR32_TC:$src), "mov{l}\t{$src, $dst|$dst, $src}", []>; @@ -1101,10 +1133,12 @@ let mayStore = 1 in def MOV32mr_TC : I<0x89, MRMDestMem, (outs), (ins i32mem_TC:$dst, GR32_TC:$src), "mov{l}\t{$src, $dst|$dst, $src}", []>; +} // Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so // that they can be used for copying and storing h registers, which can't be // encoded when a REX prefix is present. +let isCodeGenOnly = 1 in { let neverHasSideEffects = 1 in def MOV8rr_NOREX : I<0x88, MRMDestReg, (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), @@ -1118,6 +1152,7 @@ let mayLoad = 1, def MOV8rm_NOREX : I<0x8A, MRMSrcMem, (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>; +} // Moves to and from debug registers def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src), @@ -1137,7 +1172,7 @@ def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src), // Extra precision multiplication -// AL is really implied by AX, by the registers in Defs must match the +// AL is really implied by AX, but the registers in Defs must match the // SDNode results (i8, i32). let Defs = [AL,EFLAGS,AX], Uses = [AL] in def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src", @@ -3895,6 +3930,20 @@ def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr), // Atomic support // +// Memory barriers + +// TODO: Get this to fold the constant into the instruction. +def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), + "lock\n\t" + "or{l}\t{$zero, $dst|$dst, $zero}", + []>, Requires<[In32BitMode]>, LOCK; + +let hasSideEffects = 1 in { +def Int_MemBarrier : I<0, Pseudo, (outs), (ins), + "#MEMBARRIER", + [(X86MemBarrier)]>, Requires<[HasSSE2]>; +} + // Atomic swap. These are just normal xchg instructions. But since a memory // operand is referenced, the atomicity is ensured. 
let Constraints = "$val = $dst" in { @@ -4928,6 +4977,12 @@ include "X86Instr64bit.td" include "X86InstrFragmentsSIMD.td" //===----------------------------------------------------------------------===// +// FMA - Fused Multiply-Add support (requires FMA) +//===----------------------------------------------------------------------===// + +include "X86InstrFMA.td" + +//===----------------------------------------------------------------------===// // XMM Floating point support (requires SSE / SSE2) //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 6cf7ac83620e8..11d4179534dce 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -164,7 +164,7 @@ let neverHasSideEffects = 1 in def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst), (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", []>; -def MMX_MOVFR642Qrr: SSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins FR64:$src), +def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", []>; def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index ebe161b46bdcb..f5466f83f5192 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -142,7 +142,7 @@ multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC, !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (!nameconcat<Intrinsic>("int_x86_sse", + [(set RC:$dst, (!nameconcat<Intrinsic>("int_x86_", !strconcat(SSEVer, !strconcat("_", !strconcat(OpcodeStr, FPSizeStr)))) RC:$src1, RC:$src2))], d>; @@ -150,7 +150,7 @@ multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC, !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (!nameconcat<Intrinsic>("int_x86_sse", + [(set RC:$dst, (!nameconcat<Intrinsic>("int_x86_", !strconcat(SSEVer, !strconcat("_", !strconcat(OpcodeStr, FPSizeStr)))) RC:$src1, (mem_frag addr:$src2)))], d>; @@ -256,10 +256,10 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), let isAsmParserOnly = 1 in { def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), "movss\t{$src, $dst|$dst, $src}", - [(store FR32:$src, addr:$dst)]>, XS, VEX_4V; + [(store FR32:$src, addr:$dst)]>, XS, VEX; def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", - [(store FR64:$src, addr:$dst)]>, XD, VEX_4V; + [(store FR64:$src, addr:$dst)]>, XD, VEX; } // Extract and store. 
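For the memory-barrier definitions added earlier in this file (OR32mrLocked and the SSE2-gated Int_MemBarrier pseudo), the two hardware idioms involved are mfence, which requires SSE2, and a locked read-modify-write of the stack slot at %esp, the classic full-barrier fallback on pre-SSE2 IA-32. A minimal sketch, assuming GCC-style inline asm on i386; the immediate-zero form is shown for brevity, while the TableGen def currently keeps the zero in a register, per its TODO:

    // i386-only sketch of the two fence strategies.
    static inline void fence_mfence() {
      __asm__ __volatile__("mfence" ::: "memory");          // needs SSE2
    }

    static inline void fence_locked_or() {
      // A locked no-op RMW on the top of the stack serializes memory
      // like a full barrier on IA-32, with no SSE2 requirement.
      __asm__ __volatile__("lock; orl $0, (%%esp)" ::: "memory", "cc");
    }

    int main() {
      fence_mfence();
      fence_locked_or();
      return 0;
    }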
@@ -340,6 +349,15 @@ def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movupd\t{$src, $dst|$dst, $src}",
                    [(store (v4f64 VR256:$src), addr:$dst)]>, VEX;
}
+
+def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>;
+def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
+          (VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+def : Pat<(int_x86_avx_loadu_pd_256 addr:$src), (VMOVUPDYrm addr:$src)>;
+def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
+          (VMOVUPDYmr addr:$dst, VR256:$src)>;
+
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
@@ -516,6 +525,14 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>;
}

+multiclass sse12_cvt_s_np<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                          X86MemOperand x86memop, string asm> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+              []>;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+              []>;
+}
+
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                         SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                         string asm, Domain d> {
@@ -526,35 +543,58 @@ multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
-                          SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
-                          string asm> {
+                          X86MemOperand x86memop, string asm> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
-              asm, []>;
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
-              (ins DstRC:$src1, x86memop:$src), asm, []>;
+              (ins DstRC:$src1, x86memop:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
}

let isAsmParserOnly = 1 in {
-defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
-                      "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX;
-defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
-                      "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX;
-defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
-                      "cvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}">, XS,
-                      VEX_4V;
-defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
-                      "cvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}">, XD,
-                      VEX_4V;
+defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+                      "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX;
+defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+                      "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
+                      VEX_W;
+defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+                      "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX;
+defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+                      "cvttsd2si\t{$src, $dst|$dst, $src}">, XD,
+                      VEX, VEX_W;
+
+// The assembler can recognize rr 64-bit instructions by seeing an rxx
+// register, but the same isn't true when only memory operands are used, so
+// provide separate assembly "l" and "q" forms to make the operand size
+// explicit where appropriate.
+defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">, XS, + VEX_4V; +defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, XS, + VEX_4V, VEX_W; +defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">, XD, + VEX_4V; +defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD, + VEX_4V; +defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD, + VEX_4V, VEX_W; } defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si\t{$src, $dst|$dst, $src}">, XS; +defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, + "cvttss2si{q}\t{$src, $dst|$dst, $src}">, XS, REX_W; defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, "cvttsd2si\t{$src, $dst|$dst, $src}">, XD; +defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, + "cvttsd2si{q}\t{$src, $dst|$dst, $src}">, XD, REX_W; defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, "cvtsi2ss\t{$src, $dst|$dst, $src}">, XS; +defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, + "cvtsi2ss{q}\t{$src, $dst|$dst, $src}">, XS, REX_W; defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, "cvtsi2sd\t{$src, $dst|$dst, $src}">, XD; +defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, + "cvtsi2sd{q}\t{$src, $dst|$dst, $src}">, XD, REX_W; // Conversion Instructions Intrinsics - Match intrinsics which expect MM // and/or XMM operand(s). @@ -570,10 +610,12 @@ multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm> { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, - [(set DstRC:$dst, (Int SrcRC:$src))]>; - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, - [(set DstRC:$dst, (Int (ld_frag addr:$src)))]>; + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int (ld_frag addr:$src)))]>; } multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, @@ -588,35 +630,79 @@ multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, - PatFrag ld_frag, string asm> { + PatFrag ld_frag, string asm, bit Is2Addr = 1> { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), - asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))]>; + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))]>; def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), - (ins DstRC:$src1, x86memop:$src2), asm, + (ins DstRC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>; } let isAsmParserOnly = 1 in { defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, - f32mem, load, "cvtss2si\t{$src, $dst|$dst, $src}">, XS, - 
VEX;
+                      f32mem, load, "cvtss2si">, XS, VEX;
+  defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
+                        int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">,
+                        XS, VEX, VEX_W;
  defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
-                      f128mem, load, "cvtsd2si\t{$src, $dst|$dst, $src}">, XD,
-                      VEX;
+                      f128mem, load, "cvtsd2si">, XD, VEX;
+  defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
+                        int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">,
+                        XD, VEX, VEX_W;
+
+  // FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_
+  // Get rid of this hack or rename the intrinsics; there are several
+  // instructions that only match with the intrinsic form, so why create
+  // duplicates just to let them be recognized by the assembler?
+  defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem,
+                        "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX;
+  defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem,
+                        "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W;
}

defm Int_CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
-                      f32mem, load, "cvtss2si\t{$src, $dst|$dst, $src}">, XS;
+                      f32mem, load, "cvtss2si">, XS;
+defm Int_CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+                      f32mem, load, "cvtss2si{q}">, XS, REX_W;
defm Int_CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
-                      f128mem, load, "cvtsd2si\t{$src, $dst|$dst, $src}">, XD;
+                      f128mem, load, "cvtsd2si">, XD;
+defm Int_CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
+                      f128mem, load, "cvtsd2si">, XD, REX_W;
+defm CVTSD2SI64 : sse12_cvt_s_np<0x2D, VR128, GR64, f64mem, "cvtsd2si{q}">, XD,
+                  REX_W;
+
+let isAsmParserOnly = 1 in {
+  defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+          int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V;
+  defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+          int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V,
+          VEX_W;
+  defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+          int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V;
+  defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+          int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD,
+          VEX_4V, VEX_W;
+}

let Constraints = "$src1 = $dst" in {
  defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                        int_x86_sse_cvtsi2ss, i32mem, loadi32,
-                        "cvtsi2ss\t{$src2, $dst|$dst, $src2}">, XS;
+                        "cvtsi2ss">, XS;
+  defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+                        int_x86_sse_cvtsi642ss, i64mem, loadi64,
+                        "cvtsi2ss{q}">, XS, REX_W;
  defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                        int_x86_sse2_cvtsi2sd, i32mem, loadi32,
-                        "cvtsi2ss\t{$src2, $dst|$dst, $src2}">, XD;
+                        "cvtsi2sd">, XD;
+  defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+                        int_x86_sse2_cvtsi642sd, i64mem, loadi64,
+                        "cvtsi2sd">, XD, REX_W;
}

// Instructions below don't have an AVX form.
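At the C level, the new 64-bit converter defs correspond roughly to the SSE scalar-conversion intrinsics that are only available on x86-64, where the REX.W-prefixed cvtsi2ss{q} and cvttss2si{q} forms are selected. A small sketch, assuming an x86-64 compiler with <xmmintrin.h>:

    #include <cstdint>
    #include <cstdio>
    #include <xmmintrin.h>   // _mm_cvtsi64_ss / _mm_cvttss_si64, x86-64 only

    int main() {
      int64_t Big = INT64_C(1) << 40;                     // too wide for a GR32
      __m128 V = _mm_cvtsi64_ss(_mm_setzero_ps(), Big);   // cvtsi2ssq
      int64_t Back = _mm_cvttss_si64(V);                  // cvttss2siq (truncating)
      std::printf("%lld -> %f -> %lld\n", (long long)Big,
                  (double)_mm_cvtss_f32(V), (long long)Back);
      return 0;
    }

The register forms are self-describing because a 64-bit GPR appears in the asm, but the memory forms need the explicit {q} suffix, which is what the comment in the earlier hunk is about.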
@@ -645,35 +731,48 @@ let Constraints = "$src1 = $dst" in {
/// SSE 1 Only

// Aliases for intrinsics
-let isAsmParserOnly = 1, Pattern = []<dag> in {
-defm Int_VCVTTSS2SI : sse12_cvt_sint_3addr<0x2C, VR128, GR32,
-                int_x86_sse_cvttss2si, f32mem, load,
-                "cvttss2si\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS;
-defm Int_VCVTTSD2SI : sse12_cvt_sint_3addr<0x2C, VR128, GR32,
-                int_x86_sse2_cvttsd2si, f128mem, load,
-                "cvttss2si\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD;
+let isAsmParserOnly = 1 in {
+defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+                                      f32mem, load, "cvttss2si">, XS, VEX;
+defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                      int_x86_sse_cvttss2si64, f32mem, load,
+                                      "cvttss2si">, XS, VEX, VEX_W;
+defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+                                      f128mem, load, "cvttss2si">, XD, VEX;
+defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                      int_x86_sse2_cvttsd2si64, f128mem, load,
+                                      "cvttss2si">, XD, VEX, VEX_W;
}

defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
-                      f32mem, load, "cvttss2si\t{$src, $dst|$dst, $src}">,
-                      XS;
+                      f32mem, load, "cvttss2si">, XS;
+defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                      int_x86_sse_cvttss2si64, f32mem, load,
+                      "cvttss2si{q}">, XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
-                      f128mem, load, "cvttss2si\t{$src, $dst|$dst, $src}">,
-                      XD;
+                      f128mem, load, "cvttss2si">, XD;
+defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                      int_x86_sse2_cvttsd2si64, f128mem, load,
+                      "cvttss2si{q}">, XD, REX_W;

let isAsmParserOnly = 1, Pattern = []<dag> in {
-defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load,
-                      "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, VEX;
-defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, f128mem, load,
-                      "cvtdq2ps\t{$src, $dst|$dst, $src}",
-                      SSEPackedSingle>, TB, VEX;
-defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, undef, f256mem, load,
-                      "cvtdq2ps\t{$src, $dst|$dst, $src}",
-                      SSEPackedSingle>, TB, VEX;
+defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load,
+                      "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, VEX;
+defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load,
+                      "cvtss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
+                      VEX_W;
+defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load,
+                      "cvtdq2ps\t{$src, $dst|$dst, $src}",
+                      SSEPackedSingle>, TB, VEX;
+defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, undef, i256mem, load,
+                      "cvtdq2ps\t{$src, $dst|$dst, $src}",
+                      SSEPackedSingle>, TB, VEX;
}
let Pattern = []<dag> in {
defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/,
                          "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS;
-defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, f128mem, load /*dummy*/,
+defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load /*dummy*/,
+                          "cvtss2si{q}\t{$src, $dst|$dst, $src}">, XS, REX_W;
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle>, TB; /* PD SSE3 form is available */
}
@@ -701,13 +800,11 @@ def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),

let isAsmParserOnly = 1 in
defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
-                    int_x86_sse2_cvtsd2ss, f64mem, load,
-                    "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
-                    XS, VEX_4V;
+                    int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss", 0>,
+                    XS, VEX_4V;
let Constraints = "$src1 = $dst" in
defm
Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, - int_x86_sse2_cvtsd2ss, f64mem, load, - "cvtsd2ss\t{$src2, $dst|$dst, $src2}">, XS; + int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss">, XS; // Convert scalar single to scalar double let isAsmParserOnly = 1 in { // SSE2 instructions with XS prefix @@ -806,6 +903,7 @@ def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), (bitconvert (memopv2i64 addr:$src))))]>, XS, Requires<[HasSSE2]>; + // Convert packed single/double fp to doubleword let isAsmParserOnly = 1 in { def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -964,11 +1062,11 @@ def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), let isAsmParserOnly = 1 in { def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtps2pd\t{$src, $dst|$dst, $src}", + "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, VEX, Requires<[HasAVX]>; def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "cvtps2pd\t{$src, $dst|$dst, $src}", + "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd (load addr:$src)))]>, VEX, Requires<[HasAVX]>; @@ -1029,6 +1127,39 @@ def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtpd2ps (memop addr:$src)))]>; +// AVX 256-bit register conversion intrinsics +// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below +// whenever possible to avoid declaring two versions of each one. +def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), + (VCVTDQ2PSYrr VR256:$src)>; +def : Pat<(int_x86_avx_cvtdq2_ps_256 (memopv8i32 addr:$src)), + (VCVTDQ2PSYrm addr:$src)>; + +def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src), + (VCVTPD2PSYrr VR256:$src)>; +def : Pat<(int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)), + (VCVTPD2PSYrm addr:$src)>; + +def : Pat<(int_x86_avx_cvt_ps2dq_256 VR256:$src), + (VCVTPS2DQYrr VR256:$src)>; +def : Pat<(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)), + (VCVTPS2DQYrm addr:$src)>; + +def : Pat<(int_x86_avx_cvt_ps2_pd_256 VR128:$src), + (VCVTPS2PDYrr VR128:$src)>; +def : Pat<(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)), + (VCVTPS2PDYrm addr:$src)>; + +def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src), + (VCVTTPD2DQYrr VR256:$src)>; +def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)), + (VCVTTPD2DQYrm addr:$src)>; + +def : Pat<(int_x86_avx_cvtt_ps2dq_256 VR256:$src), + (VCVTTPS2DQYrr VR256:$src)>; +def : Pat<(int_x86_avx_cvtt_ps2dq_256 (memopv8f32 addr:$src)), + (VCVTTPS2DQYrm addr:$src)>; + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Compare Instructions //===----------------------------------------------------------------------===// @@ -1193,16 +1324,14 @@ let isAsmParserOnly = 1 in { "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", SSEPackedDouble>, OpSize, VEX_4V; - let Pattern = []<dag> in { - defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_sse_cmp_ps, - "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedSingle>, VEX_4V; - defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_sse2_cmp_pd, - "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - } + defm VCMPPSY : 
sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256, + "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedSingle>, VEX_4V; + defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256, + "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; } let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, @@ -1232,24 +1361,30 @@ def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, ValueType vt, string asm, PatFrag mem_frag, Domain d, bit IsConvertibleToThreeAddress = 0> { - def rmi : PIi8<0xC6, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, i8imm:$src3), asm, - [(set VR128:$dst, (vt (shufp:$src3 - VR128:$src1, (mem_frag addr:$src2))))], d>; + def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, f128mem:$src2, i8imm:$src3), asm, + [(set RC:$dst, (vt (shufp:$src3 + RC:$src1, (mem_frag addr:$src2))))], d>; let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in - def rri : PIi8<0xC6, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), asm, - [(set VR128:$dst, - (vt (shufp:$src3 VR128:$src1, VR128:$src2)))], d>; + def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$src3), asm, + [(set RC:$dst, + (vt (shufp:$src3 RC:$src1, RC:$src2)))], d>; } let isAsmParserOnly = 1 in { - defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, - "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - memopv4f32, SSEPackedSingle>, VEX_4V; - defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, - "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", - memopv2f64, SSEPackedDouble>, OpSize, VEX_4V; + defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + memopv4f32, SSEPackedSingle>, VEX_4V; + defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + memopv8f32, SSEPackedSingle>, VEX_4V; + defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", + memopv2f64, SSEPackedDouble>, OpSize, VEX_4V; + defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", + memopv4f64, SSEPackedDouble>, OpSize, VEX_4V; } let Constraints = "$src1 = $dst" in { @@ -1351,12 +1486,23 @@ let isAsmParserOnly = 1 in { defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", SSEPackedDouble>, OpSize, VEX; - // FIXME: merge with multiclass above when the intrinsics come. 
- def VMOVMSKPSYrr : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src), + defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, + "movmskps", SSEPackedSingle>, VEX; + defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, + "movmskpd", SSEPackedDouble>, OpSize, + VEX; + + // Assembler Only + def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; + def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, + VEX; + def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; - def VMOVMSKPDYrr : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src), + def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, - VEX; + VEX; } //===----------------------------------------------------------------------===// @@ -1536,6 +1682,9 @@ let isCommutable = 0 in /// /// These three forms can each be reg+reg or reg+mem. /// + +/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those +/// classes below multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, bit Is2Addr = 1> { defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), @@ -1565,7 +1714,7 @@ multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr, } multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, - bit Is2Addr = 1> { + bit Is2Addr = 1> { defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, Is2Addr>, XS; defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, @@ -1573,37 +1722,57 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, } multiclass basic_sse12_fp_binop_p_int<bits<8> opc, string OpcodeStr, - bit Is2Addr = 1> { + bit Is2Addr = 1> { defm PS : sse12_fp_packed_int<opc, OpcodeStr, VR128, - !strconcat(OpcodeStr, "ps"), "", "_ps", f128mem, memopv4f32, + !strconcat(OpcodeStr, "ps"), "sse", "_ps", f128mem, memopv4f32, SSEPackedSingle, Is2Addr>, TB; defm PD : sse12_fp_packed_int<opc, OpcodeStr, VR128, - !strconcat(OpcodeStr, "pd"), "2", "_pd", f128mem, memopv2f64, + !strconcat(OpcodeStr, "pd"), "sse2", "_pd", f128mem, memopv2f64, SSEPackedDouble, Is2Addr>, TB, OpSize; } +multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr> { + defm PSY : sse12_fp_packed_int<opc, OpcodeStr, VR256, + !strconcat(OpcodeStr, "ps"), "avx", "_ps_256", f256mem, memopv8f32, + SSEPackedSingle, 0>, TB; + + defm PDY : sse12_fp_packed_int<opc, OpcodeStr, VR256, + !strconcat(OpcodeStr, "pd"), "avx", "_pd_256", f256mem, memopv4f64, + SSEPackedDouble, 0>, TB, OpSize; +} + // Binary Arithmetic instructions let isAsmParserOnly = 1 in { defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_s_int<0x58, "add", 0>, basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_s_int<0x59, "mul", 0>, basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; let isCommutable = 0 in { defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, 
basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_s_int<0x5E, "div", 0>, basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_s_int<0x5F, "max", 0>, basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, - basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, VEX_4V; + basic_sse12_fp_binop_p_int<0x5F, "max", 0>, + basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, + basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V; defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_s_int<0x5D, "min", 0>, basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_p_int<0x5D, "min", 0>, + basic_sse12_fp_binop_p_y_int<0x5D, "min">, basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; } } @@ -1668,20 +1837,20 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr, SDNode OpNode, Intrinsic F32Int> { def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, XS, Requires<[HasAVX, OptForSize]>; - def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(!strconcat("v", OpcodeStr), - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, ssmem:$src2), - !strconcat(!strconcat("v", OpcodeStr), - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, + "ss\t{$src, $dst, $dst|$dst, $dst, $src}"), + [(set VR128:$dst, (F32Int VR128:$src))]>; + def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src), + !strconcat(OpcodeStr, + "ss\t{$src, $dst, $dst|$dst, $dst, $src}"), + [(set VR128:$dst, (F32Int sse_load_f32:$src))]>; } /// sse1_fp_unop_p - SSE1 unops in packed form. @@ -1715,6 +1884,16 @@ multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>; } +/// sse1_fp_unop_p_y_int - AVX 256-bit intrinsics unops in packed forms. +multiclass sse1_fp_unop_p_y_int<bits<8> opc, string OpcodeStr, + Intrinsic V4F32Int> { + def PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (V4F32Int VR256:$src))]>; + def PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (V4F32Int (memopv8f32 addr:$src)))]>; +} /// sse2_fp_unop_s - SSE2 unops in scalar form. multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, @@ -1738,21 +1917,19 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, /// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form. 
multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr, SDNode OpNode, Intrinsic F64Int> { - def SDr : VSDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - def SDm : VSDI<opc, MRMSrcMem, (outs FR64:$dst), - (ins FR64:$src1, f64mem:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; - def SDr_Int : VSDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>; - def SDm_Int : VSDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, sdmem:$src2), - !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>; + def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), + (ins FR64:$src1, f64mem:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "sd\t{$src, $dst, $dst|$dst, $dst, $src}"), + [(set VR128:$dst, (F64Int VR128:$src))]>; + def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src), + !strconcat(OpcodeStr, "sd\t{$src, $dst, $dst|$dst, $dst, $src}"), + [(set VR128:$dst, (F64Int sse_load_f64:$src))]>; } /// sse2_fp_unop_p - SSE2 unops in vector forms. @@ -1787,29 +1964,48 @@ multiclass sse2_fp_unop_p_int<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>; } +/// sse2_fp_unop_p_y_int - AVX 256-bit intrinsic unops in vector forms. +multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr, + Intrinsic V2F64Int> { + def PDYr_Int : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (V2F64Int VR256:$src))]>; + def PDYm_Int : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))]>; +} + let isAsmParserOnly = 1, Predicates = [HasAVX] in { // Square root. - defm VSQRT : sse1_fp_unop_s_avx<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss>, - sse2_fp_unop_s_avx<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd>, + defm VSQRT : sse1_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse_sqrt_ss>, + sse2_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse2_sqrt_sd>, VEX_4V; defm VSQRT : sse1_fp_unop_p<0x51, "vsqrt", fsqrt>, sse2_fp_unop_p<0x51, "vsqrt", fsqrt>, sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt>, sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt>, + sse1_fp_unop_p_int<0x51, "vsqrt", int_x86_sse_sqrt_ps>, + sse2_fp_unop_p_int<0x51, "vsqrt", int_x86_sse2_sqrt_pd>, + sse1_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_ps_256>, + sse2_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_pd_256>, VEX; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. 
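The refinement mentioned in the note above is conventionally a single Newton-Raphson step layered on the hardware estimate. A small C sketch (assuming SSE1 and <xmmintrin.h>; illustrative only, not part of this patch) of how the roughly 12-bit rsqrtps approximation is usually brought close to full single precision:

#include <xmmintrin.h>

/* x = rsqrtps(a) is only an estimate; one Newton-Raphson iteration
   x' = x * (1.5 - 0.5 * a * x * x) roughly doubles the accurate bits. */
static __m128 rsqrt_nr(__m128 a) {
  __m128 x   = _mm_rsqrt_ps(a);                  /* ~12-bit estimate */
  __m128 ax2 = _mm_mul_ps(_mm_mul_ps(a, x), x);  /* a * x * x */
  __m128 t   = _mm_sub_ps(_mm_set1_ps(1.5f),
                          _mm_mul_ps(_mm_set1_ps(0.5f), ax2));
  return _mm_mul_ps(x, t);
}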
- defm VRSQRT : sse1_fp_unop_s_avx<0x52, "rsqrt", X86frsqrt, + defm VRSQRT : sse1_fp_unop_s_avx<0x52, "vrsqrt", X86frsqrt, int_x86_sse_rsqrt_ss>, VEX_4V; defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt>, - sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt>, VEX; + sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt>, + sse1_fp_unop_p_y_int<0x52, "vrsqrt", int_x86_avx_rsqrt_ps_256>, + sse1_fp_unop_p_int<0x52, "vrsqrt", int_x86_sse_rsqrt_ps>, VEX; - defm VRCP : sse1_fp_unop_s_avx<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>, + defm VRCP : sse1_fp_unop_s_avx<0x53, "vrcp", X86frcp, int_x86_sse_rcp_ss>, VEX_4V; defm VRCP : sse1_fp_unop_p<0x53, "vrcp", X86frcp>, - sse1_fp_unop_p_y<0x53, "vrcp", X86frcp>, VEX; + sse1_fp_unop_p_y<0x53, "vrcp", X86frcp>, + sse1_fp_unop_p_y_int<0x53, "vrcp", int_x86_avx_rcp_ps_256>, + sse1_fp_unop_p_int<0x53, "vrcp", int_x86_sse_rcp_ps>, VEX; } // Square root. @@ -1898,6 +2094,13 @@ let isAsmParserOnly = 1 in { } } +def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; +def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src), + (VMOVNTPDYmr addr:$dst, VR256:$src)>; +def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src), + (VMOVNTPSYmr addr:$dst, VR256:$src)>; + def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>; @@ -1961,11 +2164,14 @@ def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src), // Load, store, and memory fence def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, TB, Requires<[HasSSE1]>; +def : Pat<(X86SFence), (SFENCE)>; // Alias instructions that map zero vector to pxor / xorp* for sse. // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. -// FIXME: Change encoding to pseudo! +// FIXME: Change encoding to pseudo! This is blocked right now by the x86 +// JIT implementation; it does not expand the instructions below like +// X86MCInstLower does. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isCodeGenOnly = 1 in { def V_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4f32 immAllZerosV))]>; @@ -1977,6 +2183,26 @@ def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllZerosV))]>; } +// The same as above, but for AVX. The 128-bit versions are the +// same, but re-encoded. The 256-bit form does not support a PI version. +// FIXME: Change encoding to pseudo! This is blocked right now by the x86 +// JIT implementation; it does not expand the instructions below like +// X86MCInstLower does. 
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isCodeGenOnly = 1, Predicates = [HasAVX] in { +def AVX_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4f32 immAllZerosV))]>, VEX_4V; +def AVX_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v2f64 immAllZerosV))]>, VEX_4V; +def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V; +def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V; +let ExeDomain = SSEPackedInt in +def AVX_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4i32 immAllZerosV))]>; +} + def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>; def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>; def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>; @@ -2003,35 +2229,47 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), //===---------------------------------------------------------------------===// // SSE2 - Move Aligned/Unaligned Packed Integer Instructions //===---------------------------------------------------------------------===// + let ExeDomain = SSEPackedInt in { // SSE integer instructions let isAsmParserOnly = 1 in { - let neverHasSideEffects = 1 in - def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; + let neverHasSideEffects = 1 in { + def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + } + def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; + def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; let canFoldAsLoad = 1, mayLoad = 1 in { - def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", - [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>, - VEX; - def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}", - [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, - XS, VEX, Requires<[HasAVX]>; + def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + let Predicates = [HasAVX] in { + def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; + def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; + } } let mayStore = 1 in { - def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", - [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>, VEX; - def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "vmovdqu\t{$src, $dst|$dst, $src}", - [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, - XS, VEX, Requires<[HasAVX]>; + def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i128mem:$dst, 
VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + let Predicates = [HasAVX] in { + def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; + def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; + } } } @@ -2084,6 +2322,10 @@ def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), } // ExeDomain = SSEPackedInt +def : Pat<(int_x86_avx_loadu_dq_256 addr:$src), (VMOVDQUYrm addr:$src)>; +def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src), + (VMOVDQUYmr addr:$dst, VR256:$src)>; + //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Arithmetic Instructions //===---------------------------------------------------------------------===// @@ -2376,6 +2618,25 @@ let ExeDomain = SSEPackedInt in { } } // Constraints = "$src1 = $dst" +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), + (v2i64 (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), + (v2i64 (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2), + (v2i64 (VPSLLDQri VR128:$src1, imm:$src2))>; + def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2), + (v2i64 (VPSRLDQri VR128:$src1, imm:$src2))>; + def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), + (v2f64 (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + + // Shift up / down and insert zeros. + def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))), + (v2i64 (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt)))>; + def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))), + (v2i64 (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>; +} + let Predicates = [HasSSE2] in { def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), (v2i64 (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; @@ -2662,11 +2923,16 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg, imm:$src2))]>; // Insert -let isAsmParserOnly = 1, Predicates = [HasAVX] in - defm PINSRW : sse2_pinsrw<0>, OpSize, VEX_4V; +let isAsmParserOnly = 1, Predicates = [HasAVX] in { + defm VPINSRW : sse2_pinsrw<0>, OpSize, VEX_4V; + def VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, OpSize, VEX_4V; +} let Constraints = "$src1 = $dst" in - defm VPINSRW : sse2_pinsrw, TB, OpSize; + defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[HasSSE2]>; } // ExeDomain = SSEPackedInt @@ -2676,10 +2942,13 @@ let Constraints = "$src1 = $dst" in let ExeDomain = SSEPackedInt in { -let isAsmParserOnly = 1 in -def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), +let isAsmParserOnly = 1 in { +def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>, VEX; +def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX; +} def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>; @@ -2939,18 +3208,20 @@ def : Pat<(v2i64 (X86vzmovl 
(bc_v2i64 (loadv4i32 addr:$src)))), // Instructions to match in the assembler let isAsmParserOnly = 1 in { -// This instructions is in fact an alias to movd with 64 bit dst def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; +// Recognize "movd" with GR64 destination, but encode as a "movq" +def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; } // Instructions for the disassembler // xr = XMM register // xm = mem64 -let isAsmParserOnly = 1 in +let isAsmParserOnly = 1, Predicates = [HasAVX] in def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS; def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -2970,19 +3241,14 @@ def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>; def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>; +def : Pat<(X86LFence), (LFENCE)>; +def : Pat<(X86MFence), (MFENCE)>; + // Pause. This "instruction" is encoded as "rep; nop", so even though it // was introduced with SSE2, it's backward compatible. def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP; -//TODO: custom lower this so as to never even generate the noop -def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), - (i8 0)), (NOOP)>; -def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; -def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; -def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), - (i8 1)), (MFENCE)>; - // Alias instructions that map zero vector to pxor / xorp* for sse. // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-ones value if folding it would be beneficial. 
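One aside on the zero and all-ones aliases just described: at the C level these pseudos correspond to the standard idioms below (a hedged sketch assuming <emmintrin.h>; with optimization enabled, compilers conventionally emit pxor for the zero and pcmpeqd for the all-ones instead of a constant-pool load):

#include <emmintrin.h>

/* All-zeros: conventionally materialized as pxor xmm, xmm. */
static __m128i all_zeros(void) { return _mm_setzero_si128(); }

/* All-ones: x == x is true in every lane, i.e. pcmpeqd xmm, xmm. */
static __m128i all_ones(void) {
  __m128i x = _mm_setzero_si128();
  return _mm_cmpeq_epi32(x, x);
}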
@@ -3027,13 +3293,13 @@ def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // Convert Packed DW Integers to Packed Double FP let isAsmParserOnly = 1, Predicates = [HasAVX] in { def VCVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTDQ2PDYrm : S3SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTDQ2PDYrr : S3SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), - "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; } def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), @@ -3041,6 +3307,17 @@ def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", []>; +// AVX 256-bit register conversion intrinsics +def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src), + (VCVTDQ2PDYrr VR128:$src)>; +def : Pat<(int_x86_avx_cvtdq2_pd_256 (memopv4i32 addr:$src)), + (VCVTDQ2PDYrm addr:$src)>; + +def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src), + (VCVTPD2DQYrr VR256:$src)>; +def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)), + (VCVTPD2DQYrm addr:$src)>; + //===---------------------------------------------------------------------===// // SSE3 - Move Instructions //===---------------------------------------------------------------------===// @@ -3057,9 +3334,20 @@ def rm : S3SI<op, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), (memopv4f32 addr:$src), (undef)))]>; } +multiclass sse3_replicate_sfp_y<bits<8> op, PatFrag rep_frag, + string OpcodeStr> { +def rr : S3SI<op, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; +def rm : S3SI<op, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; +} + let isAsmParserOnly = 1, Predicates = [HasAVX] in { -defm VMOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "vmovshdup">, VEX; -defm VMOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "vmovsldup">, VEX; + // FIXME: Merge above classes when we have patterns for the ymm version + defm VMOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "vmovshdup">, VEX; + defm VMOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "vmovsldup">, VEX; + defm VMOVSHDUPY : sse3_replicate_sfp_y<0x16, movshdup, "vmovshdup">, VEX; + defm VMOVSLDUPY : sse3_replicate_sfp_y<0x12, movsldup, "vmovsldup">, VEX; } defm MOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "movshdup">; defm MOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "movsldup">; @@ -3076,15 +3364,31 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), (undef))))]>; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in - defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; +multiclass sse3_replicate_dfp_y<string OpcodeStr> { +def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>; +def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>; +} + +let isAsmParserOnly = 1, Predicates = [HasAVX] 
in { + // FIXME: Merge above classes when we have patterns for the ymm version + defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; + defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX; +} defm MOVDDUP : sse3_replicate_dfp<"movddup">; // Move Unaligned Integer -let isAsmParserOnly = 1 in +let isAsmParserOnly = 1, Predicates = [HasAVX] in { def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vlddqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; + def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, VEX; +} def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "lddqu\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>; @@ -3125,35 +3429,39 @@ let AddedComplexity = 20 in // SSE3 - Arithmetic //===---------------------------------------------------------------------===// -multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, bit Is2Addr = 1> { +multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, bit Is2Addr = 1> { def rr : I<0xD0, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (Int VR128:$src1, - VR128:$src2))]>; + [(set RC:$dst, (Int RC:$src1, RC:$src2))]>; def rm : I<0xD0, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (Int VR128:$src1, - (memop addr:$src2)))]>; - + [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))]>; } let isAsmParserOnly = 1, Predicates = [HasAVX], ExeDomain = SSEPackedDouble in { - defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", 0>, XD, - VEX_4V; - defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", 0>, OpSize, - VEX_4V; + defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, + f128mem, 0>, XD, VEX_4V; + defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, + f128mem, 0>, OpSize, VEX_4V; + defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, + f256mem, 0>, XD, VEX_4V; + defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, + f256mem, 0>, OpSize, VEX_4V; } let Constraints = "$src1 = $dst", Predicates = [HasSSE3], ExeDomain = SSEPackedDouble in { - defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps">, XD; - defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd">, TB, OpSize; + defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, + f128mem>, XD; + defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, + f128mem>, TB, OpSize; } //===---------------------------------------------------------------------===// @@ -3161,61 +3469,72 @@ let Constraints = "$src1 = $dst", Predicates = [HasSSE3], //===---------------------------------------------------------------------===// // Horizontal ops -class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId, bit Is2Addr = 1> - : S3DI<o, MRMSrcReg, (outs 
VR128:$dst), (ins VR128:$src1, VR128:$src2), +multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, Intrinsic IntId, bit Is2Addr = 1> { + def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (v4f32 (IntId VR128:$src1, VR128:$src2)))]>; -class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId, bit Is2Addr = 1> - : S3DI<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), + [(set RC:$dst, (vt (IntId RC:$src1, RC:$src2)))]>; + + def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (memop addr:$src2))))]>; -class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId, bit Is2Addr = 1> - : S3I<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>; +} +multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, Intrinsic IntId, bit Is2Addr = 1> { + def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (v2f64 (IntId VR128:$src1, VR128:$src2)))]>; -class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId, bit Is2Addr = 1> - : S3I<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), + [(set RC:$dst, (vt (IntId RC:$src1, RC:$src2)))]>; + + def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (memopv2f64 addr:$src2))))]>; + [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>; +} let isAsmParserOnly = 1, Predicates = [HasAVX] in { - def VHADDPSrr : S3D_Intrr<0x7C, "vhaddps", int_x86_sse3_hadd_ps, 0>, VEX_4V; - def VHADDPSrm : S3D_Intrm<0x7C, "vhaddps", int_x86_sse3_hadd_ps, 0>, VEX_4V; - def VHADDPDrr : S3_Intrr <0x7C, "vhaddpd", int_x86_sse3_hadd_pd, 0>, VEX_4V; - def VHADDPDrm : S3_Intrm <0x7C, "vhaddpd", int_x86_sse3_hadd_pd, 0>, VEX_4V; - def VHSUBPSrr : S3D_Intrr<0x7D, "vhsubps", int_x86_sse3_hsub_ps, 0>, VEX_4V; - def VHSUBPSrm : S3D_Intrm<0x7D, "vhsubps", int_x86_sse3_hsub_ps, 0>, VEX_4V; - def VHSUBPDrr : S3_Intrr <0x7D, "vhsubpd", int_x86_sse3_hsub_pd, 0>, VEX_4V; - def VHSUBPDrm : S3_Intrm <0x7D, "vhsubpd", int_x86_sse3_hsub_pd, 0>, VEX_4V; + defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, + int_x86_sse3_hadd_ps, 0>, VEX_4V; + defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, + int_x86_sse3_hadd_pd, 0>, VEX_4V; + defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, + int_x86_sse3_hsub_ps, 0>, VEX_4V; + defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, + int_x86_sse3_hsub_pd, 0>, VEX_4V; + defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, + int_x86_avx_hadd_ps_256, 0>, VEX_4V; + defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, + int_x86_avx_hadd_pd_256, 0>, VEX_4V; + defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, + int_x86_avx_hsub_ps_256, 0>, 
VEX_4V; + defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, + int_x86_avx_hsub_pd_256, 0>, VEX_4V; } let Constraints = "$src1 = $dst" in { - def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>; - def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>; - def HADDPDrr : S3_Intrr <0x7C, "haddpd", int_x86_sse3_hadd_pd>; - def HADDPDrm : S3_Intrm <0x7C, "haddpd", int_x86_sse3_hadd_pd>; - def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>; - def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>; - def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>; - def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>; + defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, + int_x86_sse3_hadd_ps>; + defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, + int_x86_sse3_hadd_pd>; + defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, + int_x86_sse3_hsub_ps>; + defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, + int_x86_sse3_hsub_pd>; } //===---------------------------------------------------------------------===// // SSSE3 - Packed Absolute Instructions //===---------------------------------------------------------------------===// -/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. -multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, - PatFrag mem_frag64, PatFrag mem_frag128, - Intrinsic IntId64, Intrinsic IntId128> { +/// SS3I_unop_rm_int_mm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. +multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, + PatFrag mem_frag64, Intrinsic IntId64> { def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR64:$dst, (IntId64 VR64:$src))]>; @@ -3224,7 +3543,11 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR64:$dst, (IntId64 (bitconvert (mem_frag64 addr:$src))))]>; +} +/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 
+multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, + PatFrag mem_frag128, Intrinsic IntId128> { def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -3240,26 +3563,28 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, } let isAsmParserOnly = 1, Predicates = [HasAVX] in { - defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv8i8, memopv16i8, - int_x86_ssse3_pabs_b, + defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv16i8, int_x86_ssse3_pabs_b_128>, VEX; - defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", memopv4i16, memopv8i16, - int_x86_ssse3_pabs_w, + defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", memopv8i16, int_x86_ssse3_pabs_w_128>, VEX; - defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", memopv2i32, memopv4i32, - int_x86_ssse3_pabs_d, + defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", memopv4i32, int_x86_ssse3_pabs_d_128>, VEX; } -defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", memopv8i8, memopv16i8, - int_x86_ssse3_pabs_b, - int_x86_ssse3_pabs_b_128>; -defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", memopv4i16, memopv8i16, - int_x86_ssse3_pabs_w, - int_x86_ssse3_pabs_w_128>; -defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", memopv2i32, memopv4i32, - int_x86_ssse3_pabs_d, - int_x86_ssse3_pabs_d_128>; +defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", memopv16i8, + int_x86_ssse3_pabs_b_128>, + SS3I_unop_rm_int_mm<0x1C, "pabsb", memopv8i8, + int_x86_ssse3_pabs_b>; + +defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", memopv8i16, + int_x86_ssse3_pabs_w_128>, + SS3I_unop_rm_int_mm<0x1D, "pabsw", memopv4i16, + int_x86_ssse3_pabs_w>; + +defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", memopv4i32, + int_x86_ssse3_pabs_d_128>, + SS3I_unop_rm_int_mm<0x1E, "pabsd", memopv2i32, + int_x86_ssse3_pabs_d>; //===---------------------------------------------------------------------===// // SSSE3 - Packed Binary Operator Instructions @@ -3267,26 +3592,9 @@ defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", memopv2i32, memopv4i32, /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, - PatFrag mem_frag64, PatFrag mem_frag128, - Intrinsic IntId64, Intrinsic IntId128, + PatFrag mem_frag128, Intrinsic IntId128, bit Is2Addr = 1> { let isCommutable = 1 in - def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), - (ins VR64:$src1, VR64:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]>; - def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), - (ins VR64:$src1, i64mem:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR64:$dst, - (IntId64 VR64:$src1, - (bitconvert (memopv8i8 addr:$src2))))]>; - - let isCommutable = 1 in def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !if(Is2Addr, @@ -3303,88 +3611,102 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, (IntId128 VR128:$src1, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } +multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, + PatFrag mem_frag64, Intrinsic IntId64> { + let isCommutable = 1 in + def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]>; + def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, + (IntId64 VR64:$src1, + (bitconvert (memopv8i8 addr:$src2))))]>; +} let isAsmParserOnly = 1, Predicates = [HasAVX] in { let isCommutable = 0 in { - defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv4i16, memopv8i16, - int_x86_ssse3_phadd_w, + defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16, int_x86_ssse3_phadd_w_128, 0>, VEX_4V; - defm VPHADDD : SS3I_binop_rm_int<0x02, "vphaddd", memopv2i32, memopv4i32, - int_x86_ssse3_phadd_d, + defm VPHADDD : SS3I_binop_rm_int<0x02, "vphaddd", memopv4i32, int_x86_ssse3_phadd_d_128, 0>, VEX_4V; - defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", memopv4i16, memopv8i16, - int_x86_ssse3_phadd_sw, + defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", memopv8i16, int_x86_ssse3_phadd_sw_128, 0>, VEX_4V; - defm VPHSUBW : SS3I_binop_rm_int<0x05, "vphsubw", memopv4i16, memopv8i16, - int_x86_ssse3_phsub_w, + defm VPHSUBW : SS3I_binop_rm_int<0x05, "vphsubw", memopv8i16, int_x86_ssse3_phsub_w_128, 0>, VEX_4V; - defm VPHSUBD : SS3I_binop_rm_int<0x06, "vphsubd", memopv2i32, memopv4i32, - int_x86_ssse3_phsub_d, + defm VPHSUBD : SS3I_binop_rm_int<0x06, "vphsubd", memopv4i32, int_x86_ssse3_phsub_d_128, 0>, VEX_4V; - defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", memopv4i16, memopv8i16, - int_x86_ssse3_phsub_sw, + defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", memopv8i16, int_x86_ssse3_phsub_sw_128, 0>, VEX_4V; - defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", memopv8i8, memopv16i8, - int_x86_ssse3_pmadd_ub_sw, + defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", memopv16i8, int_x86_ssse3_pmadd_ub_sw_128, 0>, VEX_4V; - defm VPSHUFB : SS3I_binop_rm_int<0x00, "vpshufb", memopv8i8, memopv16i8, - int_x86_ssse3_pshuf_b, + defm VPSHUFB : SS3I_binop_rm_int<0x00, "vpshufb", memopv16i8, int_x86_ssse3_pshuf_b_128, 0>, VEX_4V; - defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", memopv8i8, memopv16i8, - int_x86_ssse3_psign_b, + defm VPSIGNB : SS3I_binop_rm_int<0x08, 
"vpsignb", memopv16i8, int_x86_ssse3_psign_b_128, 0>, VEX_4V; - defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", memopv4i16, memopv8i16, - int_x86_ssse3_psign_w, + defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", memopv8i16, int_x86_ssse3_psign_w_128, 0>, VEX_4V; - defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", memopv2i32, memopv4i32, - int_x86_ssse3_psign_d, + defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", memopv4i32, int_x86_ssse3_psign_d_128, 0>, VEX_4V; } -defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", memopv4i16, memopv8i16, - int_x86_ssse3_pmul_hr_sw, +defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", memopv8i16, int_x86_ssse3_pmul_hr_sw_128, 0>, VEX_4V; } // None of these have i8 immediate fields. let ImmT = NoImm, Constraints = "$src1 = $dst" in { let isCommutable = 0 in { - defm PHADDW : SS3I_binop_rm_int<0x01, "phaddw", memopv4i16, memopv8i16, - int_x86_ssse3_phadd_w, - int_x86_ssse3_phadd_w_128>; - defm PHADDD : SS3I_binop_rm_int<0x02, "phaddd", memopv2i32, memopv4i32, - int_x86_ssse3_phadd_d, - int_x86_ssse3_phadd_d_128>; - defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", memopv4i16, memopv8i16, - int_x86_ssse3_phadd_sw, - int_x86_ssse3_phadd_sw_128>; - defm PHSUBW : SS3I_binop_rm_int<0x05, "phsubw", memopv4i16, memopv8i16, - int_x86_ssse3_phsub_w, - int_x86_ssse3_phsub_w_128>; - defm PHSUBD : SS3I_binop_rm_int<0x06, "phsubd", memopv2i32, memopv4i32, - int_x86_ssse3_phsub_d, - int_x86_ssse3_phsub_d_128>; - defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", memopv4i16, memopv8i16, - int_x86_ssse3_phsub_sw, - int_x86_ssse3_phsub_sw_128>; - defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", memopv8i8, memopv16i8, - int_x86_ssse3_pmadd_ub_sw, - int_x86_ssse3_pmadd_ub_sw_128>; - defm PSHUFB : SS3I_binop_rm_int<0x00, "pshufb", memopv8i8, memopv16i8, - int_x86_ssse3_pshuf_b, - int_x86_ssse3_pshuf_b_128>; - defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", memopv8i8, memopv16i8, - int_x86_ssse3_psign_b, - int_x86_ssse3_psign_b_128>; - defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", memopv4i16, memopv8i16, - int_x86_ssse3_psign_w, - int_x86_ssse3_psign_w_128>; - defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", memopv2i32, memopv4i32, - int_x86_ssse3_psign_d, - int_x86_ssse3_psign_d_128>; -} -defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", memopv4i16, memopv8i16, - int_x86_ssse3_pmul_hr_sw, - int_x86_ssse3_pmul_hr_sw_128>; + defm PHADDW : SS3I_binop_rm_int<0x01, "phaddw", memopv8i16, + int_x86_ssse3_phadd_w_128>, + SS3I_binop_rm_int_mm<0x01, "phaddw", memopv4i16, + int_x86_ssse3_phadd_w>; + defm PHADDD : SS3I_binop_rm_int<0x02, "phaddd", memopv4i32, + int_x86_ssse3_phadd_d_128>, + SS3I_binop_rm_int_mm<0x02, "phaddd", memopv2i32, + int_x86_ssse3_phadd_d>; + defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", memopv8i16, + int_x86_ssse3_phadd_sw_128>, + SS3I_binop_rm_int_mm<0x03, "phaddsw", memopv4i16, + int_x86_ssse3_phadd_sw>; + defm PHSUBW : SS3I_binop_rm_int<0x05, "phsubw", memopv8i16, + int_x86_ssse3_phsub_w_128>, + SS3I_binop_rm_int_mm<0x05, "phsubw", memopv4i16, + int_x86_ssse3_phsub_w>; + defm PHSUBD : SS3I_binop_rm_int<0x06, "phsubd", memopv4i32, + int_x86_ssse3_phsub_d_128>, + SS3I_binop_rm_int_mm<0x06, "phsubd", memopv2i32, + int_x86_ssse3_phsub_d>; + defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", memopv8i16, + int_x86_ssse3_phsub_sw_128>, + SS3I_binop_rm_int_mm<0x07, "phsubsw", memopv4i16, + int_x86_ssse3_phsub_sw>; + defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", memopv16i8, + int_x86_ssse3_pmadd_ub_sw_128>, + SS3I_binop_rm_int_mm<0x04, 
"pmaddubsw", memopv8i8, + int_x86_ssse3_pmadd_ub_sw>; + defm PSHUFB : SS3I_binop_rm_int<0x00, "pshufb", memopv8i8, + int_x86_ssse3_pshuf_b_128>, + SS3I_binop_rm_int_mm<0x00, "pshufb", memopv8i8, + int_x86_ssse3_pshuf_b>; + defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", memopv16i8, + int_x86_ssse3_psign_b_128>, + SS3I_binop_rm_int_mm<0x08, "psignb", memopv8i8, + int_x86_ssse3_psign_b>; + defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", memopv8i16, + int_x86_ssse3_psign_w_128>, + SS3I_binop_rm_int_mm<0x09, "psignw", memopv4i16, + int_x86_ssse3_psign_w>; + defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", memopv4i32, + int_x86_ssse3_psign_d_128>, + SS3I_binop_rm_int_mm<0x0A, "psignd", memopv2i32, + int_x86_ssse3_psign_d>; +} +defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", memopv8i16, + int_x86_ssse3_pmul_hr_sw_128>, + SS3I_binop_rm_int_mm<0x0B, "pmulhrsw", memopv4i16, + int_x86_ssse3_pmul_hr_sw>; } def : Pat<(X86pshufb VR128:$src, VR128:$mask), @@ -3396,22 +3718,16 @@ def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))), // SSSE3 - Packed Align Instruction Patterns //===---------------------------------------------------------------------===// -multiclass sse3_palign<string asm, bit Is2Addr = 1> { +multiclass ssse3_palign_mm<string asm> { def R64rr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2, i8imm:$src3), - !if(Is2Addr, - !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - !strconcat(asm, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - []>; + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; def R64rm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2, i8imm:$src3), - !if(Is2Addr, - !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - !strconcat(asm, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - []>; + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; +} +multiclass ssse3_palign<string asm, bit Is2Addr = 1> { def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), !if(Is2Addr, @@ -3429,9 +3745,10 @@ multiclass sse3_palign<string asm, bit Is2Addr = 1> { } let isAsmParserOnly = 1, Predicates = [HasAVX] in - defm VPALIGN : sse3_palign<"vpalignr", 0>, VEX_4V; + defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V; let Constraints = "$src1 = $dst" in - defm PALIGN : sse3_palign<"palignr">; + defm PALIGN : ssse3_palign<"palignr">, + ssse3_palign_mm<"palignr">; let AddedComplexity = 5 in { @@ -3732,31 +4049,62 @@ def : Pat<(v2i32 (fp_to_sint (v2f64 VR128:$src))), (Int_CVTTPD2PIrr VR128:$src)>, Requires<[HasSSE2]>; // Use movaps / movups for SSE integer load / store (one byte shorter). 
-def : Pat<(alignedloadv4i32 addr:$src), - (MOVAPSrm addr:$src)>; -def : Pat<(loadv4i32 addr:$src), - (MOVUPSrm addr:$src)>; -def : Pat<(alignedloadv2i64 addr:$src), - (MOVAPSrm addr:$src)>; -def : Pat<(loadv2i64 addr:$src), - (MOVUPSrm addr:$src)>; - -def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, VR128:$src)>; -def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, VR128:$src)>; -def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, VR128:$src)>; -def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, VR128:$src)>; -def : Pat<(store (v2i64 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, VR128:$src)>; -def : Pat<(store (v4i32 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, VR128:$src)>; -def : Pat<(store (v8i16 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, VR128:$src)>; -def : Pat<(store (v16i8 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, VR128:$src)>; +let Predicates = [HasSSE1] in { + def : Pat<(alignedloadv4i32 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(loadv4i32 addr:$src), + (MOVUPSrm addr:$src)>; + def : Pat<(alignedloadv2i64 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(loadv2i64 addr:$src), + (MOVUPSrm addr:$src)>; + + def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v2i64 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; +} + +// Use vmovaps/vmovups for AVX 128-bit integer load/store (one byte shorter). 
+let Predicates = [HasAVX] in { + def : Pat<(alignedloadv4i32 addr:$src), + (VMOVAPSrm addr:$src)>; + def : Pat<(loadv4i32 addr:$src), + (VMOVUPSrm addr:$src)>; + def : Pat<(alignedloadv2i64 addr:$src), + (VMOVAPSrm addr:$src)>; + def : Pat<(loadv2i64 addr:$src), + (VMOVUPSrm addr:$src)>; + + def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v2i64 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; +} //===----------------------------------------------------------------------===// // SSE4.1 - Packed Move with Sign/Zero Extend @@ -3923,8 +4271,12 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; + def VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst), + (ins VR128:$src1, i32i8imm:$src2), + "vpextrb\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, OpSize, VEX; +} defm PEXTRB : SS41I_extract8<0x14, "pextrb">; @@ -4007,8 +4359,13 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; + def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), + (ins VR128:$src1, i32i8imm:$src2), + "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, OpSize, VEX; +} defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; // Also match an EXTRACTPS store when the store is done as f32 instead of i32. @@ -4131,80 +4488,84 @@ let isAsmParserOnly = 1, Predicates = [HasAVX] in defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), - (INSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>; + (VINSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>, + Requires<[HasAVX]>; +def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), + (INSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>, + Requires<[HasSSE41]>; //===----------------------------------------------------------------------===// // SSE4.1 - Round Instructions //===----------------------------------------------------------------------===// -multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, - string OpcodeStr, - Intrinsic V4F32Int, - Intrinsic V2F64Int> { +multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + PatFrag mem_frag32, PatFrag mem_frag64, + Intrinsic V4F32Int, Intrinsic V2F64Int> { // Intrinsic operation, reg. 
// Vector intrinsic operation, reg def PSr_Int : SS4AIi8<opcps, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]>, + [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>, OpSize; // Vector intrinsic operation, mem def PSm_Int : Ii8<opcps, MRMSrcMem, - (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins f256mem:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, - (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>, + [(set RC:$dst, + (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>, TA, OpSize, Requires<[HasSSE41]>; // Vector intrinsic operation, reg def PDr_Int : SS4AIi8<opcpd, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]>, + [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>, OpSize; // Vector intrinsic operation, mem def PDm_Int : SS4AIi8<opcpd, MRMSrcMem, - (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins f256mem:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, - (V2F64Int (memopv2f64 addr:$src1),imm:$src2))]>, + [(set RC:$dst, + (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>, OpSize; } -multiclass sse41_fp_unop_rm_avx<bits<8> opcps, bits<8> opcpd, - string OpcodeStr> { +multiclass sse41_fp_unop_rm_avx_p<bits<8> opcps, bits<8> opcpd, + RegisterClass RC, X86MemOperand x86memop, string OpcodeStr> { // Intrinsic operation, reg. // Vector intrinsic operation, reg def PSr : SS4AIi8<opcps, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, OpSize; // Vector intrinsic operation, mem def PSm : Ii8<opcps, MRMSrcMem, - (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, TA, OpSize, Requires<[HasSSE41]>; // Vector intrinsic operation, reg def PDr : SS4AIi8<opcpd, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, OpSize; // Vector intrinsic operation, mem def PDm : SS4AIi8<opcpd, MRMSrcMem, - (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, OpSize; @@ -4261,8 +4622,8 @@ multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, OpSize; } -multiclass sse41_fp_binop_rm_avx<bits<8> opcss, bits<8> opcsd, - string OpcodeStr> { +multiclass sse41_fp_binop_rm_avx_s<bits<8> opcss, bits<8> opcsd, + string OpcodeStr> { // Intrinsic operation, reg. 
def SSr : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), @@ -4295,24 +4656,90 @@ multiclass sse41_fp_binop_rm_avx_s<bits<8> opcss, bits<8> opcsd, // FP round - roundss, roundps, roundsd, roundpd let isAsmParserOnly = 1, Predicates = [HasAVX] in { // Intrinsic form - defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", - int_x86_sse41_round_ps, int_x86_sse41_round_pd>, - VEX; + defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, + memopv4f32, memopv2f64, + int_x86_sse41_round_ps, + int_x86_sse41_round_pd>, VEX; + defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256, + memopv8f32, memopv4f64, + int_x86_avx_round_ps_256, + int_x86_avx_round_pd_256>, VEX; defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", - int_x86_sse41_round_ss, int_x86_sse41_round_sd, - 0>, VEX_4V; + int_x86_sse41_round_ss, + int_x86_sse41_round_sd, 0>, VEX_4V; + // Instructions for the assembler - defm VROUND : sse41_fp_unop_rm_avx<0x08, 0x09, "vround">, VEX; - defm VROUND : sse41_fp_binop_rm_avx<0x0A, 0x0B, "vround">, VEX_4V; + defm VROUND : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR128, f128mem, "vround">, + VEX; + defm VROUNDY : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR256, f256mem, "vround">, + VEX; + defm VROUND : sse41_fp_binop_rm_avx_s<0x0A, 0x0B, "vround">, VEX_4V; } -defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", +defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, + memopv4f32, memopv2f64, int_x86_sse41_round_ps, int_x86_sse41_round_pd>; let Constraints = "$src1 = $dst" in defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", int_x86_sse41_round_ss, int_x86_sse41_round_sd>; //===----------------------------------------------------------------------===// +// SSE4.1 - Packed Bit Test +//===----------------------------------------------------------------------===// + +// The ptest instruction is lowered to this in X86ISelLowering, primarily from +// the Intel intrinsic that corresponds to it. 
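For reference, the source-level pattern that typically reaches these ptest definitions is the SSE4.1 testz intrinsic; a minimal C sketch (assuming <smmintrin.h>; the function name is illustrative only):

#include <smmintrin.h>

/* ptest sets ZF when (a & b) is all zero; _mm_testz_si128 returns that ZF,
   so testing a value against itself asks "is this vector entirely zero?". */
static int is_zero_vector(__m128i v) {
  return _mm_testz_si128(v, v);
}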
+let Defs = [EFLAGS], isAsmParserOnly = 1, Predicates = [HasAVX] in { +def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>, + OpSize, VEX; +def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS,(X86ptest VR128:$src1, (memopv4f32 addr:$src2)))]>, + OpSize, VEX; + +def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, + OpSize, VEX; +def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>, + OpSize, VEX; +} + +let Defs = [EFLAGS] in { +def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), + "ptest \t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>, + OpSize; +def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), + "ptest \t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (memopv4f32 addr:$src2)))]>, + OpSize; +} + +// The bit test instructions below are AVX only +multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> { + def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, OpSize, VEX; + def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>, + OpSize, VEX; +} + +let Defs = [EFLAGS], isAsmParserOnly = 1, Predicates = [HasAVX] in { +defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>; +defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>; +defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>; +defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>; +} + +//===----------------------------------------------------------------------===// // SSE4.1 - Misc Instructions //===----------------------------------------------------------------------===// @@ -4431,79 +4858,104 @@ let Constraints = "$src1 = $dst" in /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, bit Is2Addr = 1> { + Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1> { let isCommutable = 1 in - def rri : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i32i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set VR128:$dst, - (IntId128 VR128:$src1, VR128:$src2, imm:$src3))]>, + [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, OpSize; - def rmi : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i32i8imm:$src3), + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, 
i32i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2)), imm:$src3))]>, + [(set RC:$dst, + (IntId RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, OpSize; } let isAsmParserOnly = 1, Predicates = [HasAVX] in { let isCommutable = 0 in { defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - 0>, VEX_4V; + VR128, memopv16i8, i128mem, 0>, VEX_4V; defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - 0>, VEX_4V; + VR128, memopv16i8, i128mem, 0>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", + int_x86_avx_blend_ps_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", + int_x86_avx_blend_pd_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, - 0>, VEX_4V; + VR128, memopv16i8, i128mem, 0>, VEX_4V; defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, - 0>, VEX_4V; + VR128, memopv16i8, i128mem, 0>, VEX_4V; } defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, - 0>, VEX_4V; + VR128, memopv16i8, i128mem, 0>, VEX_4V; defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, - 0>, VEX_4V; + VR128, memopv16i8, i128mem, 0>, VEX_4V; + defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, + VR256, memopv32i8, i256mem, 0>, VEX_4V; } let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { - defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps>; - defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd>; - defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw>; - defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw>; + defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, + VR128, memopv16i8, i128mem>; + defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd, + VR128, memopv16i8, i128mem>; + defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, + VR128, memopv16i8, i128mem>; + defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, + VR128, memopv16i8, i128mem>; } - defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps>; - defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd>; + defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, + VR128, memopv16i8, i128mem>; + defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, + VR128, memopv16i8, i128mem>; } /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators let isAsmParserOnly = 1, Predicates = [HasAVX] in { - multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr> { - def rr : I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [], SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; - - def rm : I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, VR128:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [], SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; - } -} - -defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd">; -defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps">; -defm 
VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb">; +multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_frag, Intrinsic IntId> { + def rr : I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], + SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; + + def rm : I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), + RC:$src3))], + SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; +} +} + +defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem, + memopv16i8, int_x86_sse41_blendvpd>; +defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem, + memopv16i8, int_x86_sse41_blendvps>; +defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, + memopv16i8, int_x86_sse41_pblendvb>; +defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem, + memopv32i8, int_x86_avx_blendv_pd_256>; +defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem, + memopv32i8, int_x86_avx_blendv_ps_256>; /// SS41I_ternary_int - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { @@ -4529,30 +4981,6 @@ defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>; defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>; defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>; -// ptest instruction we'll lower to this in X86ISelLowering primarily from -// the intel intrinsic that corresponds to this. 
-let Defs = [EFLAGS], isAsmParserOnly = 1, Predicates = [HasAVX] in { -def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), - "vptest\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86ptest VR128:$src1, VR128:$src2))]>, - OpSize, VEX; -def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2), - "vptest\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86ptest VR128:$src1, (load addr:$src2)))]>, - OpSize, VEX; -} - -let Defs = [EFLAGS] in { -def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), - "ptest \t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86ptest VR128:$src1, VR128:$src2))]>, - OpSize; -def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2), - "ptest \t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86ptest VR128:$src1, (load addr:$src2)))]>, - OpSize; -} - let isAsmParserOnly = 1, Predicates = [HasAVX] in def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", @@ -4603,17 +5031,20 @@ def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), //===----------------------------------------------------------------------===// // Packed Compare Implicit Length Strings, Return Mask -let Defs = [EFLAGS], usesCustomInserter = 1 in { - def PCMPISTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), - "#PCMPISTRM128rr PSEUDO!", +multiclass pseudo_pcmpistrm<string asm> { + def REG : Ii8<0, Pseudo, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), !strconcat(asm, "rr PSEUDO"), [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, - imm:$src3))]>, OpSize; - def PCMPISTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - "#PCMPISTRM128rm PSEUDO!", + imm:$src3))]>; + def MEM : Ii8<0, Pseudo, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), !strconcat(asm, "rm PSEUDO"), [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 - VR128:$src1, (load addr:$src2), imm:$src3))]>, OpSize; + VR128:$src1, (load addr:$src2), imm:$src3))]>; +} + +let Defs = [EFLAGS], usesCustomInserter = 1 in { + defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[HasSSE42]>; + defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; } let Defs = [XMM0, EFLAGS], isAsmParserOnly = 1, @@ -4636,20 +5067,20 @@ let Defs = [XMM0, EFLAGS] in { } // Packed Compare Explicit Length Strings, Return Mask -let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { - def PCMPESTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), - "#PCMPESTRM128rr PSEUDO!", - [(set VR128:$dst, - (int_x86_sse42_pcmpestrm128 - VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>, OpSize; - - def PCMPESTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), - "#PCMPESTRM128rm PSEUDO!", +multiclass pseudo_pcmpestrm<string asm> { + def REG : Ii8<0, Pseudo, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), !strconcat(asm, "rr PSEUDO"), + [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 + VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; + def MEM : Ii8<0, Pseudo, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), !strconcat(asm, "rm PSEUDO"), [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 - VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>, - OpSize; + VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>; +} + 
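The string-comparison pseudos above are reached from the SSE4.2 intrinsics. A hedged usage sketch (the helper name is hypothetical; the intrinsic, header, and flag names are the standard SSE4.2 ones), which also shows why the explicit-length form lists EAX and EDX in Uses, since the two lengths travel in those registers:

#include <nmmintrin.h>

// Hypothetical helper: _mm_cmpestrm maps to int_x86_sse42_pcmpestrm128 and,
// through the pseudos above, to PCMPESTRM (VPCMPESTRM under AVX). la goes
// in EAX and lb in EDX; the mode must be a compile-time constant, matching
// the i8imm operand of the pseudo.
static __m128i equal_any_mask(__m128i needle, int la, __m128i haystack, int lb) {
  return _mm_cmpestrm(needle, la, haystack, lb,
                      _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
}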
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { + defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[HasSSE42]>; + defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; } let isAsmParserOnly = 1, Predicates = [HasAVX], @@ -4941,3 +5372,579 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, OpSize; + +//===----------------------------------------------------------------------===// +// CLMUL Instructions +//===----------------------------------------------------------------------===// + +// Only the AVX version of CLMUL instructions are described here. + +// Carry-less Multiplication instructions +let isAsmParserOnly = 1 in { +def VPCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>; + +def VPCLMULQDQrm : CLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>; + +// Assembler Only +multiclass avx_vpclmul<string asm> { + def rr : I<0, Pseudo, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; + + def rm : I<0, Pseudo, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; +} +defm VPCLMULHQHQDQ : avx_vpclmul<"vpclmulhqhqdq">; +defm VPCLMULHQLQDQ : avx_vpclmul<"vpclmulhqlqdq">; +defm VPCLMULLQHQDQ : avx_vpclmul<"vpclmullqhqdq">; +defm VPCLMULLQLQDQ : avx_vpclmul<"vpclmullqlqdq">; + +} // isAsmParserOnly + +//===----------------------------------------------------------------------===// +// AVX Instructions +//===----------------------------------------------------------------------===// + +let isAsmParserOnly = 1 in { + +// Load from memory and broadcast to all elements of the destination operand +class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int> : + AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (Int addr:$src))]>, VEX; + +def VBROADCASTSS : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, + int_x86_avx_vbroadcastss>; +def VBROADCASTSSY : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, + int_x86_avx_vbroadcastss_256>; +def VBROADCASTSD : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, + int_x86_avx_vbroadcast_sd_256>; +def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, + int_x86_avx_vbroadcastf128_pd_256>; + +// Insert packed floating-point values +def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR128:$src2, i8imm:$src3), + "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V; +def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f128mem:$src2, i8imm:$src3), + "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V; + +// Extract packed floating-point values +def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), + (ins VR256:$src1, i8imm:$src2), + "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, VEX; +def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), + (ins f128mem:$dst, VR256:$src1, i8imm:$src2), + "vextractf128\t{$src2, 
$src1, $dst|$dst, $src1, $src2}", + []>, VEX; + +// Conditional SIMD Packed Loads and Stores +multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, + Intrinsic IntLd, Intrinsic IntLd256, + Intrinsic IntSt, Intrinsic IntSt256, + PatFrag pf128, PatFrag pf256> { + def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, + VEX_4V; + def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, + VEX_4V; + def mr : AVX8I<opc_mr, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V; + def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V; +} + +defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", + int_x86_avx_maskload_ps, + int_x86_avx_maskload_ps_256, + int_x86_avx_maskstore_ps, + int_x86_avx_maskstore_ps_256, + memopv4f32, memopv8f32>; +defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", + int_x86_avx_maskload_pd, + int_x86_avx_maskload_pd_256, + int_x86_avx_maskstore_pd, + int_x86_avx_maskstore_pd_256, + memopv2f64, memopv4f64>; + +// Permute Floating-Point Values +multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop_f, + X86MemOperand x86memop_i, PatFrag f_frag, PatFrag i_frag, + Intrinsic IntVar, Intrinsic IntImm> { + def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V; + def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop_i:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntVar RC:$src1, (i_frag addr:$src2)))]>, VEX_4V; + + def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntImm RC:$src1, imm:$src2))]>, VEX; + def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), + (ins x86memop_f:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntImm (f_frag addr:$src1), imm:$src2))]>, VEX; +} + +defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, + memopv4f32, memopv4i32, + int_x86_avx_vpermilvar_ps, + int_x86_avx_vpermil_ps>; +defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, + memopv8f32, memopv8i32, + int_x86_avx_vpermilvar_ps_256, + int_x86_avx_vpermil_ps_256>; +defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, + memopv2f64, memopv2i64, + int_x86_avx_vpermilvar_pd, + int_x86_avx_vpermil_pd>; +defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, + memopv4f64, memopv4i64, + int_x86_avx_vpermilvar_pd_256, + int_x86_avx_vpermil_pd_256>; + +def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, i8imm:$src3), + "vperm2f128\t{$src3, $src2, $src1, 
$dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V; +def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, i8imm:$src3), + "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V; + +// Zero All YMM registers +def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", + [(int_x86_avx_vzeroall)]>, VEX, VEX_L, Requires<[HasAVX]>; + +// Zero Upper bits of YMM registers +def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", + [(int_x86_avx_vzeroupper)]>, VEX, Requires<[HasAVX]>; + +} // isAsmParserOnly + +def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3), + (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; +def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3), + (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; +def : Pat<(int_x86_avx_vinsertf128_si_256 VR256:$src1, VR128:$src2, imm:$src3), + (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; + +def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), + (VEXTRACTF128rr VR256:$src1, imm:$src2)>; +def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), + (VEXTRACTF128rr VR256:$src1, imm:$src2)>; +def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), + (VEXTRACTF128rr VR256:$src1, imm:$src2)>; + +def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), + (VBROADCASTF128 addr:$src)>; + +def : Pat<(int_x86_avx_vperm2f128_ps_256 VR256:$src1, VR256:$src2, imm:$src3), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>; +def : Pat<(int_x86_avx_vperm2f128_pd_256 VR256:$src1, VR256:$src2, imm:$src3), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>; +def : Pat<(int_x86_avx_vperm2f128_si_256 VR256:$src1, VR256:$src2, imm:$src3), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>; + +def : Pat<(int_x86_avx_vperm2f128_ps_256 + VR256:$src1, (memopv8f32 addr:$src2), imm:$src3), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; +def : Pat<(int_x86_avx_vperm2f128_pd_256 + VR256:$src1, (memopv4f64 addr:$src2), imm:$src3), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; +def : Pat<(int_x86_avx_vperm2f128_si_256 + VR256:$src1, (memopv8i32 addr:$src2), imm:$src3), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; + +//===----------------------------------------------------------------------===// +// SSE Shuffle pattern fragments +//===----------------------------------------------------------------------===// + +// This is part of a "work in progress" refactoring. The idea is that all +// vector shuffles are going to be translated into target specific nodes and +// directly matched by the patterns below (which can be changed along the way) +// The AVX version of some but not all of them are described here, and more +// should come in a near future. + +// Shuffle with PSHUFD instruction folding loads. The first two patterns match +// SSE2 loads, which are always promoted to v2i64. The last one should match +// the SSE1 case, where the only legal load is v4f32, but there is no PSHUFD +// in SSE2, how does it ever worked? Anyway, the pattern will remain here until +// we investigate further. 
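For reference while reading the patterns below, a small self-contained model of what the 8-bit PSHUFD immediate encodes (a sketch, not code from this patch): each 2-bit field of the immediate selects one of the four 32-bit source elements.

#include <cstdint>

// Reference model of PSHUFD: dst[i] = src[imm bits 2i+1..2i].
static void pshufd_ref(uint32_t dst[4], const uint32_t src[4], uint8_t imm) {
  for (int i = 0; i < 4; ++i)
    dst[i] = src[(imm >> (2 * i)) & 0x3];
}

For example, imm = 0x1B selects elements 3,2,1,0 and therefore reverses the vector, which is the classic shuffle(3,2,1,0) encoding.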
+def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)), + (i8 imm:$imm))), + (VPSHUFDmi addr:$src1, imm:$imm)>, Requires<[HasAVX]>; +def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)), + (i8 imm:$imm))), + (PSHUFDmi addr:$src1, imm:$imm)>; +def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)), + (i8 imm:$imm))), + (PSHUFDmi addr:$src1, imm:$imm)>; // FIXME: has this ever worked? + +// Shuffle with PSHUFD instruction. +def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (VPSHUFDri VR128:$src1, imm:$imm)>, Requires<[HasAVX]>; +def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (PSHUFDri VR128:$src1, imm:$imm)>; + +def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (VPSHUFDri VR128:$src1, imm:$imm)>, Requires<[HasAVX]>; +def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (PSHUFDri VR128:$src1, imm:$imm)>; + +// Shuffle with SHUFPD instruction. +def : Pat<(v2f64 (X86Shufps VR128:$src1, + (memopv2f64 addr:$src2), (i8 imm:$imm))), + (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>, Requires<[HasAVX]>; +def : Pat<(v2f64 (X86Shufps VR128:$src1, + (memopv2f64 addr:$src2), (i8 imm:$imm))), + (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; + +def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>, Requires<[HasAVX]>; +def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; + +def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>, Requires<[HasAVX]>; +def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; + +// Shuffle with SHUFPS instruction. 
+def : Pat<(v4f32 (X86Shufps VR128:$src1, + (memopv4f32 addr:$src2), (i8 imm:$imm))), + (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>, Requires<[HasAVX]>; +def : Pat<(v4f32 (X86Shufps VR128:$src1, + (memopv4f32 addr:$src2), (i8 imm:$imm))), + (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; + +def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>, Requires<[HasAVX]>; +def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; + +def : Pat<(v4i32 (X86Shufps VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), + (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>, Requires<[HasAVX]>; +def : Pat<(v4i32 (X86Shufps VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), + (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; + +def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>, Requires<[HasAVX]>; +def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; + +// Shuffle with MOVHLPS instruction +def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)), + (MOVHLPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), + (MOVHLPSrr VR128:$src1, VR128:$src2)>; + +// Shuffle with MOVDDUP instruction +def : Pat<(X86Movddup (memopv2f64 addr:$src)), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; +def : Pat<(X86Movddup (memopv2f64 addr:$src)), + (MOVDDUPrm addr:$src)>; + +def : Pat<(X86Movddup (bc_v4f32 (memopv2f64 addr:$src))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; +def : Pat<(X86Movddup (bc_v4f32 (memopv2f64 addr:$src))), + (MOVDDUPrm addr:$src)>; + +def : Pat<(X86Movddup (memopv2i64 addr:$src)), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; +def : Pat<(X86Movddup (memopv2i64 addr:$src)), + (MOVDDUPrm addr:$src)>; + +def : Pat<(X86Movddup (bc_v4i32 (memopv2i64 addr:$src))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; +def : Pat<(X86Movddup (bc_v4i32 (memopv2i64 addr:$src))), + (MOVDDUPrm addr:$src)>; + +def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; +def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))), + (MOVDDUPrm addr:$src)>; + +def : Pat<(X86Movddup (bc_v2f64 + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; +def : Pat<(X86Movddup (bc_v2f64 + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (MOVDDUPrm addr:$src)>; + +// Shuffle with UNPCKLPS +def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), + (VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; +def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), + (UNPCKLPSrm VR128:$src1, addr:$src2)>; + +def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), + (VUNPCKLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; +def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), + (UNPCKLPSrr VR128:$src1, VR128:$src2)>; + +// Shuffle with UNPCKHPS +def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))), + (VUNPCKHPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; +def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))), + (UNPCKHPSrm VR128:$src1, addr:$src2)>; + +def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), + (VUNPCKHPSrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; +def : Pat<(v4f32 (X86Unpckhps VR128:$src1, 
VR128:$src2)), + (UNPCKHPSrr VR128:$src1, VR128:$src2)>; + +// Shuffle with UNPCKLPD +def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), + (VUNPCKLPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; +def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), + (UNPCKLPDrm VR128:$src1, addr:$src2)>; + +def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), + (VUNPCKLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; +def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), + (UNPCKLPDrr VR128:$src1, VR128:$src2)>; + +// Shuffle with UNPCKHPD +def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))), + (VUNPCKHPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; +def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))), + (UNPCKHPDrm VR128:$src1, addr:$src2)>; + +def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)), + (VUNPCKHPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; +def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)), + (UNPCKHPDrr VR128:$src1, VR128:$src2)>; + +// Shuffle with PUNPCKLBW +def : Pat<(v16i8 (X86Punpcklbw VR128:$src1, + (bc_v16i8 (memopv2i64 addr:$src2)))), + (PUNPCKLBWrm VR128:$src1, addr:$src2)>; +def : Pat<(v16i8 (X86Punpcklbw VR128:$src1, VR128:$src2)), + (PUNPCKLBWrr VR128:$src1, VR128:$src2)>; + +// Shuffle with PUNPCKLWD +def : Pat<(v8i16 (X86Punpcklwd VR128:$src1, + (bc_v8i16 (memopv2i64 addr:$src2)))), + (PUNPCKLWDrm VR128:$src1, addr:$src2)>; +def : Pat<(v8i16 (X86Punpcklwd VR128:$src1, VR128:$src2)), + (PUNPCKLWDrr VR128:$src1, VR128:$src2)>; + +// Shuffle with PUNPCKLDQ +def : Pat<(v4i32 (X86Punpckldq VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)))), + (PUNPCKLDQrm VR128:$src1, addr:$src2)>; +def : Pat<(v4i32 (X86Punpckldq VR128:$src1, VR128:$src2)), + (PUNPCKLDQrr VR128:$src1, VR128:$src2)>; + +// Shuffle with PUNPCKLQDQ +def : Pat<(v2i64 (X86Punpcklqdq VR128:$src1, (memopv2i64 addr:$src2))), + (PUNPCKLQDQrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (X86Punpcklqdq VR128:$src1, VR128:$src2)), + (PUNPCKLQDQrr VR128:$src1, VR128:$src2)>; + +// Shuffle with PUNPCKHBW +def : Pat<(v16i8 (X86Punpckhbw VR128:$src1, + (bc_v16i8 (memopv2i64 addr:$src2)))), + (PUNPCKHBWrm VR128:$src1, addr:$src2)>; +def : Pat<(v16i8 (X86Punpckhbw VR128:$src1, VR128:$src2)), + (PUNPCKHBWrr VR128:$src1, VR128:$src2)>; + +// Shuffle with PUNPCKHWD +def : Pat<(v8i16 (X86Punpckhwd VR128:$src1, + (bc_v8i16 (memopv2i64 addr:$src2)))), + (PUNPCKHWDrm VR128:$src1, addr:$src2)>; +def : Pat<(v8i16 (X86Punpckhwd VR128:$src1, VR128:$src2)), + (PUNPCKHWDrr VR128:$src1, VR128:$src2)>; + +// Shuffle with PUNPCKHDQ +def : Pat<(v4i32 (X86Punpckhdq VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)))), + (PUNPCKHDQrm VR128:$src1, addr:$src2)>; +def : Pat<(v4i32 (X86Punpckhdq VR128:$src1, VR128:$src2)), + (PUNPCKHDQrr VR128:$src1, VR128:$src2)>; + +// Shuffle with PUNPCKHQDQ +def : Pat<(v2i64 (X86Punpckhqdq VR128:$src1, (memopv2i64 addr:$src2))), + (PUNPCKHQDQrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (X86Punpckhqdq VR128:$src1, VR128:$src2)), + (PUNPCKHQDQrr VR128:$src1, VR128:$src2)>; + +// Shuffle with MOVLHPS +def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (MOVHPSrm VR128:$src1, addr:$src2)>; +def : Pat<(X86Movlhps VR128:$src1, + (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (MOVHPSrm VR128:$src1, addr:$src2)>; +def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)), + (MOVLHPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), +
(MOVLHPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), + (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; + +// Shuffle with MOVLHPD +def : Pat<(v2f64 (X86Movlhpd VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (MOVHPDrm VR128:$src1, addr:$src2)>; +// FIXME: Instead of X86Unpcklpd, there should be an X86Movlhpd here; the +// problem is during lowering, where it's not possible to recognize the load +// fold because it has two uses through a bitcast. One use disappears at isel +// time and the fold opportunity reappears. +def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (MOVHPDrm VR128:$src1, addr:$src2)>; + +// Shuffle with MOVSS +def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))), + (MOVSSrr VR128:$src1, FR32:$src2)>; +def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr (v4i32 VR128:$src1), + (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; +def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr (v4f32 VR128:$src1), + (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; +// FIXME: Instead of an X86Movss there should be an X86Movlps here; the +// problem is during lowering, where it's not possible to recognize the load +// fold because it has two uses through a bitcast. One use disappears at isel +// time and the fold opportunity reappears. +def : Pat<(X86Movss VR128:$src1, + (bc_v4i32 (v2i64 (load addr:$src2)))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + +// Shuffle with MOVSD +def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))), + (MOVSDrr VR128:$src1, FR64:$src2)>; +def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr (v2i64 VR128:$src1), + (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; +def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr (v2f64 VR128:$src1), + (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; +def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>; +def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>; + +// Shuffle with MOVSHDUP +def : Pat<(v4i32 (X86Movshdup VR128:$src)), + (MOVSHDUPrr VR128:$src)>; +def : Pat<(X86Movshdup (bc_v4i32 (memopv2i64 addr:$src))), + (MOVSHDUPrm addr:$src)>; + +def : Pat<(v4f32 (X86Movshdup VR128:$src)), + (MOVSHDUPrr VR128:$src)>; +def : Pat<(X86Movshdup (memopv4f32 addr:$src)), + (MOVSHDUPrm addr:$src)>; + +// Shuffle with MOVSLDUP +def : Pat<(v4i32 (X86Movsldup VR128:$src)), + (MOVSLDUPrr VR128:$src)>; +def : Pat<(X86Movsldup (bc_v4i32 (memopv2i64 addr:$src))), + (MOVSLDUPrm addr:$src)>; + +def : Pat<(v4f32 (X86Movsldup VR128:$src)), + (MOVSLDUPrr VR128:$src)>; +def : Pat<(X86Movsldup (memopv4f32 addr:$src)), + (MOVSLDUPrm addr:$src)>; + +// Shuffle with PSHUFHW +def : Pat<(v8i16 (X86PShufhwLd addr:$src, (i8 imm:$imm))), + (PSHUFHWmi addr:$src, imm:$imm)>; +def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))), + (PSHUFHWri VR128:$src, imm:$imm)>; +def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)), (i8 imm:$imm))), + (PSHUFHWmi addr:$src, imm:$imm)>; + +// Shuffle with PSHUFLW +def : Pat<(v8i16 (X86PShuflwLd addr:$src, (i8 imm:$imm))), + (PSHUFLWmi addr:$src, imm:$imm)>; +def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))), + (PSHUFLWri VR128:$src, imm:$imm)>; +def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)), (i8 imm:$imm))), + (PSHUFLWmi addr:$src, imm:$imm)>; + +// Shuffle with PALIGN +def :
Pat<(v1i64 (X86PAlign VR64:$src1, VR64:$src2, (i8 imm:$imm))), + (PALIGNR64rr VR64:$src2, VR64:$src1, imm:$imm)>; +def : Pat<(v2i32 (X86PAlign VR64:$src1, VR64:$src2, (i8 imm:$imm))), + (PALIGNR64rr VR64:$src2, VR64:$src1, imm:$imm)>; +def : Pat<(v4i16 (X86PAlign VR64:$src1, VR64:$src2, (i8 imm:$imm))), + (PALIGNR64rr VR64:$src2, VR64:$src1, imm:$imm)>; +def : Pat<(v8i8 (X86PAlign VR64:$src1, VR64:$src2, (i8 imm:$imm))), + (PALIGNR64rr VR64:$src2, VR64:$src1, imm:$imm)>; + +def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + +// Shuffle with MOVLPS +def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; +def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; +def : Pat<(X86Movlps VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + +// Shuffle with MOVLPD +def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; +def : Pat<(v2f64 (X86Movlpd VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + +// Extra patterns to match stores with MOVHPS/PD and MOVLPS/PD +def : Pat<(store (f64 (vector_extract + (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))),addr:$dst), + (MOVHPSmr addr:$dst, VR128:$src)>; +def : Pat<(store (f64 (vector_extract + (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))),addr:$dst), + (MOVHPDmr addr:$dst, VR128:$src)>; + +def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; +def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; + +def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; +def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp index 2b8720bac3438..36badb403e815 100644 --- a/lib/Target/X86/X86MCAsmInfo.cpp +++ b/lib/Target/X86/X86MCAsmInfo.cpp @@ -103,6 +103,9 @@ getNonexecutableStackSection(MCContext &Ctx) const { } X86MCAsmInfoCOFF::X86MCAsmInfoCOFF(const Triple &Triple) { + if (Triple.getArch() == Triple::x86_64) + GlobalPrefix = ""; + AsmTransCBE = x86_asm_table; AssemblerDialect = AsmWriterFlavor; diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp index 23b0666f5f30f..9564fe0b92d4c 100644 --- a/lib/Target/X86/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/X86MCCodeEmitter.cpp @@ -365,7 +365,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, const TargetInstrDesc &Desc, raw_ostream &OS) const { bool HasVEX_4V = false; - if (TSFlags & X86II::VEX_4V) + if ((TSFlags >> 32) & X86II::VEX_4V) HasVEX_4V = true; // VEX_R: opcode externsion equivalent to REX.R 
in @@ -429,10 +429,10 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if (TSFlags & X86II::OpSize) VEX_PP = 0x01; - if (TSFlags & X86II::VEX_W) + if ((TSFlags >> 32) & X86II::VEX_W) VEX_W = 1; - if (TSFlags & X86II::VEX_L) + if ((TSFlags >> 32) & X86II::VEX_L) VEX_L = 1; switch (TSFlags & X86II::Op0Mask) { @@ -469,33 +469,39 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, unsigned NumOps = MI.getNumOperands(); unsigned CurOp = 0; + bool IsDestMem = false; switch (TSFlags & X86II::FormMask) { case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!"); + case X86II::MRMDestMem: + IsDestMem = true; + // The important info for the VEX prefix is never beyond the address + // registers. Don't check beyond that. + NumOps = CurOp = X86::AddrNumOperands; case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: case X86II::MRM6m: case X86II::MRM7m: - case X86II::MRMDestMem: - NumOps = CurOp = X86::AddrNumOperands; case X86II::MRMSrcMem: case X86II::MRMSrcReg: if (MI.getNumOperands() > CurOp && MI.getOperand(CurOp).isReg() && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_R = 0x0; - - // CurOp and NumOps are equal when VEX_R represents a register used - // to index a memory destination (which is the last operand) - CurOp = (CurOp == NumOps) ? 0 : CurOp+1; + CurOp++; if (HasVEX_4V) { - VEX_4V = getVEXRegisterEncoding(MI, CurOp); + VEX_4V = getVEXRegisterEncoding(MI, IsDestMem ? CurOp-1 : CurOp); CurOp++; } + // To check only operands before the memory address ones, start + // the search from the beginning + if (IsDestMem) + CurOp = 0; + // If the last register should be encoded in the immediate field // do not use any bit from VEX prefix to this register, ignore it - if (TSFlags & X86II::VEX_I8IMM) + if ((TSFlags >> 32) & X86II::VEX_I8IMM) NumOps--; for (; CurOp != NumOps; ++CurOp) { @@ -508,7 +514,10 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, VEX_X = 0x0; } break; - default: // MRMDestReg, MRM0r-MRM7r + default: // MRMDestReg, MRM0r-MRM7r, RawFrm + if (!MI.getNumOperands()) + break; + if (MI.getOperand(CurOp).isReg() && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_B = 0; @@ -524,7 +533,6 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, VEX_R = 0x0; } break; - assert(0 && "Not implemented!"); } // Emit segment override opcode prefix as needed. @@ -793,9 +801,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, // It uses the VEX.VVVV field? bool HasVEX_4V = false; - if (TSFlags & X86II::VEX) + if ((TSFlags >> 32) & X86II::VEX) HasVEXPrefix = true; - if (TSFlags & X86II::VEX_4V) + if ((TSFlags >> 32) & X86II::VEX_4V) HasVEX_4V = true; // Determine where the memory operand starts, if present.
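The recurring change from "TSFlags & X86II::VEX_*" to "(TSFlags >> 32) & X86II::VEX_*" in these hunks reflects that the VEX attribute bits now live in the upper half of the 64-bit TSFlags word. A self-contained sketch of why the shift matters; the bit positions below are assumed for illustration, not the real X86II values:

#include <cassert>
#include <cstdint>

enum : uint64_t {
  OpSize = 1ull << 6,  // assumed low-word attribute bit
  VEX_4V = 1ull << 1,  // assumed VEX-group bit, stored at TSFlags bit 33
};

static bool hasVEX_4V(uint64_t TSFlags) {
  // The VEX group reuses small constant values that collide with low-word
  // fields, so the upper word must be selected before masking.
  return (TSFlags >> 32) & VEX_4V;
}

int main() {
  uint64_t TSFlags = OpSize | (VEX_4V << 32);
  assert(hasVEX_4V(TSFlags));       // correct: tests the upper word
  assert((TSFlags & VEX_4V) == 0);  // a naive low-word test misses the bit
}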
@@ -819,6 +827,14 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::RawFrm: EmitByte(BaseOpcode, CurByte, OS); break; + + case X86II::RawFrmImm16: + EmitByte(BaseOpcode, CurByte, OS); + EmitImmediate(MI.getOperand(CurOp++), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + EmitImmediate(MI.getOperand(CurOp++), 2, FK_Data_2, CurByte, OS, Fixups); + break; case X86II::AddRegFrm: EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS); @@ -833,10 +849,15 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRMDestMem: EmitByte(BaseOpcode, CurByte, OS); + SrcRegNum = CurOp + X86::AddrNumOperands; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + SrcRegNum++; + EmitMemModRMByte(MI, CurOp, - GetX86RegNum(MI.getOperand(CurOp + X86::AddrNumOperands)), + GetX86RegNum(MI.getOperand(SrcRegNum)), TSFlags, CurByte, OS, Fixups); - CurOp += X86::AddrNumOperands + 1; + CurOp = SrcRegNum + 1; break; case X86II::MRMSrcReg: @@ -934,7 +955,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, if (CurOp != NumOps) { // The last source register of a 4 operand instruction in AVX is encoded // in bits[7:4] of a immediate byte, and bits[3:0] are ignored. - if (TSFlags & X86II::VEX_I8IMM) { + if ((TSFlags >> 32) & X86II::VEX_I8IMM) { const MCOperand &MO = MI.getOperand(CurOp++); bool IsExtReg = X86InstrInfo::isX86_64ExtendedReg(MO.getReg()); diff --git a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index e67fc06a6cd75..8c4620f921771 100644 --- a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -16,7 +16,6 @@ #include "X86AsmPrinter.h" #include "X86COFFMachineModuleInfo.h" #include "X86MCAsmInfo.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -29,21 +28,19 @@ #include "llvm/Type.h" using namespace llvm; - -const X86Subtarget &X86MCInstLower::getSubtarget() const { - return AsmPrinter.getSubtarget(); -} +X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf, + X86AsmPrinter &asmprinter) +: Ctx(mf.getContext()), Mang(mang), MF(mf), TM(mf.getTarget()), + MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {} MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { - assert(getSubtarget().isTargetDarwin() &&"Can only get MachO info on darwin"); - return AsmPrinter.MMI->getObjFileInfo<MachineModuleInfoMachO>(); + return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>(); } MCSymbol *X86MCInstLower::GetPICBaseSymbol() const { - const TargetLowering *TLI = AsmPrinter.TM.getTargetLowering(); - return static_cast<const X86TargetLowering*>(TLI)-> - getPICBaseSymbol(AsmPrinter.MF, Ctx); + return static_cast<const X86TargetLowering*>(TM.getTargetLowering())-> + getPICBaseSymbol(&MF, Ctx); } /// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol @@ -56,7 +53,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { if (!MO.isGlobal()) { assert(MO.isSymbol()); - Name += AsmPrinter.MAI->getGlobalPrefix(); + Name += MAI.getGlobalPrefix(); Name += MO.getSymbolName(); } else { const GlobalValue *GV = MO.getGlobal(); @@ -91,7 +88,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { assert(MO.isGlobal() && "Extern symbol not handled yet"); StubSym = MachineModuleInfoImpl:: - StubValueTy(AsmPrinter.Mang->getSymbol(MO.getGlobal()), + StubValueTy(Mang->getSymbol(MO.getGlobal()), 
!MO.getGlobal()->hasInternalLinkage()); } return Sym; @@ -105,7 +102,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { assert(MO.isGlobal() && "Extern symbol not handled yet"); StubSym = MachineModuleInfoImpl:: - StubValueTy(AsmPrinter.Mang->getSymbol(MO.getGlobal()), + StubValueTy(Mang->getSymbol(MO.getGlobal()), !MO.getGlobal()->hasInternalLinkage()); } return Sym; @@ -121,7 +118,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { if (MO.isGlobal()) { StubSym = MachineModuleInfoImpl:: - StubValueTy(AsmPrinter.Mang->getSymbol(MO.getGlobal()), + StubValueTy(Mang->getSymbol(MO.getGlobal()), !MO.getGlobal()->hasInternalLinkage()); } else { Name.erase(Name.end()-5, Name.end()); @@ -178,7 +175,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, Expr = MCBinaryExpr::CreateSub(Expr, MCSymbolRefExpr::Create(GetPICBaseSymbol(), Ctx), Ctx); - if (MO.isJTI() && AsmPrinter.MAI->hasSetDirective()) { + if (MO.isJTI() && MAI.hasSetDirective()) { // If .set directive is supported, use it to reduce the number of // relocations the assembler will generate for differences between // local labels. This is only safe when the symbols are in the same @@ -255,7 +252,13 @@ static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) { } /// \brief Simplify things like MOV32rm to MOV32o32a. -static void SimplifyShortMoveForm(MCInst &Inst, unsigned Opcode) { +static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst, + unsigned Opcode) { + // Don't make these simplifications in 64-bit mode; other assemblers don't + // perform them because they make the code larger. + if (Printer.getSubtarget().is64Bit()) + return; + bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg(); unsigned AddrBase = IsStore; unsigned RegOp = IsStore ? 
0 : 5; @@ -336,7 +339,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { break; case MachineOperand::MO_BlockAddress: MCOp = LowerSymbolOperand(MO, - AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress())); + AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress())); break; } @@ -377,12 +380,17 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { case X86::MMX_V_SET0: LowerUnaryToTwoAddr(OutMI, X86::MMX_PXORrr); break; case X86::MMX_V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::MMX_PCMPEQDrr); break; - case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break; - case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break; - case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break; + case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; + case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; + case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break; + case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break; + case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; + case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break; + case X86::AVX_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::VXORPSrr); break; + case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break; + case X86::AVX_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::VXORPDrr); break; + case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break; + case X86::AVX_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break; case X86::MOV16r0: LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0 @@ -393,12 +401,14 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr break; - // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have + // TAILJMPr64, [WIN]CALL64r, [WIN]CALL64pcrel32 - These instructions have // register inputs modeled as normal uses instead of implicit uses. As such, // truncate off all but the first operand (the callee). FIXME: Change isel. 
case X86::TAILJMPr64: case X86::CALL64r: - case X86::CALL64pcrel32: { + case X86::CALL64pcrel32: + case X86::WINCALL64r: + case X86::WINCALL64pcrel32: { unsigned Opcode = OutMI.getOpcode(); MCOperand Saved = OutMI.getOperand(0); OutMI = MCInst(); @@ -456,15 +466,13 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { // MOV64ao8, MOV64o8a // XCHG16ar, XCHG32ar, XCHG64ar case X86::MOV8mr_NOREX: - case X86::MOV8mr: SimplifyShortMoveForm(OutMI, X86::MOV8ao8); break; + case X86::MOV8mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao8); break; case X86::MOV8rm_NOREX: - case X86::MOV8rm: SimplifyShortMoveForm(OutMI, X86::MOV8o8a); break; - case X86::MOV16mr: SimplifyShortMoveForm(OutMI, X86::MOV16ao16); break; - case X86::MOV16rm: SimplifyShortMoveForm(OutMI, X86::MOV16o16a); break; - case X86::MOV32mr: SimplifyShortMoveForm(OutMI, X86::MOV32ao32); break; - case X86::MOV32rm: SimplifyShortMoveForm(OutMI, X86::MOV32o32a); break; - case X86::MOV64mr: SimplifyShortMoveForm(OutMI, X86::MOV64ao64); break; - case X86::MOV64rm: SimplifyShortMoveForm(OutMI, X86::MOV64o64a); break; + case X86::MOV8rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o8a); break; + case X86::MOV16mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao16); break; + case X86::MOV16rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o16a); break; + case X86::MOV32mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break; + case X86::MOV32rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break; case X86::ADC8ri: SimplifyShortImmForm(OutMI, X86::ADC8i8); break; case X86::ADC16ri: SimplifyShortImmForm(OutMI, X86::ADC16i16); break; @@ -505,46 +513,9 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { } } -void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, - raw_ostream &O) { - // Only the target-dependent form of DBG_VALUE should get here. - // Referencing the offset and metadata as NOps-2 and NOps-1 is - // probably portable to other targets; frame pointer location is not. - unsigned NOps = MI->getNumOperands(); - assert(NOps==7); - O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); - if (V.getContext().isSubprogram()) - O << DISubprogram(V.getContext()).getDisplayName() << ":"; - O << V.getName(); - O << " <- "; - // Frame address. Currently handles register +- offset only. - O << '['; - if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg()) - printOperand(MI, 0, O); - else - O << "undef"; - O << '+'; printOperand(MI, 3, O); - O << ']'; - O << "+"; - printOperand(MI, NOps-2, O); -} - -MachineLocation -X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. 
- - if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm()); - return Location; -} - void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { - X86MCInstLower MCInstLowering(OutContext, Mang, *this); + X86MCInstLower MCInstLowering(Mang, *MF, *this); switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: if (isVerbose() && OutStreamer.hasRawTextSupport()) { @@ -555,6 +526,12 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } return; + // Emit nothing here but a comment if we can. + case X86::Int_MemBarrier: + if (OutStreamer.hasRawTextSupport()) + OutStreamer.EmitRawText(StringRef("\t#MEMBARRIER")); + return; + case X86::TAILJMPr: case X86::TAILJMPd: case X86::TAILJMPd64: diff --git a/lib/Target/X86/AsmPrinter/X86MCInstLower.h b/lib/Target/X86/X86MCInstLower.h index 9e5474fc81b33..539b09be6fd74 100644 --- a/lib/Target/X86/AsmPrinter/X86MCInstLower.h +++ b/lib/Target/X86/X86MCInstLower.h @@ -13,27 +13,30 @@ #include "llvm/Support/Compiler.h" namespace llvm { + class MCAsmInfo; class MCContext; class MCInst; class MCOperand; class MCSymbol; class MachineInstr; + class MachineFunction; class MachineModuleInfoMachO; class MachineOperand; class Mangler; + class TargetMachine; class X86AsmPrinter; - class X86Subtarget; /// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst. class LLVM_LIBRARY_VISIBILITY X86MCInstLower { MCContext &Ctx; Mangler *Mang; + const MachineFunction &MF; + const TargetMachine &TM; + const MCAsmInfo &MAI; X86AsmPrinter &AsmPrinter; - - const X86Subtarget &getSubtarget() const; public: - X86MCInstLower(MCContext &ctx, Mangler *mang, X86AsmPrinter &asmprinter) - : Ctx(ctx), Mang(mang), AsmPrinter(asmprinter) {} + X86MCInstLower(Mangler *mang, const MachineFunction &MF, + X86AsmPrinter &asmprinter); void Lower(const MachineInstr *MI, MCInst &OutMI) const; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 5f31e00ebabd8..fedd49ebb5403 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -38,8 +38,15 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; +static cl::opt<bool> +ForceStackAlign("force-align-stack", + cl::desc("Force align the stack to the minimum alignment" + " needed for the function."), + cl::init(false), cl::Hidden); + X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii) : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit() ? 
@@ -193,6 +200,12 @@ unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) { case X86::DR7: return 7; + // Pseudo index registers are equivalent to a "none" + // scaled index (See Intel Manual 2A, table 2-3) + case X86::EIZ: + case X86::RIZ: + return 4; + default: assert(isVirtualRegister(RegNo) && "Unknown physical register!"); llvm_unreachable("Register allocator hasn't allocated reg correctly yet!"); @@ -456,26 +469,29 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *F = MF.getFunction(); - bool requiresRealignment = - RealignStack && ((MFI->getMaxAlignment() > StackAlign) || - F->hasFnAttr(Attribute::StackAlignment)); + bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || + F->hasFnAttr(Attribute::StackAlignment)); // FIXME: Currently we don't support stack realignment for functions with // variable-sized allocas. - // FIXME: Temporary disable the error - it seems to be too conservative. + // FIXME: It's more complicated than this... if (0 && requiresRealignment && MFI->hasVarSizedObjects()) report_fatal_error( "Stack realignment in presense of dynamic allocas is not supported"); - - return (requiresRealignment && !MFI->hasVarSizedObjects()); + + // If we've requested that we force align the stack do so now. + if (ForceStackAlign) + return canRealignStack(MF); + + return requiresRealignment && canRealignStack(MF); } -bool X86RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const { +bool X86RegisterInfo::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo()->hasVarSizedObjects(); } -bool X86RegisterInfo::hasReservedSpillSlot(MachineFunction &MF, unsigned Reg, - int &FrameIdx) const { +bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, + unsigned Reg, int &FrameIdx) const { if (Reg == FramePtr && hasFP(MF)) { FrameIdx = MF.getFrameInfo()->getObjectIndexBegin(); return true; @@ -610,10 +626,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MBB.erase(I); } -unsigned +void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value, - RegScavenger *RS) const{ + int SPAdj, RegScavenger *RS) const{ assert(SPAdj == 0 && "Unexpected"); unsigned i = 0; @@ -660,7 +675,6 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, uint64_t Offset = FIOffset + (uint64_t)MI.getOperand(i+3).getOffset(); MI.getOperand(i+3).setOffset(Offset); } - return 0; } void @@ -750,7 +764,7 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, } } -/// mergeSPUpdatesUp - Merge two stack-manipulating instructions lower iterator. +/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower iterator. static void mergeSPUpdatesDown(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, @@ -901,6 +915,17 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { bool HasFP = hasFP(MF); DebugLoc DL; + // If we're forcing a stack realignment we can't rely on just the frame + // info, we need to know the ABI stack alignment as well in case we + // have a call out. Otherwise just make sure we have some alignment - we'll + // go with the minimum SlotSize. + if (ForceStackAlign) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? 
StackAlign : MaxAlign; + else if (MaxAlign < SlotSize) + MaxAlign = SlotSize; + } + // Add RETADDR move area to callee saved frame size. int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) @@ -979,7 +1004,7 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { if (needsFrameMoves) { // Mark the place where EBP/RBP was saved. MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addSym(FrameLabel); + BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(FrameLabel); // Define the current CFA rule to use the provided offset. if (StackSize) { @@ -1007,7 +1032,7 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { if (needsFrameMoves) { // Mark effective beginning of when frame pointer becomes valid. MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addSym(FrameLabel); + BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(FrameLabel); // Define the current CFA to use the EBP/RBP register. MachineLocation FPDst(FramePtr); @@ -1047,7 +1072,7 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { if (!HasFP && needsFrameMoves) { // Mark callee-saved push instruction. MCSymbol *Label = MMI.getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addSym(Label); + BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label); // Define the current CFA rule to use the provided offset. unsigned Ptr = StackSize ? @@ -1062,7 +1087,17 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { DL = MBB.findDebugLoc(MBBI); // Adjust stack pointer: ESP -= numbytes. - if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) { + + // Windows and cygwin/mingw require a prologue helper routine when allocating + // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw + // uses __alloca. __alloca and the 32-bit version of __chkstk will probe + // the stack and adjust the stack pointer in one go. The 64-bit version + // of __chkstk is only responsible for probing the stack. The 64-bit + // prologue is responsible for adjusting the stack pointer. Touching the + // stack at 4K increments is necessary to ensure that the guard pages used + // by the OS virtual memory manager are allocated in correct sequence. + if (NumBytes >= 4096 && + (Subtarget->isTargetCygMing() || Subtarget->isTargetWin32())) { // Check, whether EAX is livein for this function. bool isEAXAlive = false; for (MachineRegisterInfo::livein_iterator @@ -1073,16 +1108,16 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { Reg == X86::AH || Reg == X86::AL); } - // Function prologue calls _alloca to probe the stack when allocating more - // than 4k bytes in one go. Touching the stack at 4K increments is necessary - // to ensure that the guard pages used by the OS virtual memory manager are - // allocated in correct sequence. + + const char *StackProbeSymbol = + Subtarget->isTargetWindows() ? 
"_chkstk" : "_alloca"; if (!isEAXAlive) { BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) .addImm(NumBytes); BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) - .addExternalSymbol("_alloca") - .addReg(StackPtr, RegState::Define | RegState::Implicit); + .addExternalSymbol(StackProbeSymbol) + .addReg(StackPtr, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); } else { // Save EAX BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) @@ -1093,8 +1128,9 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) .addImm(NumBytes - 4); BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) - .addExternalSymbol("_alloca") - .addReg(StackPtr, RegState::Define | RegState::Implicit); + .addExternalSymbol(StackProbeSymbol) + .addReg(StackPtr, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); // Restore EAX MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), @@ -1119,7 +1155,7 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { if ((NumBytes || PushedRegs) && needsFrameMoves) { // Mark end of stack pointer adjustment. MCSymbol *Label = MMI.getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addSym(Label); + BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label); if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. @@ -1172,6 +1208,17 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF, unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t NumBytes = 0; + // If we're forcing a stack realignment we can't rely on just the frame + // info, we need to know the ABI stack alignment as well in case we + // have a call out. Otherwise just make sure we have some alignment - we'll + // go with the minimum. + if (ForceStackAlign) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; + else + MaxAlign = MaxAlign ? MaxAlign : 4; + } + if (hasFP(MF)) { // Calculate required stack adjustment. 
uint64_t FrameSize = StackSize - SlotSize; @@ -1519,7 +1566,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { namespace { struct MSAH : public MachineFunctionPass { static char ID; - MSAH() : MachineFunctionPass(&ID) {} + MSAH() : MachineFunctionPass(ID) {} virtual bool runOnMachineFunction(MachineFunction &MF) { const X86TargetMachine *TM = diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index d852bcd2011c4..527df05c58fce 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -117,18 +117,17 @@ public: bool needsStackRealignment(const MachineFunction &MF) const; - bool hasReservedCallFrame(MachineFunction &MF) const; + bool hasReservedCallFrame(const MachineFunction &MF) const; - bool hasReservedSpillSlot(MachineFunction &MF, unsigned Reg, + bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const; - unsigned eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, RegScavenger *RS = NULL) const; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS = NULL) const; diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 9f0382e3fae91..95269b15760e0 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -241,6 +241,10 @@ let Namespace = "X86" in { def CR6 : Register<"cr6">; def CR7 : Register<"cr7">; def CR8 : Register<"cr8">; + + // Pseudo index registers + def EIZ : Register<"eiz">; + def RIZ : Register<"riz">; } @@ -804,7 +808,7 @@ def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128, }]; } -def VR256 : RegisterClass<"X86", [v8i32, v4i64, v8f32, v4f64], 256, +def VR256 : RegisterClass<"X86", [v32i8, v8i32, v4i64, v8f32, v4f64], 256, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15]> { @@ -829,4 +833,15 @@ def VR256 : RegisterClass<"X86", [v8i32, v4i64, v8f32, v4f64], 256, // Status flags registers. def CCR : RegisterClass<"X86", [i32], 32, [EFLAGS]> { let CopyCost = -1; // Don't allow copying of status registers. + + // EFLAGS is not allocatable. + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + CCRClass::iterator + CCRClass::allocation_order_end(const MachineFunction &MF) const { + return allocation_order_begin(MF); + } + }]; } diff --git a/lib/Target/X86/X86ShuffleDecode.h b/lib/Target/X86/X86ShuffleDecode.h new file mode 100644 index 0000000000000..df040520bc8f5 --- /dev/null +++ b/lib/Target/X86/X86ShuffleDecode.h @@ -0,0 +1,155 @@ +//===-- X86ShuffleDecode.h - X86 shuffle decode logic ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. 
+// +//===----------------------------------------------------------------------===// + +#ifndef X86_SHUFFLE_DECODE_H +#define X86_SHUFFLE_DECODE_H + +#include "llvm/ADT/SmallVector.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +enum { + SM_SentinelZero = ~0U +}; + +static inline +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask) { + // Defaults the copying the dest value. + ShuffleMask.push_back(0); + ShuffleMask.push_back(1); + ShuffleMask.push_back(2); + ShuffleMask.push_back(3); + + // Decode the immediate. + unsigned ZMask = Imm & 15; + unsigned CountD = (Imm >> 4) & 3; + unsigned CountS = (Imm >> 6) & 3; + + // CountS selects which input element to use. + unsigned InVal = 4+CountS; + // CountD specifies which element of destination to update. + ShuffleMask[CountD] = InVal; + // ZMask zaps values, potentially overriding the CountD elt. + if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero; + if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero; + if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero; + if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero; +} + +// <3,1> or <6,7,2,3> +static void DecodeMOVHLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = NElts/2; i != NElts; ++i) + ShuffleMask.push_back(NElts+i); + + for (unsigned i = NElts/2; i != NElts; ++i) + ShuffleMask.push_back(i); +} + +// <0,2> or <0,1,4,5> +static void DecodeMOVLHPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) + ShuffleMask.push_back(i); + + for (unsigned i = 0; i != NElts/2; ++i) + ShuffleMask.push_back(NElts+i); +} + +static void DecodePSHUFMask(unsigned NElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts; ++i) { + ShuffleMask.push_back(Imm % NElts); + Imm /= NElts; + } +} + +static void DecodePSHUFHWMask(unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + ShuffleMask.push_back(0); + ShuffleMask.push_back(1); + ShuffleMask.push_back(2); + ShuffleMask.push_back(3); + for (unsigned i = 0; i != 4; ++i) { + ShuffleMask.push_back(4+(Imm & 3)); + Imm >>= 2; + } +} + +static void DecodePSHUFLWMask(unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != 4; ++i) { + ShuffleMask.push_back((Imm & 3)); + Imm >>= 2; + } + ShuffleMask.push_back(4); + ShuffleMask.push_back(5); + ShuffleMask.push_back(6); + ShuffleMask.push_back(7); +} + +static void DecodePUNPCKLMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(i); + ShuffleMask.push_back(i+NElts); + } +} + +static void DecodePUNPCKHMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(i+NElts/2); + ShuffleMask.push_back(i+NElts+NElts/2); + } +} + +static void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + // Part that reads from dest. + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(Imm % NElts); + Imm /= NElts; + } + // Part that reads from src. 
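// Worked example: with NElts == 4 and Imm == 0x1B (binary 00011011), the
// loop above pushed <3,2> from the first operand and the loop below pushes
// <5,4> from the second, giving the classic shufps $0x1B mask <3,2,5,4>.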
+ for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(Imm % NElts + NElts); + Imm /= NElts; + } +} + +static void DecodeUNPCKHPMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(i+NElts/2); // Reads from dest + ShuffleMask.push_back(i+NElts+NElts/2); // Reads from src + } +} + + +/// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// etc. NElts indicates the number of elements in the vector allowing it to +/// handle different datatypes and vector widths. +static void DecodeUNPCKLPMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(i); // Reads from dest + ShuffleMask.push_back(i+NElts); // Reads from src + } +} + +#endif diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 4a10be518f03f..0d02e5ee472bb 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -73,7 +73,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { if (GV->hasDefaultVisibility() && (isDecl || GV->isWeakForLinker())) return X86II::MO_GOTPCREL; - } else { + } else if (!isTargetWin64()) { assert(isTargetELF() && "Unknown rip-relative target"); // Extra load is needed for all externally visible. @@ -260,9 +260,10 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0; bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0; - HasFMA3 = IsIntel && ((ECX >> 12) & 0x1); - HasAVX = ((ECX >> 28) & 0x1); - HasAES = IsIntel && ((ECX >> 25) & 0x1); + HasCLMUL = IsIntel && ((ECX >> 1) & 0x1); + HasFMA3 = IsIntel && ((ECX >> 12) & 0x1); + HasAVX = ((ECX >> 28) & 0x1); + HasAES = IsIntel && ((ECX >> 25) & 0x1); if (IsIntel || IsAMD) { // Determine if bit test memory instructions are slow. 
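The feature flags assigned above are bits of ECX as returned by CPUID leaf 1 (CLMUL is bit 1, FMA3 bit 12, AES bit 25, AVX bit 28). A self-contained sketch of the same tests, assuming an x86 host and GCC/Clang inline assembly; the names here are illustrative:

#include <cstdint>

struct DemoFeatures { bool clmul, fma3, aes, avx; };

static DemoFeatures detectDemoFeatures() {
  uint32_t eax = 1, ebx, ecx, edx;
  // CPUID with EAX=1 reports feature flags in ECX and EDX.
  __asm__("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
  DemoFeatures F;
  F.clmul = (ecx >> 1) & 1;   // PCLMULQDQ
  F.fma3  = (ecx >> 12) & 1;  // FMA
  F.aes   = (ecx >> 25) & 1;  // AES-NI
  F.avx   = (ecx >> 28) & 1;  // AVX
  return F;
}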
@@ -291,6 +292,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS, , HasSSE4A(false) , HasAVX(false) , HasAES(false) + , HasCLMUL(false) , HasFMA3(false) , HasFMA4(false) , IsBTMemSlow(false) diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 486dbc4e2e900..0ee91abe21f4e 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -74,6 +74,9 @@ protected: /// HasAES - Target has AES instructions bool HasAES; + /// HasCLMUL - Target has carry-less multiplication + bool HasCLMUL; + /// HasFMA3 - Target has 3-operand fused multiply-add bool HasFMA3; @@ -149,6 +152,7 @@ public: bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } bool hasAVX() const { return HasAVX; } bool hasAES() const { return HasAES; } + bool hasCLMUL() const { return HasCLMUL; } bool hasFMA3() const { return HasFMA3; } bool hasFMA4() const { return HasFMA4; } bool isBTMemSlow() const { return IsBTMemSlow; } @@ -182,6 +186,10 @@ public: return Is64Bit && (isTargetMingw() || isTargetWindows()); } + bool isTargetWin32() const { + return !Is64Bit && (isTargetMingw() || isTargetWindows()); + } + std::string getDataLayout() const { const char *p; if (is64Bit()) diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index df00d3ffcc791..ce8636eb72b54 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -46,8 +46,15 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, bool RelaxAll) { Triple TheTriple(TT); switch (TheTriple.getOS()) { - default: + case Triple::Darwin: return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll); + case Triple::MinGW32: + case Triple::MinGW64: + case Triple::Cygwin: + case Triple::Win32: + return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll); + default: + return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll); } } @@ -105,15 +112,21 @@ X86TargetMachine::X86TargetMachine(const Target &T, const std::string &TT, InstrInfo(*this), JITInfo(*this), TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this) { DefRelocModel = getRelocationModel(); - + // If no relocation model was picked, default as appropriate for the target. if (getRelocationModel() == Reloc::Default) { - if (!Subtarget.isTargetDarwin()) - setRelocationModel(Reloc::Static); - else if (Subtarget.is64Bit()) + // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode. + // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we + // use static relocation model by default. + if (Subtarget.isTargetDarwin()) { + if (Subtarget.is64Bit()) + setRelocationModel(Reloc::PIC_); + else + setRelocationModel(Reloc::DynamicNoPIC); + } else if (Subtarget.isTargetWin64()) setRelocationModel(Reloc::PIC_); else - setRelocationModel(Reloc::DynamicNoPIC); + setRelocationModel(Reloc::Static); } assert(getRelocationModel() != Reloc::Default && @@ -136,29 +149,27 @@ X86TargetMachine::X86TargetMachine(const Target &T, const std::string &TT, Subtarget.isTargetDarwin() && is64Bit) setRelocationModel(Reloc::PIC_); - + // Determine the PICStyle based on the target selected. if (getRelocationModel() == Reloc::Static) { // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. Subtarget.setPICStyle(PICStyles::None); + } else if (Subtarget.is64Bit()) { + // PIC in 64 bit mode is always rip-rel. 
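// (Concretely, rip-relative addressing encodes data references as a signed
//  32-bit displacement from the instruction pointer, e.g.
//      movq foo@GOTPCREL(%rip), %rax
//  so no base register needs to be materialized for PIC on x86-64.)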
+ Subtarget.setPICStyle(PICStyles::RIPRel); } else if (Subtarget.isTargetCygMing()) { Subtarget.setPICStyle(PICStyles::None); } else if (Subtarget.isTargetDarwin()) { - if (Subtarget.is64Bit()) - Subtarget.setPICStyle(PICStyles::RIPRel); - else if (getRelocationModel() == Reloc::PIC_) + if (getRelocationModel() == Reloc::PIC_) Subtarget.setPICStyle(PICStyles::StubPIC); else { assert(getRelocationModel() == Reloc::DynamicNoPIC); Subtarget.setPICStyle(PICStyles::StubDynamicNoPIC); } } else if (Subtarget.isTargetELF()) { - if (Subtarget.is64Bit()) - Subtarget.setPICStyle(PICStyles::RIPRel); - else - Subtarget.setPICStyle(PICStyles::GOT); + Subtarget.setPICStyle(PICStyles::GOT); } - + // Finally, if we have "none" as our PIC style, force to static mode. if (Subtarget.getPICStyle() == PICStyles::None) setRelocationModel(Reloc::Static); @@ -182,9 +193,6 @@ bool X86TargetMachine::addInstSelector(PassManagerBase &PM, bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { - // Install a pass to insert x87 FP_REG_KILL instructions, as needed. - PM.add(createX87FPRegKillInserterPass()); - PM.add(createX86MaxStackAlignmentHeuristicPass()); return false; // -print-machineinstr shouldn't print after this. } diff --git a/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp b/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp index 6656bdc10eae4..8f06dd32662f2 100644 --- a/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp +++ b/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp @@ -264,15 +264,13 @@ bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallString<128> Str; raw_svector_ostream O(Str); - + // Check for mov mnemonic - unsigned src, dst, srcSR, dstSR; - if (TM.getInstrInfo()->isMoveInstr(*MI, src, dst, srcSR, dstSR)) { - O << "\tmov " << getRegisterName(dst) << ", "; - O << getRegisterName(src); - } else { + if (MI->getOpcode() == XCore::ADD_2rus && !MI->getOperand(2).getImm()) + O << "\tmov " << getRegisterName(MI->getOperand(0).getReg()) << ", " + << getRegisterName(MI->getOperand(1).getReg()); + else printInstruction(MI, O); - } OutStreamer.EmitRawText(O.str()); } diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt index 1b8e7edfc7ca9..38b35d7666c0e 100644 --- a/lib/Target/XCore/CMakeLists.txt +++ b/lib/Target/XCore/CMakeLists.txt @@ -10,7 +10,7 @@ tablegen(XCoreGenDAGISel.inc -gen-dag-isel) tablegen(XCoreGenCallingConv.inc -gen-callingconv) tablegen(XCoreGenSubtarget.inc -gen-subtarget) -add_llvm_target(XCore +add_llvm_target(XCoreCodeGen XCoreFrameInfo.cpp XCoreInstrInfo.cpp XCoreISelDAGToDAG.cpp diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp index 5564ddf133eaf..755ece7e9abac 100644 --- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -56,6 +56,17 @@ namespace { return CurDAG->getTargetConstant(Imm, MVT::i32); } + inline bool immMskBitp(SDNode *inN) const { + ConstantSDNode *N = cast<ConstantSDNode>(inN); + uint32_t value = (uint32_t)N->getZExtValue(); + if (!isMask_32(value)) { + return false; + } + int msksize = 32 - CountLeadingZeros_32(value); + return (msksize >= 1 && msksize <= 8) || + msksize == 16 || msksize == 24 || msksize == 32; + } + // Complex Pattern Selectors. 
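// (A note on immMskBitp above: it accepts contiguous low masks whose width
//  is 1-8, 16, 24 or 32 bits, so 0xFF and 0xFFFF qualify, while 0x3FF, a
//  10-bit mask, and 0xF0, which is not a low mask, are both rejected.)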
bool SelectADDRspii(SDNode *Op, SDValue Addr, SDValue &Base, SDValue &Offset); @@ -151,17 +162,15 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) { switch (N->getOpcode()) { default: break; case ISD::Constant: { - if (Predicate_immMskBitp(N)) { + uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue(); + if (immMskBitp(N)) { // Transformation function: get the size of a mask - int64_t MaskVal = cast<ConstantSDNode>(N)->getZExtValue(); - assert(isMask_32(MaskVal)); // Look for the first non-zero bit - SDValue MskSize = getI32Imm(32 - CountLeadingZeros_32(MaskVal)); + SDValue MskSize = getI32Imm(32 - CountLeadingZeros_32(Val)); return CurDAG->getMachineNode(XCore::MKMSK_rus, dl, MVT::i32, MskSize); } - else if (! Predicate_immU16(N)) { - unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); + else if (!isUInt<16>(Val)) { SDValue CPIdx = CurDAG->getTargetConstantPool(ConstantInt::get( Type::getInt32Ty(*CurDAG->getContext()), Val), diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp index dd90ea9767705..ad00046af17de 100644 --- a/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/lib/Target/XCore/XCoreInstrInfo.cpp @@ -46,33 +46,6 @@ static bool isZeroImm(const MachineOperand &op) { return op.isImm() && op.getImm() == 0; } -/// Return true if the instruction is a register to register move and -/// leave the source and dest operands in the passed parameters. -/// -bool XCoreInstrInfo::isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSR, unsigned &DstSR) const { - SrcSR = DstSR = 0; // No sub-registers. - - // We look for 4 kinds of patterns here: - // add dst, src, 0 - // sub dst, src, 0 - // or dst, src, src - // and dst, src, src - if ((MI.getOpcode() == XCore::ADD_2rus || MI.getOpcode() == XCore::SUB_2rus) - && isZeroImm(MI.getOperand(2))) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - } else if ((MI.getOpcode() == XCore::OR_3r || MI.getOpcode() == XCore::AND_3r) - && MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { - DstReg = MI.getOperand(0).getReg(); - SrcReg = MI.getOperand(1).getReg(); - return true; - } - return false; -} - /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If @@ -437,7 +410,7 @@ bool XCoreInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, it->getFrameIdx(), RC, &RI); if (emitFrameMoves) { MCSymbol *SaveLabel = MF->getContext().CreateTempSymbol(); - BuildMI(MBB, MI, DL, get(XCore::DBG_LABEL)).addSym(SaveLabel); + BuildMI(MBB, MI, DL, get(XCore::PROLOG_LABEL)).addSym(SaveLabel); XFI->getSpillLabels().push_back(std::make_pair(SaveLabel, *it)); } } diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h index e5b0171579fce..d2b116eef0d8f 100644 --- a/lib/Target/XCore/XCoreInstrInfo.h +++ b/lib/Target/XCore/XCoreInstrInfo.h @@ -30,12 +30,6 @@ public: /// virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; } - /// Return true if the instruction is a register to register move and return - /// the source and dest operands and their sub-register indices by reference. 
- virtual bool isMoveInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index 19b9b1f8c00c6..6b3b39ba1d494 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -140,17 +140,7 @@ def immU20 : PatLeaf<(imm), [{ return (uint32_t)N->getZExtValue() < (1 << 20); }]>; -def immMskBitp : PatLeaf<(imm), [{ - uint32_t value = (uint32_t)N->getZExtValue(); - if (!isMask_32(value)) { - return false; - } - int msksize = 32 - CountLeadingZeros_32(value); - return (msksize >= 1 && msksize <= 8) - || msksize == 16 - || msksize == 24 - || msksize == 32; -}]>; +def immMskBitp : PatLeaf<(imm), [{ return immMskBitp(N); }]>; def immBitp : PatLeaf<(imm), [{ uint32_t value = (uint32_t)N->getZExtValue(); diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp index 2a88342180e40..f82e59814e775 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -155,10 +155,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MBB.erase(I); } -unsigned +void XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value, - RegScavenger *RS) const { + int SPAdj, RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); MachineInstr &MI = *II; DebugLoc dl = MI.getDebugLoc(); @@ -291,7 +290,6 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } // Erase old instruction. MBB.erase(II); - return 0; } void @@ -420,7 +418,7 @@ void XCoreRegisterInfo::emitPrologue(MachineFunction &MF) const { // Show update of SP. MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, dl, TII.get(XCore::DBG_LABEL)).addSym(FrameLabel); + BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel); MachineLocation SPDst(MachineLocation::VirtualFP); MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize * 4); @@ -439,7 +437,7 @@ void XCoreRegisterInfo::emitPrologue(MachineFunction &MF) const { if (emitFrameMoves) { MCSymbol *SaveLRLabel = MMI->getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, dl, TII.get(XCore::DBG_LABEL)).addSym(SaveLRLabel); + BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveLRLabel); MachineLocation CSDst(MachineLocation::VirtualFP, LRSpillOffset); MachineLocation CSSrc(XCore::LR); MMI->getFrameMoves().push_back(MachineMove(SaveLRLabel, CSDst, CSSrc)); @@ -455,7 +453,7 @@ void XCoreRegisterInfo::emitPrologue(MachineFunction &MF) const { MBB.addLiveIn(XCore::R10); if (emitFrameMoves) { MCSymbol *SaveR10Label = MMI->getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, dl, TII.get(XCore::DBG_LABEL)).addSym(SaveR10Label); + BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveR10Label); MachineLocation CSDst(MachineLocation::VirtualFP, FPSpillOffset); MachineLocation CSSrc(XCore::R10); MMI->getFrameMoves().push_back(MachineMove(SaveR10Label, CSDst, CSSrc)); @@ -467,7 +465,7 @@ void XCoreRegisterInfo::emitPrologue(MachineFunction &MF) const { if (emitFrameMoves) { // Show FP is now valid. 
MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, dl, TII.get(XCore::DBG_LABEL)).addSym(FrameLabel); + BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel); MachineLocation SPDst(FramePtr); MachineLocation SPSrc(MachineLocation::VirtualFP); MMI->getFrameMoves().push_back(MachineMove(FrameLabel, SPDst, SPSrc)); diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h index 66132ba8ff66f..e636c1c7298aa 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.h +++ b/lib/Target/XCore/XCoreRegisterInfo.h @@ -54,9 +54,8 @@ public: MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; - unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, FrameIndexValue *Value = NULL, - RegScavenger *RS = NULL) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS = NULL) const; diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp index abfa514e20cfe..838d5505490f5 100644 --- a/lib/Transforms/Hello/Hello.cpp +++ b/lib/Transforms/Hello/Hello.cpp @@ -25,7 +25,7 @@ namespace { // Hello - The first implementation, without getAnalysisUsage. struct Hello : public FunctionPass { static char ID; // Pass identification, replacement for typeid - Hello() : FunctionPass(&ID) {} + Hello() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F) { ++HelloCounter; @@ -37,13 +37,13 @@ namespace { } char Hello::ID = 0; -static RegisterPass<Hello> X("hello", "Hello World Pass"); +INITIALIZE_PASS(Hello, "hello", "Hello World Pass", false, false); namespace { // Hello2 - The second implementation with getAnalysisUsage implemented. struct Hello2 : public FunctionPass { static char ID; // Pass identification, replacement for typeid - Hello2() : FunctionPass(&ID) {} + Hello2() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F) { ++HelloCounter; @@ -60,5 +60,6 @@ namespace { } char Hello2::ID = 0; -static RegisterPass<Hello2> -Y("hello2", "Hello World Pass (with getAnalysisUsage implemented)"); +INITIALIZE_PASS(Hello2, "hello2", + "Hello World Pass (with getAnalysisUsage implemented)", + false, false); diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 28ea079abd360..0c77e1fd8cff4 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -67,7 +67,7 @@ namespace { virtual bool runOnSCC(CallGraphSCC &SCC); static char ID; // Pass identification, replacement for typeid explicit ArgPromotion(unsigned maxElements = 3) - : CallGraphSCCPass(&ID), maxElements(maxElements) {} + : CallGraphSCCPass(ID), maxElements(maxElements) {} /// A vector used to hold the indices of a single GEP instruction typedef std::vector<uint64_t> IndicesVector; @@ -84,8 +84,8 @@ namespace { } char ArgPromotion::ID = 0; -static RegisterPass<ArgPromotion> -X("argpromotion", "Promote 'by reference' arguments to scalars"); +INITIALIZE_PASS(ArgPromotion, "argpromotion", + "Promote 'by reference' arguments to scalars", false, false); Pass *llvm::createArgumentPromotionPass(unsigned maxElements) { return new ArgPromotion(maxElements); @@ -208,8 +208,8 @@ static bool AllCalleesPassInValidPointerForArgument(Argument *Arg) { // have direct callees. 
for (Value::use_iterator UI = Callee->use_begin(), E = Callee->use_end(); UI != E; ++UI) { - CallSite CS = CallSite::get(*UI); - assert(CS.getInstruction() && "Should only have direct calls!"); + CallSite CS(*UI); + assert(CS && "Should only have direct calls!"); if (!IsAlwaysValidPointer(CS.getArgument(ArgNo))) return false; @@ -619,14 +619,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // Get a new callgraph node for NF. CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF); - // Loop over all of the callers of the function, transforming the call sites // to pass in the loaded pointers. // SmallVector<Value*, 16> Args; while (!F->use_empty()) { - CallSite CS = CallSite::get(F->use_back()); + CallSite CS(F->use_back()); assert(CS.getCalledFunction() == F); Instruction *Call = CS.getInstruction(); const AttrListPtr &CallPAL = CS.getAttributes(); diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp index 3c05f88027a62..64e8d792dc3ad 100644 --- a/lib/Transforms/IPO/ConstantMerge.cpp +++ b/lib/Transforms/IPO/ConstantMerge.cpp @@ -19,10 +19,12 @@ #define DEBUG_TYPE "constmerge" #include "llvm/Transforms/IPO.h" +#include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Module.h" #include "llvm/Pass.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" using namespace llvm; @@ -31,7 +33,7 @@ STATISTIC(NumMerged, "Number of global constants merged"); namespace { struct ConstantMerge : public ModulePass { static char ID; // Pass identification, replacement for typeid - ConstantMerge() : ModulePass(&ID) {} + ConstantMerge() : ModulePass(ID) {} // run - For this pass, process all of the globals in the module, // eliminating duplicate constants. @@ -41,12 +43,32 @@ namespace { } char ConstantMerge::ID = 0; -static RegisterPass<ConstantMerge> -X("constmerge", "Merge Duplicate Global Constants"); +INITIALIZE_PASS(ConstantMerge, "constmerge", + "Merge Duplicate Global Constants", false, false); ModulePass *llvm::createConstantMergePass() { return new ConstantMerge(); } + + +/// Find values that are marked as llvm.used. +static void FindUsedValues(GlobalVariable *LLVMUsed, + SmallPtrSet<const GlobalValue*, 8> &UsedValues) { + if (LLVMUsed == 0) return; + ConstantArray *Inits = dyn_cast<ConstantArray>(LLVMUsed->getInitializer()); + if (Inits == 0) return; + + for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) + if (GlobalValue *GV = + dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts())) + UsedValues.insert(GV); +} + bool ConstantMerge::runOnModule(Module &M) { + // Find all the globals that are marked "used". These cannot be merged. + SmallPtrSet<const GlobalValue*, 8> UsedGlobals; + FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals); + FindUsedValues(M.getGlobalVariable("llvm.compiler.used"), UsedGlobals); + // Map unique constant/section pairs to globals. We don't want to merge // globals in different sections. DenseMap<Constant*, GlobalVariable*> CMap; @@ -79,9 +101,13 @@ bool ConstantMerge::runOnModule(Module &M) { // Only process constants with initializers in the default addres space. if (!GV->isConstant() ||!GV->hasDefinitiveInitializer() || - GV->getType()->getAddressSpace() != 0 || !GV->getSection().empty()) + GV->getType()->getAddressSpace() != 0 || !GV->getSection().empty() || + // Don't touch values marked with attribute(used). 
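// (UsedGlobals was filled from @llvm.used / @llvm.compiler.used above; in
//  IR such a marker looks like, for example:
//      @llvm.used = appending global [1 x i8*]
//                   [i8* bitcast (i32* @g to i8*)], section "llvm.metadata"
//  where @g is whatever global carries attribute((used)).)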
+ UsedGlobals.count(GV)) continue; + + Constant *Init = GV->getInitializer(); // Check to see if the initializer is already known. diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 475eee8b19e4b..47df235424e2f 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -122,11 +122,11 @@ namespace { protected: // DAH uses this to specify a different ID. - explicit DAE(void *ID) : ModulePass(ID) {} + explicit DAE(char &ID) : ModulePass(ID) {} public: static char ID; // Pass identification, replacement for typeid - DAE() : ModulePass(&ID) {} + DAE() : ModulePass(ID) {} bool runOnModule(Module &M); @@ -151,8 +151,7 @@ namespace { char DAE::ID = 0; -static RegisterPass<DAE> -X("deadargelim", "Dead Argument Elimination"); +INITIALIZE_PASS(DAE, "deadargelim", "Dead Argument Elimination", false, false); namespace { /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but @@ -160,15 +159,16 @@ namespace { /// by bugpoint. struct DAH : public DAE { static char ID; - DAH() : DAE(&ID) {} + DAH() : DAE(ID) {} virtual bool ShouldHackArguments() const { return true; } }; } char DAH::ID = 0; -static RegisterPass<DAH> -Y("deadarghaX0r", "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)"); +INITIALIZE_PASS(DAH, "deadarghaX0r", + "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)", + false, false); /// createDeadArgEliminationPass - This pass removes arguments from functions /// which are not used by the body of the function. @@ -220,11 +220,11 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { // std::vector<Value*> Args; while (!Fn.use_empty()) { - CallSite CS = CallSite::get(Fn.use_back()); + CallSite CS(Fn.use_back()); Instruction *Call = CS.getInstruction(); // Pass all the same arguments. - Args.assign(CS.arg_begin(), CS.arg_begin()+NumArgs); + Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs); // Drop any attributes that were on the vararg arguments. AttrListPtr PAL = CS.getAttributes(); @@ -250,8 +250,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { if (cast<CallInst>(Call)->isTailCall()) cast<CallInst>(New)->setTailCall(); } - if (MDNode *N = Call->getDbgMetadata()) - New->setDbgMetadata(N); + New->setDebugLoc(Call->getDebugLoc()); Args.clear(); @@ -725,7 +724,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { // std::vector<Value*> Args; while (!F->use_empty()) { - CallSite CS = CallSite::get(F->use_back()); + CallSite CS(F->use_back()); Instruction *Call = CS.getInstruction(); AttributesVec.clear(); @@ -780,8 +779,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { if (cast<CallInst>(Call)->isTailCall()) cast<CallInst>(New)->setTailCall(); } - if (MDNode *N = Call->getDbgMetadata()) - New->setDbgMetadata(N); + New->setDebugLoc(Call->getDebugLoc()); Args.clear(); diff --git a/lib/Transforms/IPO/DeadTypeElimination.cpp b/lib/Transforms/IPO/DeadTypeElimination.cpp index 662fbb5cd4130..5dc50c5bef32f 100644 --- a/lib/Transforms/IPO/DeadTypeElimination.cpp +++ b/lib/Transforms/IPO/DeadTypeElimination.cpp @@ -26,7 +26,7 @@ STATISTIC(NumKilled, "Number of unused typenames removed from symtab"); namespace { struct DTE : public ModulePass { static char ID; // Pass identification, replacement for typeid - DTE() : ModulePass(&ID) {} + DTE() : ModulePass(ID) {} // doPassInitialization - For this pass, it removes global symbol table // entries for primitive types. 
These are never used for linking in GCC and @@ -45,7 +45,7 @@ namespace { } char DTE::ID = 0; -static RegisterPass<DTE> X("deadtypeelim", "Dead Type Elimination"); +INITIALIZE_PASS(DTE, "deadtypeelim", "Dead Type Elimination", false, false); ModulePass *llvm::createDeadTypeEliminationPass() { return new DTE(); diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp index 7f67e48ade839..45c5fe76ba7c5 100644 --- a/lib/Transforms/IPO/ExtractGV.cpp +++ b/lib/Transforms/IPO/ExtractGV.cpp @@ -17,15 +17,15 @@ #include "llvm/Pass.h" #include "llvm/Constants.h" #include "llvm/Transforms/IPO.h" +#include "llvm/ADT/SetVector.h" #include <algorithm> using namespace llvm; namespace { /// @brief A pass to extract specific functions and their dependencies. class GVExtractorPass : public ModulePass { - std::vector<GlobalValue*> Named; + SetVector<GlobalValue *> Named; bool deleteStuff; - bool reLink; public: static char ID; // Pass identification, replacement for typeid @@ -33,135 +33,42 @@ namespace { /// specified function. Otherwise, it deletes as much of the module as /// possible, except for the function specified. /// - explicit GVExtractorPass(std::vector<GlobalValue*>& GVs, bool deleteS = true, - bool relinkCallees = false) - : ModulePass(&ID), Named(GVs), deleteStuff(deleteS), - reLink(relinkCallees) {} + explicit GVExtractorPass(std::vector<GlobalValue*>& GVs, bool deleteS = true) + : ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS) {} bool runOnModule(Module &M) { - if (Named.size() == 0) { - return false; // Nothing to extract - } - - - if (deleteStuff) - return deleteGV(); - M.setModuleInlineAsm(""); - return isolateGV(M); - } - - bool deleteGV() { - for (std::vector<GlobalValue*>::iterator GI = Named.begin(), - GE = Named.end(); GI != GE; ++GI) { - if (Function* NamedFunc = dyn_cast<Function>(*GI)) { - // If we're in relinking mode, set linkage of all internal callees to - // external. This will allow us extract function, and then - link - // everything together - if (reLink) { - for (Function::iterator B = NamedFunc->begin(), BE = NamedFunc->end(); - B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), E = B->end(); - I != E; ++I) { - if (CallInst* callInst = dyn_cast<CallInst>(&*I)) { - Function* Callee = callInst->getCalledFunction(); - if (Callee && Callee->hasLocalLinkage()) - Callee->setLinkage(GlobalValue::ExternalLinkage); - } - } - } - } - - NamedFunc->setLinkage(GlobalValue::ExternalLinkage); - NamedFunc->deleteBody(); - assert(NamedFunc->isDeclaration() && "This didn't make the function external!"); - } else { - if (!(*GI)->isDeclaration()) { - cast<GlobalVariable>(*GI)->setInitializer(0); //clear the initializer - (*GI)->setLinkage(GlobalValue::ExternalLinkage); - } - } - } - return true; - } - - bool isolateGV(Module &M) { - // Mark all globals internal - // FIXME: what should we do with private linkage? - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) + // Visit the global inline asm. + if (!deleteStuff) + M.setModuleInlineAsm(""); + + // For simplicity, just give all GlobalValues ExternalLinkage. A trickier + // implementation could figure out which GlobalValues are actually + // referenced by the Named set, and which GlobalValues in the rest of + // the module are referenced by the NamedSet, and get away with leaving + // more internal and private things internal and private. But for now, + // be conservative and simple. + + // Visit the GlobalVariables. 
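// (In the loops below, the test 'deleteStuff == Named.count(I)' is true for
//  exactly the values whose bodies must be dropped: the named set when
//  deleting, and everything outside it when extracting.)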
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) if (!I->isDeclaration()) { - I->setLinkage(GlobalValue::InternalLinkage); + if (I->hasLocalLinkage()) + I->setVisibility(GlobalValue::HiddenVisibility); + I->setLinkage(GlobalValue::ExternalLinkage); + if (deleteStuff == Named.count(I)) + I->setInitializer(0); } + + // Visit the Functions. for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) if (!I->isDeclaration()) { - I->setLinkage(GlobalValue::InternalLinkage); - } - - // Make sure our result is globally accessible... - // by putting them in the used array - { - std::vector<Constant *> AUGs; - const Type *SBP= - Type::getInt8PtrTy(M.getContext()); - for (std::vector<GlobalValue*>::iterator GI = Named.begin(), - GE = Named.end(); GI != GE; ++GI) { - (*GI)->setLinkage(GlobalValue::ExternalLinkage); - AUGs.push_back(ConstantExpr::getBitCast(*GI, SBP)); - } - ArrayType *AT = ArrayType::get(SBP, AUGs.size()); - Constant *Init = ConstantArray::get(AT, AUGs); - GlobalValue *gv = new GlobalVariable(M, AT, false, - GlobalValue::AppendingLinkage, - Init, "llvm.used"); - gv->setSection("llvm.metadata"); - } - - // All of the functions may be used by global variables or the named - // globals. Loop through them and create a new, external functions that - // can be "used", instead of ones with bodies. - std::vector<Function*> NewFunctions; - - Function *Last = --M.end(); // Figure out where the last real fn is. - - for (Module::iterator I = M.begin(); ; ++I) { - if (std::find(Named.begin(), Named.end(), &*I) == Named.end()) { - Function *New = Function::Create(I->getFunctionType(), - GlobalValue::ExternalLinkage); - New->copyAttributesFrom(I); - - // If it's not the named function, delete the body of the function - I->dropAllReferences(); - - M.getFunctionList().push_back(New); - NewFunctions.push_back(New); - New->takeName(I); + if (I->hasLocalLinkage()) + I->setVisibility(GlobalValue::HiddenVisibility); + I->setLinkage(GlobalValue::ExternalLinkage); + if (deleteStuff == Named.count(I)) + I->deleteBody(); } - if (&*I == Last) break; // Stop after processing the last function - } - - // Now that we have replacements all set up, loop through the module, - // deleting the old functions, replacing them with the newly created - // functions. - if (!NewFunctions.empty()) { - unsigned FuncNum = 0; - Module::iterator I = M.begin(); - do { - if (std::find(Named.begin(), Named.end(), &*I) == Named.end()) { - // Make everything that uses the old function use the new dummy fn - I->replaceAllUsesWith(NewFunctions[FuncNum++]); - - Function *Old = I; - ++I; // Move the iterator to the new function - - // Delete the old function! 
- M.getFunctionList().erase(Old); - - } else { - ++I; // Skip the function we are extracting - } - } while (&*I != NewFunctions[0]); - } - return true; } }; @@ -170,6 +77,6 @@ namespace { } ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue*>& GVs, - bool deleteFn, bool relinkCallees) { - return new GVExtractorPass(GVs, deleteFn, relinkCallees); + bool deleteFn) { + return new GVExtractorPass(GVs, deleteFn); } diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index 9bd7af61c531f..6165ba023f737 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -41,7 +41,7 @@ STATISTIC(NumNoAlias, "Number of function returns marked noalias"); namespace { struct FunctionAttrs : public CallGraphSCCPass { static char ID; // Pass identification, replacement for typeid - FunctionAttrs() : CallGraphSCCPass(&ID) {} + FunctionAttrs() : CallGraphSCCPass(ID) {} // runOnSCC - Analyze the SCC, performing the transformation if possible. bool runOnSCC(CallGraphSCC &SCC); @@ -69,8 +69,8 @@ namespace { } char FunctionAttrs::ID = 0; -static RegisterPass<FunctionAttrs> -X("functionattrs", "Deduce function attributes"); +INITIALIZE_PASS(FunctionAttrs, "functionattrs", + "Deduce function attributes", false, false); Pass *llvm::createFunctionAttrsPass() { return new FunctionAttrs(); } @@ -162,14 +162,14 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { // Some instructions can be ignored even if they read or write memory. // Detect these now, skipping to the next instruction if one is found. - CallSite CS = CallSite::get(I); - if (CS.getInstruction() && CS.getCalledFunction()) { + CallSite CS(cast<Value>(I)); + if (CS && CS.getCalledFunction()) { // Ignore calls to functions in the same SCC. if (SCCNodes.count(CS.getCalledFunction())) continue; // Ignore intrinsics that only access local memory. if (unsigned id = CS.getCalledFunction()->getIntrinsicID()) - if (AliasAnalysis::getModRefBehavior(id) == + if (AliasAnalysis::getIntrinsicModRefBehavior(id) == AliasAnalysis::AccessesArguments) { // Check that all pointer arguments point to local memory. for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index 44216a6df99c4..aa18601b9aeca 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -31,7 +31,7 @@ STATISTIC(NumVariables, "Number of global variables removed"); namespace { struct GlobalDCE : public ModulePass { static char ID; // Pass identification, replacement for typeid - GlobalDCE() : ModulePass(&ID) {} + GlobalDCE() : ModulePass(ID) {} // run - Do the GlobalDCE pass on the specified module, optionally updating // the specified callgraph to reflect the changes. 
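A change repeated across the passes in this commit: constructors now hand the pass ID to the base class by reference (ModulePass(ID) instead of ModulePass(&ID)), and the static RegisterPass<> object becomes an INITIALIZE_PASS macro invocation. A minimal sketch of the new shape, with a hypothetical pass name:

namespace {
  struct DemoPass : public FunctionPass {
    static char ID;  // the *address* of ID identifies the pass
    DemoPass() : FunctionPass(ID) {}
    virtual bool runOnFunction(Function &F) { return false; }
  };
}

char DemoPass::ID = 0;
INITIALIZE_PASS(DemoPass, "demopass", "Demonstration Pass",
                false /*Only looks at CFG*/, false /*Analysis Pass*/);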
@@ -51,7 +51,8 @@ namespace { } char GlobalDCE::ID = 0; -static RegisterPass<GlobalDCE> X("globaldce", "Dead Global Elimination"); +INITIALIZE_PASS(GlobalDCE, "globaldce", + "Dead Global Elimination", false, false); ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCE(); } diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 735a1c47c39b5..a77af549caa13 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -59,7 +59,7 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { } static char ID; // Pass identification, replacement for typeid - GlobalOpt() : ModulePass(&ID) {} + GlobalOpt() : ModulePass(ID) {} bool runOnModule(Module &M); @@ -74,7 +74,8 @@ namespace { } char GlobalOpt::ID = 0; -static RegisterPass<GlobalOpt> X("globalopt", "Global Variable Optimizer"); +INITIALIZE_PASS(GlobalOpt, "globalopt", + "Global Variable Optimizer", false, false); ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); } @@ -1467,7 +1468,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, TargetData *TD) { if (!TD) return false; - + // If this is a malloc of an abstract type, don't touch it. if (!AllocTy->isSized()) return false; @@ -2077,7 +2078,7 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) { return false; // The first index must be zero. - ConstantInt *CI = dyn_cast<ConstantInt>(*next(CE->op_begin())); + ConstantInt *CI = dyn_cast<ConstantInt>(*llvm::next(CE->op_begin())); if (!CI || !CI->isZero()) return false; // The remaining indices must be compile-time known integers within the @@ -2302,7 +2303,8 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal, if (isa<InlineAsm>(CI->getCalledValue())) return false; // Resolve function pointers. - Function *Callee = dyn_cast<Function>(getVal(Values, CI->getCalledValue())); + Function *Callee = dyn_cast<Function>(getVal(Values, + CI->getCalledValue())); if (!Callee) return false; // Cannot resolve. SmallVector<Constant*, 8> Formals; diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp index e4db235b1d108..1b3cf7810cc68 100644 --- a/lib/Transforms/IPO/IPConstantPropagation.cpp +++ b/lib/Transforms/IPO/IPConstantPropagation.cpp @@ -35,7 +35,7 @@ namespace { /// struct IPCP : public ModulePass { static char ID; // Pass identification, replacement for typeid - IPCP() : ModulePass(&ID) {} + IPCP() : ModulePass(ID) {} bool runOnModule(Module &M); private: @@ -45,8 +45,8 @@ namespace { } char IPCP::ID = 0; -static RegisterPass<IPCP> -X("ipconstprop", "Interprocedural constant propagation"); +INITIALIZE_PASS(IPCP, "ipconstprop", + "Interprocedural constant propagation", false, false); ModulePass *llvm::createIPConstantPropagationPass() { return new IPCP(); } @@ -94,7 +94,7 @@ bool IPCP::PropagateConstantsIntoArguments(Function &F) { if (!isa<CallInst>(U) && !isa<InvokeInst>(U)) return false; - CallSite CS = CallSite::get(cast<Instruction>(U)); + CallSite CS(cast<Instruction>(U)); if (!CS.isCallee(UI)) return false; @@ -219,7 +219,7 @@ bool IPCP::PropagateConstantReturn(Function &F) { // constant. 
bool MadeChange = false; for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) { - CallSite CS = CallSite::get(*UI); + CallSite CS(*UI); Instruction* Call = CS.getInstruction(); // Not a call instruction or a call instruction that's not calling F diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp index 8e312e7d91855..ecc60ad069325 100644 --- a/lib/Transforms/IPO/InlineAlways.cpp +++ b/lib/Transforms/IPO/InlineAlways.cpp @@ -36,7 +36,7 @@ namespace { InlineCostAnalyzer CA; public: // Use extremely low threshold. - AlwaysInliner() : Inliner(&ID, -2000000000) {} + AlwaysInliner() : Inliner(ID, -2000000000) {} static char ID; // Pass identification, replacement for typeid InlineCost getInlineCost(CallSite CS) { return CA.getInlineCost(CS, NeverInline); @@ -61,8 +61,8 @@ namespace { } char AlwaysInliner::ID = 0; -static RegisterPass<AlwaysInliner> -X("always-inline", "Inliner for always_inline functions"); +INITIALIZE_PASS(AlwaysInliner, "always-inline", + "Inliner for always_inline functions", false, false); Pass *llvm::createAlwaysInlinerPass() { return new AlwaysInliner(); } diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp index 74b4a1c10ece2..9c6637dfe5ad6 100644 --- a/lib/Transforms/IPO/InlineSimple.cpp +++ b/lib/Transforms/IPO/InlineSimple.cpp @@ -33,8 +33,8 @@ namespace { SmallPtrSet<const Function*, 16> NeverInline; InlineCostAnalyzer CA; public: - SimpleInliner() : Inliner(&ID) {} - SimpleInliner(int Threshold) : Inliner(&ID, Threshold) {} + SimpleInliner() : Inliner(ID) {} + SimpleInliner(int Threshold) : Inliner(ID, Threshold) {} static char ID; // Pass identification, replacement for typeid InlineCost getInlineCost(CallSite CS) { return CA.getInlineCost(CS, NeverInline); @@ -56,8 +56,8 @@ namespace { } char SimpleInliner::ID = 0; -static RegisterPass<SimpleInliner> -X("inline", "Function Integration/Inlining"); +INITIALIZE_PASS(SimpleInliner, "inline", + "Function Integration/Inlining", false, false); Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); } diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 9bb01f5699fe3..4983e8e13a3ee 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -48,10 +48,10 @@ HintThreshold("inlinehint-threshold", cl::Hidden, cl::init(325), // Threshold to use when optsize is specified (and there is no -inline-limit). const int OptSizeThreshold = 75; -Inliner::Inliner(void *ID) +Inliner::Inliner(char &ID) : CallGraphSCCPass(ID), InlineThreshold(InlineLimit) {} -Inliner::Inliner(void *ID, int Threshold) +Inliner::Inliner(char &ID, int Threshold) : CallGraphSCCPass(ID), InlineThreshold(Threshold) {} /// getAnalysisUsage - For this class, we declare that we require and preserve @@ -238,11 +238,11 @@ bool Inliner::shouldInline(CallSite CS) { bool someOuterCallWouldNotBeInlined = false; for (Value::use_iterator I = Caller->use_begin(), E =Caller->use_end(); I != E; ++I) { - CallSite CS2 = CallSite::get(*I); + CallSite CS2(*I); // If this isn't a call to Caller (it could be some other sort // of reference) skip it. 
- if (CS2.getInstruction() == 0 || CS2.getCalledFunction() != Caller) + if (!CS2 || CS2.getCalledFunction() != Caller) continue; InlineCost IC2 = getInlineCost(CS2); @@ -334,10 +334,10 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - CallSite CS = CallSite::get(I); + CallSite CS(cast<Value>(I)); // If this isn't a call, or it is a call to an intrinsic, it can // never be inlined. - if (CS.getInstruction() == 0 || isa<IntrinsicInst>(I)) + if (!CS || isa<IntrinsicInst>(I)) continue; // If this is a direct call to an external function, we can never inline diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp index 47abb7dfd8121..a1d919fd8a042 100644 --- a/lib/Transforms/IPO/Internalize.cpp +++ b/lib/Transforms/IPO/Internalize.cpp @@ -63,11 +63,11 @@ namespace { } // end anonymous namespace char InternalizePass::ID = 0; -static RegisterPass<InternalizePass> -X("internalize", "Internalize Global Symbols"); +INITIALIZE_PASS(InternalizePass, "internalize", + "Internalize Global Symbols", false, false); InternalizePass::InternalizePass(bool AllButMain) - : ModulePass(&ID), AllButMain(AllButMain){ + : ModulePass(ID), AllButMain(AllButMain){ if (!APIFile.empty()) // If a filename is specified, use it. LoadFile(APIFile.c_str()); if (!APIList.empty()) // If a list is specified, use it as well. @@ -75,7 +75,7 @@ InternalizePass::InternalizePass(bool AllButMain) } InternalizePass::InternalizePass(const std::vector<const char *>&exportList) - : ModulePass(&ID), AllButMain(false){ + : ModulePass(ID), AllButMain(false){ for(std::vector<const char *>::const_iterator itr = exportList.begin(); itr != exportList.end(); itr++) { ExternalNames.insert(*itr); diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp index cb813303facb9..f88dff67d7c98 100644 --- a/lib/Transforms/IPO/LoopExtractor.cpp +++ b/lib/Transforms/IPO/LoopExtractor.cpp @@ -37,7 +37,7 @@ namespace { unsigned NumLoops; explicit LoopExtractor(unsigned numLoops = ~0) - : LoopPass(&ID), NumLoops(numLoops) {} + : LoopPass(ID), NumLoops(numLoops) {} virtual bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -50,8 +50,8 @@ namespace { } char LoopExtractor::ID = 0; -static RegisterPass<LoopExtractor> -X("loop-extract", "Extract loops into new functions"); +INITIALIZE_PASS(LoopExtractor, "loop-extract", + "Extract loops into new functions", false, false); namespace { /// SingleLoopExtractor - For bugpoint. @@ -62,8 +62,8 @@ namespace { } // End anonymous namespace char SingleLoopExtractor::ID = 0; -static RegisterPass<SingleLoopExtractor> -Y("loop-extract-single", "Extract at most one loop into a new function"); +INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single", + "Extract at most one loop into a new function", false, false); // createLoopExtractorPass - This pass extracts all natural loops from the // program into a function if it can. 
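Several of the preceding hunks also retire CallSite::get(V) in favor of the CallSite constructor, whose operator bool subsumes the old getInstruction() == 0 check. A sketch of the new idiom for scanning a basic block (BB here is assumed to be a BasicBlock*):

for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
  CallSite CS(cast<Value>(I));       // null CallSite unless I is a call/invoke
  if (!CS || isa<IntrinsicInst>(I))  // operator bool tests the instruction
    continue;
  // ... inspect CS.getCalledFunction(), CS.arg_begin(), CS.arg_end(), ...
}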
@@ -147,27 +147,26 @@ namespace { std::vector<std::pair<std::string, std::string> > BlocksToNotExtractByName; public: static char ID; // Pass identification, replacement for typeid - explicit BlockExtractorPass(const std::vector<BasicBlock*> &B) - : ModulePass(&ID), BlocksToNotExtract(B) { + BlockExtractorPass() : ModulePass(ID) { if (!BlockFile.empty()) LoadFile(BlockFile.c_str()); } - BlockExtractorPass() : ModulePass(&ID) {} bool runOnModule(Module &M); }; } char BlockExtractorPass::ID = 0; -static RegisterPass<BlockExtractorPass> -XX("extract-blocks", "Extract Basic Blocks From Module (for bugpoint use)"); +INITIALIZE_PASS(BlockExtractorPass, "extract-blocks", + "Extract Basic Blocks From Module (for bugpoint use)", + false, false); // createBlockExtractorPass - This pass extracts all blocks (except those // specified in the argument list) from the functions in the module. // -ModulePass *llvm::createBlockExtractorPass(const std::vector<BasicBlock*> &BTNE) +ModulePass *llvm::createBlockExtractorPass() { - return new BlockExtractorPass(BTNE); + return new BlockExtractorPass(); } void BlockExtractorPass::LoadFile(const char *Filename) { diff --git a/lib/Transforms/IPO/LowerSetJmp.cpp b/lib/Transforms/IPO/LowerSetJmp.cpp index 76cfef8335c98..6c715de04b763 100644 --- a/lib/Transforms/IPO/LowerSetJmp.cpp +++ b/lib/Transforms/IPO/LowerSetJmp.cpp @@ -109,7 +109,7 @@ namespace { bool IsTransformableFunction(StringRef Name); public: static char ID; // Pass identification, replacement for typeid - LowerSetJmp() : ModulePass(&ID) {} + LowerSetJmp() : ModulePass(ID) {} void visitCallInst(CallInst& CI); void visitInvokeInst(InvokeInst& II); @@ -122,7 +122,7 @@ namespace { } // end anonymous namespace char LowerSetJmp::ID = 0; -static RegisterPass<LowerSetJmp> X("lowersetjmp", "Lower Set Jump"); +INITIALIZE_PASS(LowerSetJmp, "lowersetjmp", "Lower Set Jump", false, false); // run - Run the transformation on the program. We grab the function // prototypes for longjmp and setjmp. If they are used in the program, diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index aeeafe7fd19dc..5d838f98aa082 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -29,44 +29,27 @@ // // Many functions have their address taken by the virtual function table for // the object they belong to. However, as long as it's only used for a lookup -// and call, this is irrelevant, and we'd like to fold such implementations. +// and call, this is irrelevant, and we'd like to fold such functions. // -// * use SCC to cut down on pair-wise comparisons and solve larger cycles. +// * switch from n^2 pair-wise comparisons to an n-way comparison for each +// bucket. // -// The current implementation loops over a pair-wise comparison of all -// functions in the program where the two functions in the pair are treated as -// assumed to be equal until proven otherwise. We could both use fewer -// comparisons and optimize more complex cases if we used strongly connected -// components of the call graph. -// -// * be smarter about bitcast. +// * be smarter about bitcasts. // // In order to fold functions, we will sometimes add either bitcast instructions // or bitcast constant expressions. Unfortunately, this can confound further // analysis since the two functions differ where one has a bitcast and the -// other doesn't. We should learn to peer through bitcasts without imposing bad -// performance properties. -// -// * don't emit aliases for Mach-O. 
-// -// Mach-O doesn't support aliases which means that we must avoid introducing -// them in the bitcode on architectures which don't support them, such as -// Mac OSX. There's a few approaches to this problem; -// a) teach codegen to lower global aliases to thunks on platforms which don't -// support them. -// b) always emit thunks, and create a separate thunk-to-alias pass which -// runs on ELF systems. This has the added benefit of transforming other -// thunks such as those produced by a C++ frontend into aliases when legal -// to do so. +// other doesn't. We should learn to look through bitcasts. // //===----------------------------------------------------------------------===// #define DEBUG_TYPE "mergefunc" #include "llvm/Transforms/IPO.h" -#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Constants.h" #include "llvm/InlineAsm.h" #include "llvm/Instructions.h" @@ -76,68 +59,103 @@ #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" -#include <map> #include <vector> using namespace llvm; STATISTIC(NumFunctionsMerged, "Number of functions merged"); namespace { + /// MergeFunctions finds functions which will generate identical machine code, + /// by considering all pointer types to be equivalent. Once identified, + /// MergeFunctions will fold them by replacing a call to one with a call to a + /// bitcast of the other. + /// class MergeFunctions : public ModulePass { public: - static char ID; // Pass identification, replacement for typeid - MergeFunctions() : ModulePass(&ID) {} + static char ID; + MergeFunctions() : ModulePass(ID) {} bool runOnModule(Module &M); private: - bool isEquivalentGEP(const GetElementPtrInst *GEP1, - const GetElementPtrInst *GEP2); - - bool equals(const BasicBlock *BB1, const BasicBlock *BB2); - bool equals(const Function *F, const Function *G); + /// MergeTwoFunctions - Merge two equivalent functions. Upon completion, G + /// may be deleted, or may be converted into a thunk. In either case, it + /// should never be visited again. + void MergeTwoFunctions(Function *F, Function *G) const; - bool compare(const Value *V1, const Value *V2); + /// WriteThunk - Replace G with a simple tail call to bitcast(F). Also + /// replace direct uses of G with bitcast(F). + void WriteThunk(Function *F, Function *G) const; - const Function *LHS, *RHS; - typedef DenseMap<const Value *, unsigned long> IDMap; - IDMap Map; - DenseMap<const Function *, IDMap> Domains; - DenseMap<const Function *, unsigned long> DomainCount; TargetData *TD; }; } char MergeFunctions::ID = 0; -static RegisterPass<MergeFunctions> X("mergefunc", "Merge Functions"); +INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false); ModulePass *llvm::createMergeFunctionsPass() { return new MergeFunctions(); } -// ===----------------------------------------------------------------------=== -// Comparison of functions -// ===----------------------------------------------------------------------=== +namespace { +/// FunctionComparator - Compares two functions to determine whether or not +/// they will generate machine code with the same behaviour. TargetData is +/// used if available.
The comparator always fails conservatively (erring on the +/// side of claiming that two functions are different). +class FunctionComparator { +public: + FunctionComparator(const TargetData *TD, const Function *F1, + const Function *F2) + : F1(F1), F2(F2), TD(TD), IDMap1Count(0), IDMap2Count(0) {} + + /// Compare - test whether the two functions have equivalent behaviour. + bool Compare(); + +private: + /// Compare - test whether two basic blocks have equivalent behaviour. + bool Compare(const BasicBlock *BB1, const BasicBlock *BB2); + + /// Enumerate - Assign or look up previously assigned numbers for the two + /// values, and return whether the numbers are equal. Numbers are assigned in + /// the order visited. + bool Enumerate(const Value *V1, const Value *V2); + + /// isEquivalentOperation - Compare two Instructions for equivalence, similar + /// to Instruction::isSameOperationAs but with modifications to the type + /// comparison. + bool isEquivalentOperation(const Instruction *I1, + const Instruction *I2) const; + + /// isEquivalentGEP - Compare two GEPs for equivalent pointer arithmetic. + bool isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2); + bool isEquivalentGEP(const GetElementPtrInst *GEP1, + const GetElementPtrInst *GEP2) { + return isEquivalentGEP(cast<GEPOperator>(GEP1), cast<GEPOperator>(GEP2)); + } -static unsigned long hash(const Function *F) { - const FunctionType *FTy = F->getFunctionType(); + /// isEquivalentType - Compare two Types, treating all pointer types as equal. + bool isEquivalentType(const Type *Ty1, const Type *Ty2) const; - FoldingSetNodeID ID; - ID.AddInteger(F->size()); - ID.AddInteger(F->getCallingConv()); - ID.AddBoolean(F->hasGC()); - ID.AddBoolean(FTy->isVarArg()); - ID.AddInteger(FTy->getReturnType()->getTypeID()); - for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) - ID.AddInteger(FTy->getParamType(i)->getTypeID()); - return ID.ComputeHash(); + // The two functions undergoing comparison. + const Function *F1, *F2; + + const TargetData *TD; + + typedef DenseMap<const Value *, unsigned long> IDMap; + IDMap Map1, Map2; + unsigned long IDMap1Count, IDMap2Count; +}; } -/// isEquivalentType - any two pointers are equivalent. Otherwise, standard -/// type equivalence rules apply. -static bool isEquivalentType(const Type *Ty1, const Type *Ty2) { +/// isEquivalentType - any two pointers in the same address space are +/// equivalent. Otherwise, standard type equivalence rules apply. +bool FunctionComparator::isEquivalentType(const Type *Ty1, + const Type *Ty2) const { if (Ty1 == Ty2) return true; if (Ty1->getTypeID() != Ty2->getTypeID()) @@ -184,21 +202,6 @@ static bool isEquivalentType(const Type *Ty1, const Type *Ty2) { return true; } - case Type::UnionTyID: { - const UnionType *UTy1 = cast<UnionType>(Ty1); - const UnionType *UTy2 = cast<UnionType>(Ty2); - - // TODO: we could be fancy with union(A, union(A, B)) === union(A, B), etc. 
- if (UTy1->getNumElements() != UTy2->getNumElements()) - return false; - - for (unsigned i = 0, e = UTy1->getNumElements(); i != e; ++i) { - if (!isEquivalentType(UTy1->getElementType(i), UTy2->getElementType(i))) - return false; - } - return true; - } - case Type::FunctionTyID: { const FunctionType *FTy1 = cast<FunctionType>(Ty1); const FunctionType *FTy2 = cast<FunctionType>(Ty2); @@ -216,11 +219,18 @@ static bool isEquivalentType(const Type *Ty1, const Type *Ty2) { return true; } - case Type::ArrayTyID: + case Type::ArrayTyID: { + const ArrayType *ATy1 = cast<ArrayType>(Ty1); + const ArrayType *ATy2 = cast<ArrayType>(Ty2); + return ATy1->getNumElements() == ATy2->getNumElements() && + isEquivalentType(ATy1->getElementType(), ATy2->getElementType()); + } + case Type::VectorTyID: { - const SequentialType *STy1 = cast<SequentialType>(Ty1); - const SequentialType *STy2 = cast<SequentialType>(Ty2); - return isEquivalentType(STy1->getElementType(), STy2->getElementType()); + const VectorType *VTy1 = cast<VectorType>(Ty1); + const VectorType *VTy2 = cast<VectorType>(Ty2); + return VTy1->getNumElements() == VTy2->getNumElements() && + isEquivalentType(VTy1->getElementType(), VTy2->getElementType()); } } } @@ -228,8 +238,8 @@ static bool isEquivalentType(const Type *Ty1, const Type *Ty2) { /// isEquivalentOperation - determine whether the two operations are the same /// except that pointer-to-A and pointer-to-B are equivalent. This should be /// kept in sync with Instruction::isSameOperationAs. -static bool -isEquivalentOperation(const Instruction *I1, const Instruction *I2) { +bool FunctionComparator::isEquivalentOperation(const Instruction *I1, + const Instruction *I2) const { if (I1->getOpcode() != I2->getOpcode() || I1->getNumOperands() != I2->getNumOperands() || !isEquivalentType(I1->getType(), I2->getType()) || @@ -281,18 +291,15 @@ isEquivalentOperation(const Instruction *I1, const Instruction *I2) { return true; } -bool MergeFunctions::isEquivalentGEP(const GetElementPtrInst *GEP1, - const GetElementPtrInst *GEP2) { +/// isEquivalentGEP - determine whether two GEP operations perform the same +/// underlying arithmetic. +bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1, + const GEPOperator *GEP2) { + // When we have target data, we can reduce the GEP down to the value in bytes + // added to the address. if (TD && GEP1->hasAllConstantIndices() && GEP2->hasAllConstantIndices()) { - SmallVector<Value *, 8> Indices1, Indices2; - for (GetElementPtrInst::const_op_iterator I = GEP1->idx_begin(), - E = GEP1->idx_end(); I != E; ++I) { - Indices1.push_back(*I); - } - for (GetElementPtrInst::const_op_iterator I = GEP2->idx_begin(), - E = GEP2->idx_end(); I != E; ++I) { - Indices2.push_back(*I); - } + SmallVector<Value *, 8> Indices1(GEP1->idx_begin(), GEP1->idx_end()); + SmallVector<Value *, 8> Indices2(GEP2->idx_begin(), GEP2->idx_end()); uint64_t Offset1 = TD->getIndexedOffset(GEP1->getPointerOperandType(), Indices1.data(), Indices1.size()); uint64_t Offset2 = TD->getIndexedOffset(GEP2->getPointerOperandType(), @@ -300,7 +307,6 @@ bool MergeFunctions::isEquivalentGEP(const GetElementPtrInst *GEP1, return Offset1 == Offset2; } - // Equivalent types aren't enough. 
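When TargetData is available, the byte-offset reduction above makes two GEPs with different index lists compare equal whenever they add the same number of bytes to the base pointer. A toy stand-alone illustration of the idea, with no LLVM dependency and assuming the usual 4-byte i32 with no struct padding:

    #include <cassert>
    #include <cstdint>

    // "gep {i32,i32}* %p, 0, 1" and "gep [2 x i32]* %p, 0, 1" both denote
    // base + 4 bytes under this layout, so the comparator treats them as the
    // same arithmetic even though the index paths differ.
    int main() {
      struct Pair { int32_t a, b; };
      Pair P = {0, 0};
      int32_t A[2] = {0, 0};
      uintptr_t StructOff = reinterpret_cast<uintptr_t>(&P.b) -
                            reinterpret_cast<uintptr_t>(&P.a);
      uintptr_t ArrayOff  = reinterpret_cast<uintptr_t>(&A[1]) -
                            reinterpret_cast<uintptr_t>(&A[0]);
      assert(StructOff == 4 && ArrayOff == 4);
      return 0;
    }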
if (GEP1->getPointerOperand()->getType() != GEP2->getPointerOperand()->getType()) return false; @@ -309,19 +315,26 @@ bool MergeFunctions::isEquivalentGEP(const GetElementPtrInst *GEP1, return false; for (unsigned i = 0, e = GEP1->getNumOperands(); i != e; ++i) { - if (!compare(GEP1->getOperand(i), GEP2->getOperand(i))) + if (!Enumerate(GEP1->getOperand(i), GEP2->getOperand(i))) return false; } return true; } -bool MergeFunctions::compare(const Value *V1, const Value *V2) { - if (V1 == LHS || V1 == RHS) - if (V2 == LHS || V2 == RHS) - return true; +/// Enumerate - Compare two values used by the two functions under pair-wise +/// comparison. If this is the first time the values are seen, they're added to +/// the mapping so that we will detect mismatches on next use. +bool FunctionComparator::Enumerate(const Value *V1, const Value *V2) { + // Check for function @f1 referring to itself and function @f2 referring to + // itself, or referring to each other, or both referring to either of them. + // They're all equivalent if the two functions are otherwise equivalent. + if (V1 == F1 && V2 == F2) + return true; + if (V1 == F2 && V2 == F1) + return true; - // TODO: constant expressions in terms of LHS and RHS + // TODO: constant expressions with GEP or references to F1 or F2. if (isa<Constant>(V1)) return V1 == V2; @@ -332,228 +345,138 @@ bool MergeFunctions::compare(const Value *V1, const Value *V2) { IA1->getConstraintString() == IA2->getConstraintString(); } - // We enumerate constants globally and arguments, basic blocks or - // instructions within the function they belong to. - const Function *Domain1 = NULL; - if (const Argument *A = dyn_cast<Argument>(V1)) { - Domain1 = A->getParent(); - } else if (const BasicBlock *BB = dyn_cast<BasicBlock>(V1)) { - Domain1 = BB->getParent(); - } else if (const Instruction *I = dyn_cast<Instruction>(V1)) { - Domain1 = I->getParent()->getParent(); - } - - const Function *Domain2 = NULL; - if (const Argument *A = dyn_cast<Argument>(V2)) { - Domain2 = A->getParent(); - } else if (const BasicBlock *BB = dyn_cast<BasicBlock>(V2)) { - Domain2 = BB->getParent(); - } else if (const Instruction *I = dyn_cast<Instruction>(V2)) { - Domain2 = I->getParent()->getParent(); - } - - if (Domain1 != Domain2) - if (Domain1 != LHS && Domain1 != RHS) - if (Domain2 != LHS && Domain2 != RHS) - return false; - - IDMap &Map1 = Domains[Domain1]; unsigned long &ID1 = Map1[V1]; if (!ID1) - ID1 = ++DomainCount[Domain1]; + ID1 = ++IDMap1Count; - IDMap &Map2 = Domains[Domain2]; unsigned long &ID2 = Map2[V2]; if (!ID2) - ID2 = ++DomainCount[Domain2]; + ID2 = ++IDMap2Count; return ID1 == ID2; } -bool MergeFunctions::equals(const BasicBlock *BB1, const BasicBlock *BB2) { - BasicBlock::const_iterator FI = BB1->begin(), FE = BB1->end(); - BasicBlock::const_iterator GI = BB2->begin(), GE = BB2->end(); +/// Compare - test whether two basic blocks have equivalent behaviour. 
+bool FunctionComparator::Compare(const BasicBlock *BB1, const BasicBlock *BB2) { + BasicBlock::const_iterator F1I = BB1->begin(), F1E = BB1->end(); + BasicBlock::const_iterator F2I = BB2->begin(), F2E = BB2->end(); do { - if (!compare(FI, GI)) + if (!Enumerate(F1I, F2I)) return false; - if (isa<GetElementPtrInst>(FI) && isa<GetElementPtrInst>(GI)) { - const GetElementPtrInst *GEP1 = cast<GetElementPtrInst>(FI); - const GetElementPtrInst *GEP2 = cast<GetElementPtrInst>(GI); + if (const GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(F1I)) { + const GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(F2I); + if (!GEP2) + return false; - if (!compare(GEP1->getPointerOperand(), GEP2->getPointerOperand())) + if (!Enumerate(GEP1->getPointerOperand(), GEP2->getPointerOperand())) return false; if (!isEquivalentGEP(GEP1, GEP2)) return false; } else { - if (!isEquivalentOperation(FI, GI)) + if (!isEquivalentOperation(F1I, F2I)) return false; - for (unsigned i = 0, e = FI->getNumOperands(); i != e; ++i) { - Value *OpF = FI->getOperand(i); - Value *OpG = GI->getOperand(i); + assert(F1I->getNumOperands() == F2I->getNumOperands()); + for (unsigned i = 0, e = F1I->getNumOperands(); i != e; ++i) { + Value *OpF1 = F1I->getOperand(i); + Value *OpF2 = F2I->getOperand(i); - if (!compare(OpF, OpG)) + if (!Enumerate(OpF1, OpF2)) return false; - if (OpF->getValueID() != OpG->getValueID() || - !isEquivalentType(OpF->getType(), OpG->getType())) + if (OpF1->getValueID() != OpF2->getValueID() || + !isEquivalentType(OpF1->getType(), OpF2->getType())) return false; } } - ++FI, ++GI; - } while (FI != FE && GI != GE); + ++F1I, ++F2I; + } while (F1I != F1E && F2I != F2E); - return FI == FE && GI == GE; + return F1I == F1E && F2I == F2E; } -bool MergeFunctions::equals(const Function *F, const Function *G) { +/// Compare - test whether the two functions have equivalent behaviour. +bool FunctionComparator::Compare() { // We need to recheck everything, but check the things that weren't included // in the hash first. - if (F->getAttributes() != G->getAttributes()) + if (F1->getAttributes() != F2->getAttributes()) return false; - if (F->hasGC() != G->hasGC()) + if (F1->hasGC() != F2->hasGC()) return false; - if (F->hasGC() && F->getGC() != G->getGC()) + if (F1->hasGC() && F1->getGC() != F2->getGC()) return false; - if (F->hasSection() != G->hasSection()) + if (F1->hasSection() != F2->hasSection()) return false; - if (F->hasSection() && F->getSection() != G->getSection()) + if (F1->hasSection() && F1->getSection() != F2->getSection()) return false; - if (F->isVarArg() != G->isVarArg()) + if (F1->isVarArg() != F2->isVarArg()) return false; // TODO: if it's internal and only used in direct calls, we could handle this // case too. - if (F->getCallingConv() != G->getCallingConv()) + if (F1->getCallingConv() != F2->getCallingConv()) return false; - if (!isEquivalentType(F->getFunctionType(), G->getFunctionType())) + if (!isEquivalentType(F1->getFunctionType(), F2->getFunctionType())) return false; - assert(F->arg_size() == G->arg_size() && + assert(F1->arg_size() == F2->arg_size() && "Identical functions have a different number of args."); - LHS = F; - RHS = G; - // Visit the arguments so that they get enumerated in the order they're // passed in. 
- for (Function::const_arg_iterator fi = F->arg_begin(), gi = G->arg_begin(), - fe = F->arg_end(); fi != fe; ++fi, ++gi) { - if (!compare(fi, gi)) + for (Function::const_arg_iterator f1i = F1->arg_begin(), + f2i = F2->arg_begin(), f1e = F1->arg_end(); f1i != f1e; ++f1i, ++f2i) { + if (!Enumerate(f1i, f2i)) llvm_unreachable("Arguments repeat"); } - SmallVector<const BasicBlock *, 8> FBBs, GBBs; - SmallSet<const BasicBlock *, 128> VisitedBBs; // in terms of F. - FBBs.push_back(&F->getEntryBlock()); - GBBs.push_back(&G->getEntryBlock()); - VisitedBBs.insert(FBBs[0]); - while (!FBBs.empty()) { - const BasicBlock *FBB = FBBs.pop_back_val(); - const BasicBlock *GBB = GBBs.pop_back_val(); - if (!compare(FBB, GBB) || !equals(FBB, GBB)) { - Domains.clear(); - DomainCount.clear(); - return false; - } - const TerminatorInst *FTI = FBB->getTerminator(); - const TerminatorInst *GTI = GBB->getTerminator(); - assert(FTI->getNumSuccessors() == GTI->getNumSuccessors()); - for (unsigned i = 0, e = FTI->getNumSuccessors(); i != e; ++i) { - if (!VisitedBBs.insert(FTI->getSuccessor(i))) - continue; - FBBs.push_back(FTI->getSuccessor(i)); - GBBs.push_back(GTI->getSuccessor(i)); - } - } + // We do a CFG-ordered walk since the actual ordering of the blocks in the + // linked list is immaterial. Our walk starts at the entry block for both + // functions, then takes each block from each terminator in order. As an + // artifact, this also means that unreachable blocks are ignored. + SmallVector<const BasicBlock *, 8> F1BBs, F2BBs; + SmallSet<const BasicBlock *, 128> VisitedBBs; // in terms of F1. - Domains.clear(); - DomainCount.clear(); - return true; -} + F1BBs.push_back(&F1->getEntryBlock()); + F2BBs.push_back(&F2->getEntryBlock()); -// ===----------------------------------------------------------------------=== -// Folding of functions -// ===----------------------------------------------------------------------=== - -// Cases: -// * F is external strong, G is external strong: -// turn G into a thunk to F (1) -// * F is external strong, G is external weak: -// turn G into a thunk to F (1) -// * F is external weak, G is external weak: -// unfoldable -// * F is external strong, G is internal: -// address of G taken: -// turn G into a thunk to F (1) -// address of G not taken: -// make G an alias to F (2) -// * F is internal, G is external weak -// address of F is taken: -// turn G into a thunk to F (1) -// address of F is not taken: -// make G an alias of F (2) -// * F is internal, G is internal: -// address of F and G are taken: -// turn G into a thunk to F (1) -// address of G is not taken: -// make G an alias to F (2) -// -// alias requires linkage == (external,local,weak) fallback to creating a thunk -// external means 'externally visible' linkage != (internal,private) -// internal means linkage == (internal,private) -// weak means linkage mayBeOverridable -// being external implies that the address is taken -// -// 1. turn G into a thunk to F -// 2. 
make G an alias to F + VisitedBBs.insert(F1BBs[0]); + while (!F1BBs.empty()) { + const BasicBlock *F1BB = F1BBs.pop_back_val(); + const BasicBlock *F2BB = F2BBs.pop_back_val(); -enum LinkageCategory { - ExternalStrong, - ExternalWeak, - Internal -}; + if (!Enumerate(F1BB, F2BB) || !Compare(F1BB, F2BB)) + return false; -static LinkageCategory categorize(const Function *F) { - switch (F->getLinkage()) { - case GlobalValue::InternalLinkage: - case GlobalValue::PrivateLinkage: - case GlobalValue::LinkerPrivateLinkage: - return Internal; - - case GlobalValue::WeakAnyLinkage: - case GlobalValue::WeakODRLinkage: - case GlobalValue::ExternalWeakLinkage: - case GlobalValue::LinkerPrivateWeakLinkage: - return ExternalWeak; - - case GlobalValue::ExternalLinkage: - case GlobalValue::AvailableExternallyLinkage: - case GlobalValue::LinkOnceAnyLinkage: - case GlobalValue::LinkOnceODRLinkage: - case GlobalValue::AppendingLinkage: - case GlobalValue::DLLImportLinkage: - case GlobalValue::DLLExportLinkage: - case GlobalValue::CommonLinkage: - return ExternalStrong; - } + const TerminatorInst *F1TI = F1BB->getTerminator(); + const TerminatorInst *F2TI = F2BB->getTerminator(); - llvm_unreachable("Unknown LinkageType."); - return ExternalWeak; + assert(F1TI->getNumSuccessors() == F2TI->getNumSuccessors()); + for (unsigned i = 0, e = F1TI->getNumSuccessors(); i != e; ++i) { + if (!VisitedBBs.insert(F1TI->getSuccessor(i))) + continue; + + F1BBs.push_back(F1TI->getSuccessor(i)); + F2BBs.push_back(F2TI->getSuccessor(i)); + } + } + return true; } -static void ThunkGToF(Function *F, Function *G) { +/// WriteThunk - Replace G with a simple tail call to bitcast(F). Also replace +/// direct uses of G with bitcast(F). +void MergeFunctions::WriteThunk(Function *F, Function *G) const { if (!G->mayBeOverridden()) { // Redirect direct callers of G to F. Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType()); @@ -567,33 +490,34 @@ static void ThunkGToF(Function *F, Function *G) { } } + // If G was internal then we may have replaced all uses of G with F. If so, + // stop here and delete G. There's no need for a thunk.
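The thunk WriteThunk produces is easiest to see in isolation. The following stand-alone sketch builds the same shape against the 2.8-era C++ API; the function names f and g are invented, and error handling is elided:

    #include <vector>
    #include "llvm/LLVMContext.h"
    #include "llvm/Module.h"
    #include "llvm/Function.h"
    #include "llvm/BasicBlock.h"
    #include "llvm/Instructions.h"
    #include "llvm/Support/IRBuilder.h"
    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Module M("thunk-demo", Ctx);
      std::vector<const Type*> Params(1, Type::getInt32Ty(Ctx));
      const FunctionType *FTy =
        FunctionType::get(Type::getInt32Ty(Ctx), Params, false);
      Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", &M);
      Function *G = Function::Create(FTy, Function::ExternalLinkage, "g", &M);

      // Body of the thunk: forward the argument, tail-call F, return the result.
      BasicBlock *BB = BasicBlock::Create(Ctx, "", G);
      IRBuilder<false> Builder(BB);            // same builder setup as WriteThunk
      CallInst *CI = Builder.CreateCall(F, &*G->arg_begin());
      CI->setTailCall();
      CI->setCallingConv(F->getCallingConv());
      Builder.CreateRet(CI);

      M.dump();   // @g becomes: tail call i32 @f(i32 %arg); ret
      return 0;
    }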
+ if (G->hasLocalLinkage() && G->use_empty()) { + G->eraseFromParent(); + return; + } + Function *NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "", G->getParent()); BasicBlock *BB = BasicBlock::Create(F->getContext(), "", NewG); + IRBuilder<false> Builder(BB); SmallVector<Value *, 16> Args; unsigned i = 0; const FunctionType *FFTy = F->getFunctionType(); for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end(); AI != AE; ++AI) { - if (FFTy->getParamType(i) == AI->getType()) { - Args.push_back(AI); - } else { - Args.push_back(new BitCastInst(AI, FFTy->getParamType(i), "", BB)); - } + Args.push_back(Builder.CreateBitCast(AI, FFTy->getParamType(i))); ++i; } - CallInst *CI = CallInst::Create(F, Args.begin(), Args.end(), "", BB); + CallInst *CI = Builder.CreateCall(F, Args.begin(), Args.end()); CI->setTailCall(); CI->setCallingConv(F->getCallingConv()); if (NewG->getReturnType()->isVoidTy()) { - ReturnInst::Create(F->getContext(), BB); - } else if (CI->getType() != NewG->getReturnType()) { - Value *BCI = new BitCastInst(CI, NewG->getReturnType(), "", BB); - ReturnInst::Create(F->getContext(), BCI, BB); + Builder.CreateRetVoid(); } else { - ReturnInst::Create(F->getContext(), CI, BB); + Builder.CreateRet(Builder.CreateBitCast(CI, NewG->getReturnType())); } NewG->copyAttributesFrom(G); @@ -602,152 +526,126 @@ static void ThunkGToF(Function *F, Function *G) { G->eraseFromParent(); } -static void AliasGToF(Function *F, Function *G) { - // Darwin will trigger llvm_unreachable if asked to codegen an alias. - return ThunkGToF(F, G); - -#if 0 - if (!G->hasExternalLinkage() && !G->hasLocalLinkage() && !G->hasWeakLinkage()) - return ThunkGToF(F, G); - - GlobalAlias *GA = new GlobalAlias( - G->getType(), G->getLinkage(), "", - ConstantExpr::getBitCast(F, G->getType()), G->getParent()); - F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); - GA->takeName(G); - GA->setVisibility(G->getVisibility()); - G->replaceAllUsesWith(GA); - G->eraseFromParent(); -#endif -} - -static bool fold(std::vector<Function *> &FnVec, unsigned i, unsigned j) { - Function *F = FnVec[i]; - Function *G = FnVec[j]; - - LinkageCategory catF = categorize(F); - LinkageCategory catG = categorize(G); - - if (catF == ExternalWeak || (catF == Internal && catG == ExternalStrong)) { - std::swap(FnVec[i], FnVec[j]); - std::swap(F, G); - std::swap(catF, catG); - } - - switch (catF) { - case ExternalStrong: - switch (catG) { - case ExternalStrong: - case ExternalWeak: - ThunkGToF(F, G); - break; - case Internal: - if (G->hasAddressTaken()) - ThunkGToF(F, G); - else - AliasGToF(F, G); - break; - } - break; - - case ExternalWeak: { - assert(catG == ExternalWeak); +/// MergeTwoFunctions - Merge two equivalent functions. Upon completion, +/// Function G is deleted. +void MergeFunctions::MergeTwoFunctions(Function *F, Function *G) const { + if (F->isWeakForLinker()) { + assert(G->isWeakForLinker()); // Make them both thunks to the same internal function. 
- F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "", F->getParent()); H->copyAttributesFrom(F); H->takeName(F); F->replaceAllUsesWith(H); - ThunkGToF(F, G); - ThunkGToF(F, H); + unsigned MaxAlignment = std::max(G->getAlignment(), H->getAlignment()); - F->setLinkage(GlobalValue::InternalLinkage); - } break; - - case Internal: - switch (catG) { - case ExternalStrong: - llvm_unreachable(0); - // fall-through - case ExternalWeak: - if (F->hasAddressTaken()) - ThunkGToF(F, G); - else - AliasGToF(F, G); - break; - case Internal: { - bool addrTakenF = F->hasAddressTaken(); - bool addrTakenG = G->hasAddressTaken(); - if (!addrTakenF && addrTakenG) { - std::swap(FnVec[i], FnVec[j]); - std::swap(F, G); - std::swap(addrTakenF, addrTakenG); - } + WriteThunk(F, G); + WriteThunk(F, H); - if (addrTakenF && addrTakenG) { - ThunkGToF(F, G); - } else { - assert(!addrTakenG); - AliasGToF(F, G); - } - } break; - } break; + F->setAlignment(MaxAlignment); + F->setLinkage(GlobalValue::InternalLinkage); + } else { + WriteThunk(F, G); } ++NumFunctionsMerged; - return true; } -// ===----------------------------------------------------------------------=== -// Pass definition -// ===----------------------------------------------------------------------=== +static unsigned ProfileFunction(const Function *F) { + const FunctionType *FTy = F->getFunctionType(); -bool MergeFunctions::runOnModule(Module &M) { - bool Changed = false; + FoldingSetNodeID ID; + ID.AddInteger(F->size()); + ID.AddInteger(F->getCallingConv()); + ID.AddBoolean(F->hasGC()); + ID.AddBoolean(FTy->isVarArg()); + ID.AddInteger(FTy->getReturnType()->getTypeID()); + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + ID.AddInteger(FTy->getParamType(i)->getTypeID()); + return ID.ComputeHash(); +} - std::map<unsigned long, std::vector<Function *> > FnMap; +class ComparableFunction { +public: + ComparableFunction(Function *Func, TargetData *TD) + : Func(Func), Hash(ProfileFunction(Func)), TD(TD) {} - for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (F->isDeclaration()) - continue; + AssertingVH<Function> const Func; + const unsigned Hash; + TargetData * const TD; +}; - FnMap[hash(F)].push_back(F); +struct MergeFunctionsEqualityInfo { + static ComparableFunction *getEmptyKey() { + return reinterpret_cast<ComparableFunction*>(0); + } + static ComparableFunction *getTombstoneKey() { + return reinterpret_cast<ComparableFunction*>(-1); } + static unsigned getHashValue(const ComparableFunction *CF) { + return CF->Hash; + } + static bool isEqual(const ComparableFunction *LHS, + const ComparableFunction *RHS) { + if (LHS == RHS) + return true; + if (LHS == getEmptyKey() || LHS == getTombstoneKey() || + RHS == getEmptyKey() || RHS == getTombstoneKey()) + return false; + assert(LHS->TD == RHS->TD && "Comparing functions for different targets"); + return FunctionComparator(LHS->TD, LHS->Func, RHS->Func).Compare(); + } +}; +bool MergeFunctions::runOnModule(Module &M) { + typedef DenseSet<ComparableFunction *, MergeFunctionsEqualityInfo> FnSetType; + + bool Changed = false; TD = getAnalysisIfAvailable<TargetData>(); + std::vector<Function *> Funcs; + for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { + if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage()) + Funcs.push_back(F); + } + bool LocalChanged; do { LocalChanged = false; - DEBUG(dbgs() << "size: " << FnMap.size() << "\n"); - for (std::map<unsigned long, 
std::vector<Function *> >::iterator - I = FnMap.begin(), E = FnMap.end(); I != E; ++I) { - std::vector<Function *> &FnVec = I->second; - DEBUG(dbgs() << "hash (" << I->first << "): " << FnVec.size() << "\n"); - - for (int i = 0, e = FnVec.size(); i != e; ++i) { - for (int j = i + 1; j != e; ++j) { - bool isEqual = equals(FnVec[i], FnVec[j]); - - DEBUG(dbgs() << " " << FnVec[i]->getName() - << (isEqual ? " == " : " != ") - << FnVec[j]->getName() << "\n"); - - if (isEqual) { - if (fold(FnVec, i, j)) { - LocalChanged = true; - FnVec.erase(FnVec.begin() + j); - --j, --e; - } - } - } - } + FnSetType FnSet; + for (unsigned i = 0, e = Funcs.size(); i != e;) { + Function *F = Funcs[i]; + ComparableFunction *NewF = new ComparableFunction(F, TD); + std::pair<FnSetType::iterator, bool> Result = FnSet.insert(NewF); + if (!Result.second) { + ComparableFunction *&OldF = *Result.first; + assert(OldF && "Expected a hash collision"); + + // NewF will be deleted in favour of OldF unless NewF is strong and + // OldF is weak in which case swap them to keep the strong definition. + + if (OldF->Func->isWeakForLinker() && !NewF->Func->isWeakForLinker()) + std::swap(OldF, NewF); + + DEBUG(dbgs() << " " << OldF->Func->getName() << " == " + << NewF->Func->getName() << '\n'); + + Funcs.erase(Funcs.begin() + i); + --e; + + Function *DeleteF = NewF->Func; + delete NewF; + MergeTwoFunctions(OldF->Func, DeleteF); + LocalChanged = true; + Changed = true; + } else { + ++i; + } } - Changed |= LocalChanged; + DeleteContainerPointers(FnSet); } while (LocalChanged); return Changed; diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 6b9814ceb8769..432f7c53a67d1 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -30,7 +30,7 @@ namespace { struct PartialInliner : public ModulePass { virtual void getAnalysisUsage(AnalysisUsage &AU) const { } static char ID; // Pass identification, replacement for typeid - PartialInliner() : ModulePass(&ID) {} + PartialInliner() : ModulePass(ID) {} bool runOnModule(Module& M); @@ -40,7 +40,8 @@ namespace { } char PartialInliner::ID = 0; -static RegisterPass<PartialInliner> X("partial-inliner", "Partial Inliner"); +INITIALIZE_PASS(PartialInliner, "partial-inliner", + "Partial Inliner", false, false); ModulePass* llvm::createPartialInliningPass() { return new PartialInliner(); } @@ -67,7 +68,8 @@ Function* PartialInliner::unswitchFunction(Function* F) { // Clone the function, so that we can hack away on it. 
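In the MergeFunctions hunk above, the quadratic scan over each hash bucket is gone: runOnModule now performs one DenseSet insert per function, where the cheap ProfileFunction hash selects the bucket and the expensive FunctionComparator runs only when two functions land in the same slot. A stand-alone analogue of that lookup structure, with strings standing in for functions (plain C++11, std::unordered_set in place of DenseSet):

    #include <cassert>
    #include <string>
    #include <unordered_set>

    // The hash is deliberately cheap and collision-prone (like ProfileFunction);
    // the equality functor is the expensive deep comparison (like
    // FunctionComparator::Compare), run only within a bucket.
    struct CheapHash {
      size_t operator()(const std::string &S) const { return S.size(); }
    };
    struct DeepEq {
      bool operator()(const std::string &A, const std::string &B) const {
        return A == B;
      }
    };

    int main() {
      std::unordered_set<std::string, CheapHash, DeepEq> Set;
      assert(Set.insert("foo").second);   // first member of the 3-char bucket
      assert(Set.insert("bar").second);   // same bucket: deep compare, then kept
      assert(!Set.insert("foo").second);  // deep-equal to an existing member
      return 0;
    }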
ValueMap<const Value*, Value*> VMap; - Function* duplicateFunction = CloneFunction(F, VMap); + Function* duplicateFunction = CloneFunction(F, VMap, + /*ModuleLevelChanges=*/false); duplicateFunction->setLinkage(GlobalValue::InternalLinkage); F->getParent()->getFunctionList().push_back(duplicateFunction); BasicBlock* newEntryBlock = cast<BasicBlock>(VMap[entryBlock]); @@ -159,7 +161,7 @@ bool PartialInliner::runOnModule(Module& M) { bool recursive = false; for (Function::use_iterator UI = currFunc->use_begin(), UE = currFunc->use_end(); UI != UE; ++UI) - if (Instruction* I = dyn_cast<Instruction>(UI)) + if (Instruction* I = dyn_cast<Instruction>(*UI)) if (I->getParent()->getParent() == currFunc) { recursive = true; break; diff --git a/lib/Transforms/IPO/PartialSpecialization.cpp b/lib/Transforms/IPO/PartialSpecialization.cpp index 58e14481b0edc..4a99a411ab338 100644 --- a/lib/Transforms/IPO/PartialSpecialization.cpp +++ b/lib/Transforms/IPO/PartialSpecialization.cpp @@ -50,14 +50,14 @@ namespace { int scanDistribution(Function&, int, std::map<Constant*, int>&); public : static char ID; // Pass identification, replacement for typeid - PartSpec() : ModulePass(&ID) {} + PartSpec() : ModulePass(ID) {} bool runOnModule(Module &M); }; } char PartSpec::ID = 0; -static RegisterPass<PartSpec> -X("partialspecialization", "Partial Specialization"); +INITIALIZE_PASS(PartSpec, "partialspecialization", + "Partial Specialization", false, false); // Specialize F by replacing the arguments (keys) in replacements with the // constants (values). Replace all calls to F with those constants with @@ -74,7 +74,8 @@ SpecializeFunction(Function* F, deleted[arg->getArgNo()] = arg; } - Function* NF = CloneFunction(F, replacements); + Function* NF = CloneFunction(F, replacements, + /*ModuleLevelChanges=*/false); NF->setLinkage(GlobalValue::InternalLinkage); F->getParent()->getFunctionList().push_back(NF); @@ -82,10 +83,10 @@ SpecializeFunction(Function* F, ii != ee; ) { Value::use_iterator i = ii; ++ii; - if (isa<CallInst>(i) || isa<InvokeInst>(i)) { - CallSite CS(cast<Instruction>(i)); + User *U = *i; + CallSite CS(U); + if (CS) { if (CS.getCalledFunction() == F) { - SmallVector<Value*, 6> args; // Assemble the non-specialized arguments for the updated callsite. 
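Two idioms recur across these hunks: Value::use_iterator is now dereferenced explicitly to reach the User, and a CallSite is constructed directly from that user and tested with its boolean conversion, replacing CallSite::get and the paired isa<CallInst>/isa<InvokeInst> checks. A sketch of the combined pattern; the helper name countDirectCalls is invented for illustration, and the 2.8-era headers are assumed:

    #include "llvm/Function.h"
    #include "llvm/Support/CallSite.h"
    using namespace llvm;

    // Counts the uses of F that are direct calls or invokes of F itself.
    static unsigned countDirectCalls(Function *F) {
      unsigned N = 0;
      for (Value::use_iterator UI = F->use_begin(), UE = F->use_end();
           UI != UE; ++UI) {
        User *U = *UI;     // explicit dereference to get the user
        CallSite CS(U);    // null unless U is a call or invoke instruction
        if (CS && CS.getCalledFunction() == F)
          ++N;
      }
      return N;
    }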
// In the process, make sure that the specialized arguments are @@ -105,13 +106,13 @@ SpecializeFunction(Function* F, } } Value* NCall; - if (CallInst *CI = dyn_cast<CallInst>(i)) { + if (CallInst *CI = dyn_cast<CallInst>(U)) { NCall = CallInst::Create(NF, args.begin(), args.end(), CI->getName(), CI); cast<CallInst>(NCall)->setTailCall(CI->isTailCall()); cast<CallInst>(NCall)->setCallingConv(CI->getCallingConv()); } else { - InvokeInst *II = cast<InvokeInst>(i); + InvokeInst *II = cast<InvokeInst>(U); NCall = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), args.begin(), args.end(), @@ -123,8 +124,7 @@ SpecializeFunction(Function* F, ++numReplaced; } } - next_use: - ; + next_use:; } return NF; } @@ -174,14 +174,14 @@ void PartSpec::scanForInterest(Function& F, InterestingArgVector& args) { ui != ue; ++ui) { bool interesting = false; - - if (isa<CmpInst>(ui)) interesting = true; - else if (isa<CallInst>(ui)) + User *U = *ui; + if (isa<CmpInst>(U)) interesting = true; + else if (isa<CallInst>(U)) interesting = ui->getOperand(0) == ii; - else if (isa<InvokeInst>(ui)) + else if (isa<InvokeInst>(U)) interesting = ui->getOperand(0) == ii; - else if (isa<SwitchInst>(ui)) interesting = true; - else if (isa<BranchInst>(ui)) interesting = true; + else if (isa<SwitchInst>(U)) interesting = true; + else if (isa<BranchInst>(U)) interesting = true; if (interesting) { args.push_back(std::distance(F.arg_begin(), ii)); @@ -196,14 +196,16 @@ int PartSpec::scanDistribution(Function& F, int arg, std::map<Constant*, int>& dist) { bool hasIndirect = false; int total = 0; - for(Value::use_iterator ii = F.use_begin(), ee = F.use_end(); - ii != ee; ++ii) - if ((isa<CallInst>(ii) || isa<InvokeInst>(ii)) - && ii->getOperand(0) == &F) { - ++dist[dyn_cast<Constant>(ii->getOperand(arg + 1))]; + for (Value::use_iterator ii = F.use_begin(), ee = F.use_end(); + ii != ee; ++ii) { + User *U = *ii; + CallSite CS(U); + if (CS && CS.getCalledFunction() == &F) { + ++dist[dyn_cast<Constant>(CS.getArgument(arg))]; ++total; } else hasIndirect = true; + } // Preserve the original address taken function even if all other uses // will be specialized. diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index de6099cc1daa0..09ac76f979649 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -37,7 +37,7 @@ STATISTIC(NumUnreach, "Number of noreturn calls optimized"); namespace { struct PruneEH : public CallGraphSCCPass { static char ID; // Pass identification, replacement for typeid - PruneEH() : CallGraphSCCPass(&ID) {} + PruneEH() : CallGraphSCCPass(ID) {} // runOnSCC - Analyze the SCC, performing the transformation if possible. 
bool runOnSCC(CallGraphSCC &SCC); @@ -48,8 +48,8 @@ namespace { } char PruneEH::ID = 0; -static RegisterPass<PruneEH> -X("prune-eh", "Remove unused exception handling info"); +INITIALIZE_PASS(PruneEH, "prune-eh", + "Remove unused exception handling info", false, false); Pass *llvm::createPruneEHPass() { return new PruneEH(); } diff --git a/lib/Transforms/IPO/StripDeadPrototypes.cpp b/lib/Transforms/IPO/StripDeadPrototypes.cpp index 4566a7634af5a..ee10ad0b8ba26 100644 --- a/lib/Transforms/IPO/StripDeadPrototypes.cpp +++ b/lib/Transforms/IPO/StripDeadPrototypes.cpp @@ -29,15 +29,15 @@ namespace { class StripDeadPrototypesPass : public ModulePass { public: static char ID; // Pass identification, replacement for typeid - StripDeadPrototypesPass() : ModulePass(&ID) { } + StripDeadPrototypesPass() : ModulePass(ID) { } virtual bool runOnModule(Module &M); }; } // end anonymous namespace char StripDeadPrototypesPass::ID = 0; -static RegisterPass<StripDeadPrototypesPass> -X("strip-dead-prototypes", "Strip Unused Function Prototypes"); +INITIALIZE_PASS(StripDeadPrototypesPass, "strip-dead-prototypes", + "Strip Unused Function Prototypes", false, false); bool StripDeadPrototypesPass::runOnModule(Module &M) { bool MadeChange = false; diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index 12e8db8b4a546..20b7b8f2b8509 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -39,7 +39,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit StripSymbols(bool ODI = false) - : ModulePass(&ID), OnlyDebugInfo(ODI) {} + : ModulePass(ID), OnlyDebugInfo(ODI) {} virtual bool runOnModule(Module &M); @@ -52,7 +52,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit StripNonDebugSymbols() - : ModulePass(&ID) {} + : ModulePass(ID) {} virtual bool runOnModule(Module &M); @@ -65,7 +65,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit StripDebugDeclare() - : ModulePass(&ID) {} + : ModulePass(ID) {} virtual bool runOnModule(Module &M); @@ -78,7 +78,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit StripDeadDebugInfo() - : ModulePass(&ID) {} + : ModulePass(ID) {} virtual bool runOnModule(Module &M); @@ -89,32 +89,33 @@ namespace { } char StripSymbols::ID = 0; -static RegisterPass<StripSymbols> -X("strip", "Strip all symbols from a module"); +INITIALIZE_PASS(StripSymbols, "strip", + "Strip all symbols from a module", false, false); ModulePass *llvm::createStripSymbolsPass(bool OnlyDebugInfo) { return new StripSymbols(OnlyDebugInfo); } char StripNonDebugSymbols::ID = 0; -static RegisterPass<StripNonDebugSymbols> -Y("strip-nondebug", "Strip all symbols, except dbg symbols, from a module"); +INITIALIZE_PASS(StripNonDebugSymbols, "strip-nondebug", + "Strip all symbols, except dbg symbols, from a module", + false, false); ModulePass *llvm::createStripNonDebugSymbolsPass() { return new StripNonDebugSymbols(); } char StripDebugDeclare::ID = 0; -static RegisterPass<StripDebugDeclare> -Z("strip-debug-declare", "Strip all llvm.dbg.declare intrinsics"); +INITIALIZE_PASS(StripDebugDeclare, "strip-debug-declare", + "Strip all llvm.dbg.declare intrinsics", false, false); ModulePass *llvm::createStripDebugDeclarePass() { return new StripDebugDeclare(); } char StripDeadDebugInfo::ID = 0; -static RegisterPass<StripDeadDebugInfo> -A("strip-dead-debug-info", "Strip debug info for unused 
symbols"); +INITIALIZE_PASS(StripDeadDebugInfo, "strip-dead-debug-info", + "Strip debug info for unused symbols", false, false); ModulePass *llvm::createStripDeadDebugInfoPass() { return new StripDeadDebugInfo(); @@ -254,14 +255,15 @@ static bool StripDebugInfo(Module &M) { } } - unsigned MDDbgKind = M.getMDKindID("dbg"); for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI) for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE; ++FI) for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { - Changed = true; // FIXME: Only set if there was debug metadata. - BI->setMetadata(MDDbgKind, 0); + if (!BI->getDebugLoc().isUnknown()) { + Changed = true; + BI->setDebugLoc(DebugLoc()); + } } return Changed; @@ -348,8 +350,8 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { for (SmallVector<MDNode *, 8>::iterator I = MDs.begin(), E = MDs.end(); I != E; ++I) { - if (M.getGlobalVariable(DIGlobalVariable(*I).getGlobal()->getName(), - true)) { + GlobalVariable *GV = DIGlobalVariable(*I).getGlobal(); + if (GV && M.getGlobalVariable(GV->getName(), true)) { if (!NMD) NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv"); NMD->addOperand(*I); diff --git a/lib/Transforms/IPO/StructRetPromotion.cpp b/lib/Transforms/IPO/StructRetPromotion.cpp index a74686f408b67..b82b03f7d9e7d 100644 --- a/lib/Transforms/IPO/StructRetPromotion.cpp +++ b/lib/Transforms/IPO/StructRetPromotion.cpp @@ -1,4 +1,4 @@ -//===-- StructRetPromotion.cpp - Promote sret arguments ------------------===// +//===-- StructRetPromotion.cpp - Promote sret arguments -------------------===// // // The LLVM Compiler Infrastructure // @@ -50,20 +50,19 @@ namespace { virtual bool runOnSCC(CallGraphSCC &SCC); static char ID; // Pass identification, replacement for typeid - SRETPromotion() : CallGraphSCCPass(&ID) {} + SRETPromotion() : CallGraphSCCPass(ID) {} private: CallGraphNode *PromoteReturn(CallGraphNode *CGN); bool isSafeToUpdateAllCallers(Function *F); Function *cloneFunctionBody(Function *F, const StructType *STy); CallGraphNode *updateCallSites(Function *F, Function *NF); - bool nestedStructType(const StructType *STy); }; } char SRETPromotion::ID = 0; -static RegisterPass<SRETPromotion> -X("sretpromotion", "Promote sret arguments to multiple ret values"); +INITIALIZE_PASS(SRETPromotion, "sretpromotion", + "Promote sret arguments to multiple ret values", false, false); Pass *llvm::createStructRetPromotionPass() { return new SRETPromotion(); @@ -156,7 +155,7 @@ bool SRETPromotion::isSafeToUpdateAllCallers(Function *F) { FnUseI != FnUseE; ++FnUseI) { // The function is passed in as an argument to (possibly) another function, // we can't change it! - CallSite CS = CallSite::get(*FnUseI); + CallSite CS(*FnUseI); Instruction *Call = CS.getInstruction(); // The function is used by something else than a call or invoke instruction, // we can't change it! 
@@ -187,7 +186,7 @@ bool SRETPromotion::isSafeToUpdateAllCallers(Function *F) { return false; for (Value::use_iterator GEPI = GEP->use_begin(), GEPE = GEP->use_end(); GEPI != GEPE; ++GEPI) - if (!isa<LoadInst>(GEPI)) + if (!isa<LoadInst>(*GEPI)) return false; } // Any other FirstArg users make this function unsuitable for sret @@ -271,7 +270,7 @@ CallGraphNode *SRETPromotion::updateCallSites(Function *F, Function *NF) { CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF); while (!F->use_empty()) { - CallSite CS = CallSite::get(*F->use_begin()); + CallSite CS(*F->use_begin()); Instruction *Call = CS.getInstruction(); const AttrListPtr &PAL = F->getAttributes(); @@ -351,14 +350,3 @@ CallGraphNode *SRETPromotion::updateCallSites(Function *F, Function *NF) { return NF_CGN; } -/// nestedStructType - Return true if STy includes any -/// other aggregate types -bool SRETPromotion::nestedStructType(const StructType *STy) { - unsigned Num = STy->getNumElements(); - for (unsigned i = 0; i < Num; i++) { - const Type *Ty = STy->getElementType(i); - if (!Ty->isSingleValueType() && !Ty->isVoidTy()) - return true; - } - return false; -} diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 24e052881a9d4..6f9609cf997ba 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -81,7 +81,7 @@ public: BuilderTy *Builder; static char ID; // Pass identification, replacement for typeid - InstCombiner() : FunctionPass(&ID), TD(0), Builder(0) {} + InstCombiner() : FunctionPass(ID), TD(0), Builder(0) {} public: virtual bool runOnFunction(Function &F); diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 5876f408343b6..19a05bfe9bba3 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -474,19 +474,16 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { } // (icmp ne (A & C1), 0) & (icmp ne (A & C2), 0) --> - // (icmp eq (A & (C1|C2)), (C1|C2)) + // (icmp eq (A & (C1|C2)), (C1|C2)) where C1 and C2 are non-zero POT if (LHSCC == ICmpInst::ICMP_NE && LHSCst->isZero()) { - Instruction *I1 = dyn_cast<Instruction>(Val); - Instruction *I2 = dyn_cast<Instruction>(Val2); - if (I1 && I1->getOpcode() == Instruction::And && - I2 && I2->getOpcode() == Instruction::And && - I1->getOperand(0) == I1->getOperand(0)) { - ConstantInt *CI1 = dyn_cast<ConstantInt>(I1->getOperand(1)); - ConstantInt *CI2 = dyn_cast<ConstantInt>(I2->getOperand(1)); - if (CI1 && !CI1->isZero() && CI2 && !CI2->isZero() && - CI1->getValue().operator&(CI2->getValue()) == 0) { + Value *Op1 = 0, *Op2 = 0; + ConstantInt *CI1 = 0, *CI2 = 0; + if (match(LHS->getOperand(0), m_And(m_Value(Op1), m_ConstantInt(CI1))) && + match(RHS->getOperand(0), m_And(m_Value(Op2), m_ConstantInt(CI2)))) { + if (Op1 == Op2 && !CI1->isZero() && !CI2->isZero() && + CI1->getValue().isPowerOf2() && CI2->getValue().isPowerOf2()) { Constant *ConstOr = ConstantExpr::getOr(CI1, CI2); - Value *NewAnd = Builder->CreateAnd(I1->getOperand(0), ConstOr); + Value *NewAnd = Builder->CreateAnd(Op1, ConstOr); return Builder->CreateICmp(ICmpInst::ICMP_EQ, NewAnd, ConstOr); } } @@ -1170,11 +1167,28 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1)); if (LHSCst == 0 || RHSCst == 0) return 0; - // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0) - if (LHSCst == RHSCst && 
LHSCC == RHSCC && - LHSCC == ICmpInst::ICMP_NE && LHSCst->isZero()) { - Value *NewOr = Builder->CreateOr(Val, Val2); - return Builder->CreateICmp(LHSCC, NewOr, LHSCst); + if (LHSCst == RHSCst && LHSCC == RHSCC) { + // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0) + if (LHSCC == ICmpInst::ICMP_NE && LHSCst->isZero()) { + Value *NewOr = Builder->CreateOr(Val, Val2); + return Builder->CreateICmp(LHSCC, NewOr, LHSCst); + } + + // (icmp eq (A & C1), 0) | (icmp eq (A & C2), 0) --> + // (icmp ne (A & (C1|C2)), (C1|C2)) where C1 and C2 are non-zero POT + if (LHSCC == ICmpInst::ICMP_EQ && LHSCst->isZero()) { + Value *Op1 = 0, *Op2 = 0; + ConstantInt *CI1 = 0, *CI2 = 0; + if (match(LHS->getOperand(0), m_And(m_Value(Op1), m_ConstantInt(CI1))) && + match(RHS->getOperand(0), m_And(m_Value(Op2), m_ConstantInt(CI2)))) { + if (Op1 == Op2 && !CI1->isZero() && !CI2->isZero() && + CI1->getValue().isPowerOf2() && CI2->getValue().isPowerOf2()) { + Constant *ConstOr = ConstantExpr::getOr(CI1, CI2); + Value *NewAnd = Builder->CreateAnd(Op1, ConstOr); + return Builder->CreateICmp(ICmpInst::ICMP_NE, NewAnd, ConstOr); + } + } + } } // From here on, we only handle: diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 85251a83d4eac..0ebe3b45589e5 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -96,14 +96,23 @@ static unsigned EnforceKnownAlignment(Value *V, /// increase the alignment of the ultimate object, making this check succeed. unsigned InstCombiner::GetOrEnforceKnownAlignment(Value *V, unsigned PrefAlign) { - unsigned BitWidth = TD ? TD->getTypeSizeInBits(V->getType()) : - sizeof(PrefAlign) * CHAR_BIT; + assert(V->getType()->isPointerTy() && + "GetOrEnforceKnownAlignment expects a pointer!"); + unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 64; APInt Mask = APInt::getAllOnesValue(BitWidth); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); ComputeMaskedBits(V, Mask, KnownZero, KnownOne); unsigned TrailZ = KnownZero.countTrailingOnes(); + + // Avoid trouble with ridiculously large TrailZ values, such as + // those computed from a null pointer. + TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1)); + unsigned Align = 1u << std::min(BitWidth - 1, TrailZ); + // LLVM doesn't support alignments larger than this currently. + Align = std::min(Align, +Value::MaximumAlignment); + if (PrefAlign > Align) Align = EnforceKnownAlignment(V, Align, PrefAlign); @@ -529,7 +538,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // X + 0 -> {X, false} if (RHS->isZero()) { Constant *V[] = { - UndefValue::get(II->getCalledValue()->getType()), + UndefValue::get(II->getArgOperand(0)->getType()), ConstantInt::getFalse(II->getContext()) }; Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false); @@ -630,8 +639,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); APInt DemandedElts(VWidth, 1); APInt UndefElts(VWidth, 0); - if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, - UndefElts)) { + if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0), + DemandedElts, UndefElts)) { II->setArgOperand(0, V); return II; } @@ -655,8 +664,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (AllEltsOk) { // Cast the input vectors to byte vectors.
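The new branch of FoldOrOfICmps above can be checked exhaustively for any concrete pair of power-of-two masks. A small self-contained verification, with C1=4 and C2=16 chosen arbitrarily for the test:

    #include <cassert>
    #include <cstdint>

    // ((A & C1) == 0) | ((A & C2) == 0)  <=>  (A & (C1|C2)) != (C1|C2)
    // for non-zero power-of-two C1, C2: the left side says "not both bits
    // set", which is exactly what the right side tests.
    int main() {
      const uint32_t C1 = 4, C2 = 16;
      for (uint32_t A = 0; A <= 0xFFFF; ++A) {
        bool Orig   = ((A & C1) == 0) || ((A & C2) == 0);
        bool Folded = (A & (C1 | C2)) != (C1 | C2);
        assert(Orig == Folded);
      }
      return 0;
    }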
- Value *Op0 = Builder->CreateBitCast(II->getArgOperand(0), Mask->getType()); - Value *Op1 = Builder->CreateBitCast(II->getArgOperand(1), Mask->getType()); + Value *Op0 = Builder->CreateBitCast(II->getArgOperand(0), + Mask->getType()); + Value *Op1 = Builder->CreateBitCast(II->getArgOperand(1), + Mask->getType()); Value *Result = UndefValue::get(Op0->getType()); // Only extract each element once. @@ -772,13 +783,15 @@ protected: NewInstruction = IC->ReplaceInstUsesWith(*CI, With); } bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, bool isString) const { - if (ConstantInt *SizeCI = dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp - CallInst::ArgOffset))) { + if (ConstantInt *SizeCI = + dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) { if (SizeCI->isAllOnesValue()) return true; if (isString) return SizeCI->getZExtValue() >= - GetStringLength(CI->getArgOperand(SizeArgOp - CallInst::ArgOffset)); - if (ConstantInt *Arg = dyn_cast<ConstantInt>(CI->getArgOperand(SizeArgOp - CallInst::ArgOffset))) + GetStringLength(CI->getArgOperand(SizeArgOp)); + if (ConstantInt *Arg = dyn_cast<ConstantInt>( + CI->getArgOperand(SizeArgOp))) return SizeCI->getZExtValue() >= Arg->getZExtValue(); } return false; @@ -1140,7 +1153,7 @@ Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) { IntrinsicInst *Tramp = cast<IntrinsicInst>(cast<BitCastInst>(Callee)->getOperand(0)); - Function *NestF = cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts()); + Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts()); const PointerType *NestFPTy = cast<PointerType>(NestF->getType()); const FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType()); diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 505a0bf8f4e7f..79a9b09c64d07 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -396,6 +396,11 @@ static bool CanEvaluateTruncated(Value *V, const Type *Ty) { case Instruction::Trunc: // trunc(trunc(x)) -> trunc(x) return true; + case Instruction::ZExt: + case Instruction::SExt: + // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest + // trunc(ext(x)) -> trunc(x) if the source type is larger than the new dest + return true; case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); return CanEvaluateTruncated(SI->getTrueValue(), Ty) && @@ -454,6 +459,29 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { Value *Zero = Constant::getNullValue(Src->getType()); return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero); } + + // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion. + Value *A = 0; ConstantInt *Cst = 0; + if (match(Src, m_LShr(m_ZExt(m_Value(A)), m_ConstantInt(Cst))) && + Src->hasOneUse()) { + // We have three types to worry about here, the type of A, the source of + // the truncate (MidSize), and the destination of the truncate. We know that + // ASize < MidSize and MidSize > ResultSize, but don't know the relation + // between ASize and ResultSize. + unsigned ASize = A->getType()->getPrimitiveSizeInBits(); + + // If the shift amount is larger than the size of A, then the result is + // known to be zero because all the input bits got shifted out. 
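The trunc(lshr(zext A, Cst)) rewrite above can likewise be sanity-checked on a concrete instance: i16 zero-extended to i32, shifted right by 8, then truncated to i8. Since the shift amount is smaller than the width of A, the shift can be done in A's own type:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t V = 0; V <= 0xFFFF; ++V) {
        uint16_t A = (uint16_t)V;
        uint8_t Before = (uint8_t)(((uint32_t)A) >> 8); // trunc(lshr(zext A), 8)
        uint8_t After  = (uint8_t)(A >> 8);             // shift done in A's type
        assert(Before == After);
      }
      return 0;
    }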
+ if (Cst->getZExtValue() >= ASize) + return ReplaceInstUsesWith(CI, Constant::getNullValue(CI.getType())); + + // Since we're doing an lshr and a zero extend, and know that the shift + // amount is smaller than ASize, it is always safe to do the shift in A's + // type, then zero extend or truncate to the result. + Value *Shift = Builder->CreateLShr(A, Cst->getZExtValue()); + Shift->takeName(Src); + return CastInst::CreateIntegerCast(Shift, CI.getType(), false); + } return 0; } @@ -538,8 +566,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, if (CI.getType() == In->getType()) return ReplaceInstUsesWith(CI, In); - else - return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/); + return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/); } } } @@ -1097,6 +1124,38 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { break; } } + + // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x) + // NOTE: This should be disabled by -fno-builtin-sqrt if we ever support it. + CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0)); + if (Call && Call->getCalledFunction() && + Call->getCalledFunction()->getName() == "sqrt" && + Call->getNumArgOperands() == 1) { + CastInst *Arg = dyn_cast<CastInst>(Call->getArgOperand(0)); + if (Arg && Arg->getOpcode() == Instruction::FPExt && + CI.getType()->isFloatTy() && + Call->getType()->isDoubleTy() && + Arg->getType()->isDoubleTy() && + Arg->getOperand(0)->getType()->isFloatTy()) { + Function *Callee = Call->getCalledFunction(); + Module *M = CI.getParent()->getParent()->getParent(); + Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf", + Callee->getAttributes(), + Builder->getFloatTy(), + Builder->getFloatTy(), + NULL); + CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0), + "sqrtfcall"); + ret->setAttributes(Callee->getAttributes()); + + + // Remove the old Call. With -fmath-errno, it won't get marked readnone. + Call->replaceAllUsesWith(UndefValue::get(Call->getType())); + EraseInstFromFunction(*Call); + return ret; + } + } + return 0; } @@ -1308,6 +1367,199 @@ static Instruction *OptimizeVectorResize(Value *InVal, const VectorType *DestTy, return new ShuffleVectorInst(InVal, V2, Mask); } +static bool isMultipleOfTypeSize(unsigned Value, const Type *Ty) { + return Value % Ty->getPrimitiveSizeInBits() == 0; +} + +static unsigned getTypeSizeIndex(unsigned Value, const Type *Ty) { + return Value / Ty->getPrimitiveSizeInBits(); +} + +/// CollectInsertionElements - V is a value which is inserted into a vector of +/// VecEltTy. Look through the value to see if we can decompose it into +/// insertions into the vector. See the example in the comment for +/// OptimizeIntegerToVectorInsertions for the pattern this handles. +/// The type of V is always a non-zero multiple of VecEltTy's size. +/// +/// This returns false if the pattern can't be matched or true if it can, +/// filling in Elements with the elements found here. +static bool CollectInsertionElements(Value *V, unsigned ElementIndex, + SmallVectorImpl<Value*> &Elements, + const Type *VecEltTy) { + // Undef values never contribute useful bits to the result. + if (isa<UndefValue>(V)) return true; + + // If we got down to a value of the right type, we win, try inserting into the + // right element. + if (V->getType() == VecEltTy) { + // Inserting null doesn't actually insert any elements. + if (Constant *C = dyn_cast<Constant>(V)) + if (C->isNullValue()) + return true; + + // Fail if multiple elements are inserted into this slot. 
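The constant-slicing step of CollectInsertionElements above decomposes a wide constant into element-sized pieces with lshr and trunc. The same arithmetic on a host integer, for an i64 covering two i32 elements:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t C = 0x1122334455667788ULL;      // spans two 32-bit elements
      uint32_t Piece0 = (uint32_t)(C >> 0);    // element 0: lshr 0, trunc
      uint32_t Piece1 = (uint32_t)(C >> 32);   // element 1: lshr 32, trunc
      assert(Piece0 == 0x55667788u);
      assert(Piece1 == 0x11223344u);
      return 0;
    }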
+ if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0) + return false; + + Elements[ElementIndex] = V; + return true; + } + + if (Constant *C = dyn_cast<Constant>(V)) { + // Figure out the # elements this provides, and bitcast it or slice it up + // as required. + unsigned NumElts = getTypeSizeIndex(C->getType()->getPrimitiveSizeInBits(), + VecEltTy); + // If the constant is the size of a vector element, we just need to bitcast + // it to the right type so it gets properly inserted. + if (NumElts == 1) + return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), + ElementIndex, Elements, VecEltTy); + + // Okay, this is a constant that covers multiple elements. Slice it up into + // pieces and insert each element-sized piece into the vector. + if (!isa<IntegerType>(C->getType())) + C = ConstantExpr::getBitCast(C, IntegerType::get(V->getContext(), + C->getType()->getPrimitiveSizeInBits())); + unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits(); + const Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize); + + for (unsigned i = 0; i != NumElts; ++i) { + Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), + i*ElementSize)); + Piece = ConstantExpr::getTrunc(Piece, ElementIntTy); + if (!CollectInsertionElements(Piece, ElementIndex+i, Elements, VecEltTy)) + return false; + } + return true; + } + + if (!V->hasOneUse()) return false; + + Instruction *I = dyn_cast<Instruction>(V); + if (I == 0) return false; + switch (I->getOpcode()) { + default: return false; // Unhandled case. + case Instruction::BitCast: + return CollectInsertionElements(I->getOperand(0), ElementIndex, + Elements, VecEltTy); + case Instruction::ZExt: + if (!isMultipleOfTypeSize( + I->getOperand(0)->getType()->getPrimitiveSizeInBits(), + VecEltTy)) + return false; + return CollectInsertionElements(I->getOperand(0), ElementIndex, + Elements, VecEltTy); + case Instruction::Or: + return CollectInsertionElements(I->getOperand(0), ElementIndex, + Elements, VecEltTy) && + CollectInsertionElements(I->getOperand(1), ElementIndex, + Elements, VecEltTy); + case Instruction::Shl: { + // Must be shifting by a constant that is a multiple of the element size. + ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1)); + if (CI == 0) return false; + if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false; + unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy); + + return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift, + Elements, VecEltTy); + } + + } +} + + +/// OptimizeIntegerToVectorInsertions - If the input is an 'or' instruction, we +/// may be doing shifts and ors to assemble the elements of the vector manually. +/// Try to rip the code out and replace it with insertelements. This is to +/// optimize code like this: +/// +/// %tmp37 = bitcast float %inc to i32 +/// %tmp38 = zext i32 %tmp37 to i64 +/// %tmp31 = bitcast float %inc5 to i32 +/// %tmp32 = zext i32 %tmp31 to i64 +/// %tmp33 = shl i64 %tmp32, 32 +/// %ins35 = or i64 %tmp33, %tmp38 +/// %tmp43 = bitcast i64 %ins35 to <2 x float> +/// +/// Into two insertelements that do "buildvector{%inc, %inc5}". 
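The IR pattern quoted in the comment above corresponds to ordinary scalar bit-packing. A host-side demonstration that the zext/shl/or chain really does place %inc and %inc5 in lanes 0 and 1 of the <2 x float>, assuming a little-endian host to match little-endian lane layout:

    #include <cassert>
    #include <cstring>
    #include <cstdint>

    int main() {
      float inc = 1.0f, inc5 = 2.0f;
      uint32_t Lo, Hi;
      std::memcpy(&Lo, &inc, 4);                     // bitcast float -> i32
      std::memcpy(&Hi, &inc5, 4);
      uint64_t Ins35 = ((uint64_t)Hi << 32) | Lo;    // zext, shl 32, or
      float Lanes[2];
      std::memcpy(Lanes, &Ins35, 8);                 // bitcast i64 -> <2 x float>
      assert(Lanes[0] == inc && Lanes[1] == inc5);   // buildvector{%inc, %inc5}
      return 0;
    }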
+static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
+                                                InstCombiner &IC) {
+  const VectorType *DestVecTy = cast<VectorType>(CI.getType());
+  Value *IntInput = CI.getOperand(0);
+
+  SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
+  if (!CollectInsertionElements(IntInput, 0, Elements,
+                                DestVecTy->getElementType()))
+    return 0;
+
+  // If we succeeded, we know that all of the elements are specified by
+  // Elements or are zero if Elements has a null entry.  Recast this as a set
+  // of insertions.
+  Value *Result = Constant::getNullValue(CI.getType());
+  for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
+    if (Elements[i] == 0) continue;  // Unset element.
+
+    Result = IC.Builder->CreateInsertElement(Result, Elements[i],
+                                             IC.Builder->getInt32(i));
+  }
+
+  return Result;
+}
+
+
+/// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double
+/// bitcast.  The various long double bitcasts can't get in here.
+static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,
+                                              InstCombiner &IC) {
+  Value *Src = CI.getOperand(0);
+  const Type *DestTy = CI.getType();
+
+  // If this is a bitcast from int to float, check to see if the int is an
+  // extraction from a vector.
+  Value *VecInput = 0;
+  // bitcast(trunc(bitcast(somevector)))
+  if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) &&
+      isa<VectorType>(VecInput->getType())) {
+    const VectorType *VecTy = cast<VectorType>(VecInput->getType());
+    unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
+
+    if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0) {
+      // If the element type of the vector doesn't match the result type,
+      // bitcast it to be a vector type we can extract from.
+      if (VecTy->getElementType() != DestTy) {
+        VecTy = VectorType::get(DestTy,
+                                VecTy->getPrimitiveSizeInBits() / DestWidth);
+        VecInput = IC.Builder->CreateBitCast(VecInput, VecTy);
+      }
+
+      return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(0));
+    }
+  }
+
+  // bitcast(trunc(lshr(bitcast(somevector), cst)))
+  ConstantInt *ShAmt = 0;
+  if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)),
+                                m_ConstantInt(ShAmt)))) &&
+      isa<VectorType>(VecInput->getType())) {
+    const VectorType *VecTy = cast<VectorType>(VecInput->getType());
+    unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
+    if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0 &&
+        ShAmt->getZExtValue() % DestWidth == 0) {
+      // If the element type of the vector doesn't match the result type,
+      // bitcast it to be a vector type we can extract from.
+      if (VecTy->getElementType() != DestTy) {
+        VecTy = VectorType::get(DestTy,
+                                VecTy->getPrimitiveSizeInBits() / DestWidth);
+        VecInput = IC.Builder->CreateBitCast(VecInput, VecTy);
+      }
+
+      unsigned Elt = ShAmt->getZExtValue() / DestWidth;
+      return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
+    }
+  }
+  return 0;
+}
 
 Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
   // If the operands are integer typed then apply the integer transforms,
@@ -1359,6 +1611,11 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
         ((Instruction*)NULL));
     }
   }
+
+  // Try to optimize int -> float bitcasts.
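+  // For example, assuming a <2 x float> source %v:
+  //   %b = bitcast <2 x float> %v to i64
+  //   %s = lshr i64 %b, 32
+  //   %t = trunc i64 %s to i32
+  //   %f = bitcast i32 %t to float
+  // becomes "extractelement <2 x float> %v, i32 1".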
+ if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy)) + if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this)) + return I; if (const VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) { if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) { @@ -1368,16 +1625,24 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // FIXME: Canonicalize bitcast(insertelement) -> insertelement(bitcast) } - // If this is a cast from an integer to vector, check to see if the input - // is a trunc or zext of a bitcast from vector. If so, we can replace all - // the casts with a shuffle and (potentially) a bitcast. - if (isa<IntegerType>(SrcTy) && (isa<TruncInst>(Src) || isa<ZExtInst>(Src))){ - CastInst *SrcCast = cast<CastInst>(Src); - if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0))) - if (isa<VectorType>(BCIn->getOperand(0)->getType())) - if (Instruction *I = OptimizeVectorResize(BCIn->getOperand(0), + if (isa<IntegerType>(SrcTy)) { + // If this is a cast from an integer to vector, check to see if the input + // is a trunc or zext of a bitcast from vector. If so, we can replace all + // the casts with a shuffle and (potentially) a bitcast. + if (isa<TruncInst>(Src) || isa<ZExtInst>(Src)) { + CastInst *SrcCast = cast<CastInst>(Src); + if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0))) + if (isa<VectorType>(BCIn->getOperand(0)->getType())) + if (Instruction *I = OptimizeVectorResize(BCIn->getOperand(0), cast<VectorType>(DestTy), *this)) - return I; + return I; + } + + // If the input is an 'or' instruction, we may be doing shifts and ors to + // assemble the elements of the vector manually. Try to rip the code out + // and replace it with insertelements. + if (Value *V = OptimizeIntegerToVectorInsertions(CI, *this)) + return ReplaceInstUsesWith(CI, V); } } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 6c00586412ac1..d7e2b72b7fac5 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1374,7 +1374,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, case Instruction::Or: // If bits are being or'd in that are not present in the constant we // are comparing against, then the comparison could never succeed! - if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1))) { + if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) { Constant *NotCI = ConstantExpr::getNot(RHS); if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue()) return ReplaceInstUsesWith(ICI, diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 8933a0b137ab4..b68fbc2db5c9a 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -146,10 +146,14 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { if (TD) { unsigned KnownAlign = GetOrEnforceKnownAlignment(Op, TD->getPrefTypeAlignment(LI.getType())); - if (KnownAlign > - (LI.getAlignment() == 0 ? TD->getABITypeAlignment(LI.getType()) : - LI.getAlignment())) + unsigned LoadAlign = LI.getAlignment(); + unsigned EffectiveLoadAlign = LoadAlign != 0 ? 
LoadAlign : + TD->getABITypeAlignment(LI.getType()); + + if (KnownAlign > EffectiveLoadAlign) LI.setAlignment(KnownAlign); + else if (LoadAlign == 0) + LI.setAlignment(EffectiveLoadAlign); } // load (cast X) --> cast (load X) iff safe. @@ -369,7 +373,7 @@ DbgDeclareInst *InstCombiner::hasOneUsePlusDeclare(Value *V) { if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(U)) return DI; if (isa<BitCastInst>(U) && U->hasOneUse()) { - if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(U->use_begin())) + if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(*U->use_begin())) return DI; } } @@ -411,10 +415,14 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (TD) { unsigned KnownAlign = GetOrEnforceKnownAlignment(Ptr, TD->getPrefTypeAlignment(Val->getType())); - if (KnownAlign > - (SI.getAlignment() == 0 ? TD->getABITypeAlignment(Val->getType()) : - SI.getAlignment())) + unsigned StoreAlign = SI.getAlignment(); + unsigned EffectiveStoreAlign = StoreAlign != 0 ? StoreAlign : + TD->getABITypeAlignment(Val->getType()); + + if (KnownAlign > EffectiveStoreAlign) SI.setAlignment(KnownAlign); + else if (StoreAlign == 0) + SI.setAlignment(EffectiveStoreAlign); } // Do really simple DSE, to catch cases where there are several consecutive diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index f9ffdb10f2660..c44fe9db6e3a7 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -699,34 +699,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { SI.setOperand(2, TrueVal); return &SI; } - - // select (A == 0 | B == 0), T, F--> select (A != 0 & B != 0), F, T - // Note: This is a canonicalization rather than an optimization, and is used - // to expose opportunities to other instcombine transforms. - Instruction* CondInst = dyn_cast<Instruction>(CondVal); - if (CondInst && CondInst->hasOneUse() && - CondInst->getOpcode() == Instruction::Or) { - ICmpInst *LHSCmp = dyn_cast<ICmpInst>(CondInst->getOperand(0)); - ICmpInst *RHSCmp = dyn_cast<ICmpInst>(CondInst->getOperand(1)); - if (LHSCmp && LHSCmp->hasOneUse() && - LHSCmp->getPredicate() == ICmpInst::ICMP_EQ && - RHSCmp && RHSCmp->hasOneUse() && - RHSCmp->getPredicate() == ICmpInst::ICMP_EQ) { - ConstantInt* C1 = dyn_cast<ConstantInt>(LHSCmp->getOperand(1)); - ConstantInt* C2 = dyn_cast<ConstantInt>(RHSCmp->getOperand(1)); - if (C1 && C1->isZero() && C2 && C2->isZero()) { - LHSCmp->setPredicate(ICmpInst::ICMP_NE); - RHSCmp->setPredicate(ICmpInst::ICMP_NE); - Value *And = - InsertNewInstBefore(BinaryOperator::CreateAnd(LHSCmp, RHSCmp, - "and."+CondVal->getName()), SI); - SI.setOperand(0, And); - SI.setOperand(1, FalseVal); - SI.setOperand(2, TrueVal); - return &SI; - } - } - } return 0; } diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index e5ce8a612f3f6..27716b886a226 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -56,10 +56,270 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { return 0; } +/// CanEvaluateShifted - See if we can compute the specified value, but shifted +/// logically to the left or right by some number of bits. This should return +/// true if the expression can be computed for the same cost as the current +/// expression tree. 
This is used to eliminate extraneous shifting from things
+/// like:
+///      %C = shl i128 %A, 64
+///      %D = shl i128 %B, 96
+///      %E = or i128 %C, %D
+///      %F = lshr i128 %E, 64
+/// where the client will ask if E can be computed shifted right by 64 bits.
+/// If this succeeds, the GetShiftedValue function will be called to produce
+/// the value.
+static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
+                               InstCombiner &IC) {
+  // We can always evaluate constants shifted.
+  if (isa<Constant>(V))
+    return true;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+
+  // If this is the opposite shift, we can directly reuse the input of the
+  // shift if the needed bits are already zero in the input.  This allows us
+  // to reuse the value, which means that we don't care if the shift has
+  // multiple uses.
+  // TODO: Handle opposite shift by exact value.
+  ConstantInt *CI;
+  if ((isLeftShift && match(I, m_LShr(m_Value(), m_ConstantInt(CI)))) ||
+      (!isLeftShift && match(I, m_Shl(m_Value(), m_ConstantInt(CI))))) {
+    if (CI->getZExtValue() == NumBits) {
+      // TODO: Check that the input bits are already zero with MaskedValueIsZero
+#if 0
+      // If this is a truncate of a logical shr, we can truncate it to a smaller
+      // lshr iff we know that the bits we would otherwise be shifting in are
+      // already zeros.
+      uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+      uint32_t BitWidth = Ty->getScalarSizeInBits();
+      if (MaskedValueIsZero(I->getOperand(0),
+            APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) &&
+          CI->getLimitedValue(BitWidth) < BitWidth) {
+        return CanEvaluateTruncated(I->getOperand(0), Ty);
+      }
+#endif
+    }
+  }
+
+  // We can't mutate something that has multiple uses: doing so would
+  // require duplicating the instruction in general, which isn't profitable.
+  if (!I->hasOneUse()) return false;
+
+  switch (I->getOpcode()) {
+  default: return false;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // Bitwise operators can all be evaluated shifted.
+    return CanEvaluateShifted(I->getOperand(0), NumBits, isLeftShift, IC) &&
+           CanEvaluateShifted(I->getOperand(1), NumBits, isLeftShift, IC);
+
+  case Instruction::Shl: {
+    // We can often fold the shift into shifts-by-a-constant.
+    CI = dyn_cast<ConstantInt>(I->getOperand(1));
+    if (CI == 0) return false;
+
+    // We can always fold shl(c1)+shl(c2) -> shl(c1+c2).
+    if (isLeftShift) return true;
+
+    // We can always turn shl(c)+shr(c) -> and(c2).
+    if (CI->getValue() == NumBits) return true;
+
+    unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+
+    // We can turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but it isn't
+    // profitable unless we know the and'd out bits are already zero.
+    if (CI->getZExtValue() > NumBits) {
+      unsigned HighBits = CI->getZExtValue() - NumBits;
+      if (MaskedValueIsZero(I->getOperand(0),
+                            APInt::getHighBitsSet(TypeWidth, HighBits)))
+        return true;
+    }
+
+    return false;
+  }
+  case Instruction::LShr: {
+    // We can often fold the shift into shifts-by-a-constant.
+    CI = dyn_cast<ConstantInt>(I->getOperand(1));
+    if (CI == 0) return false;
+
+    // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
+    if (!isLeftShift) return true;
+
+    // We can always turn lshr(c)+shl(c) -> and(c2).
+    if (CI->getValue() == NumBits) return true;
+
+    unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+
+    // We can turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but it isn't
+    // profitable unless we know the and'd out bits are already zero.
+    if (CI->getZExtValue() > NumBits) {
+      unsigned LowBits = CI->getZExtValue() - NumBits;
+      if (MaskedValueIsZero(I->getOperand(0),
+                            APInt::getLowBitsSet(TypeWidth, LowBits)))
+        return true;
+    }
+
+    return false;
+  }
+  case Instruction::Select: {
+    SelectInst *SI = cast<SelectInst>(I);
+    return CanEvaluateShifted(SI->getTrueValue(), NumBits, isLeftShift, IC) &&
+           CanEvaluateShifted(SI->getFalseValue(), NumBits, isLeftShift, IC);
+  }
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.  Note that we never
+    // get into trouble with cyclic PHIs here because we only consider
+    // instructions with a single use.
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!CanEvaluateShifted(PN->getIncomingValue(i), NumBits, isLeftShift,IC))
+        return false;
+    return true;
+  }
+  }
+}
+
+/// GetShiftedValue - When CanEvaluateShifted returned true for an expression,
+/// this function inserts the new computation that produces the shifted value.
+static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
+                              InstCombiner &IC) {
+  // We can always evaluate constants shifted.
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    if (isLeftShift)
+      V = IC.Builder->CreateShl(C, NumBits);
+    else
+      V = IC.Builder->CreateLShr(C, NumBits);
+    // If we got a constantexpr back, try to simplify it with TD info.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+      V = ConstantFoldConstantExpression(CE, IC.getTargetData());
+    return V;
+  }
+
+  Instruction *I = cast<Instruction>(V);
+  IC.Worklist.Add(I);
+
+  switch (I->getOpcode()) {
+  default: assert(0 && "Inconsistency with CanEvaluateShifted");
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // Bitwise operators can all be evaluated shifted.
+    I->setOperand(0, GetShiftedValue(I->getOperand(0), NumBits,isLeftShift,IC));
+    I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC));
+    return I;
+
+  case Instruction::Shl: {
+    unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+
+    // We only accept shifts-by-a-constant in CanEvaluateShifted.
+    ConstantInt *CI = cast<ConstantInt>(I->getOperand(1));
+
+    // We can always fold shl(c1)+shl(c2) -> shl(c1+c2).
+    if (isLeftShift) {
+      // If this is an oversized composite shift, then unsigned shifts get 0.
+      unsigned NewShAmt = NumBits+CI->getZExtValue();
+      if (NewShAmt >= TypeWidth)
+        return Constant::getNullValue(I->getType());
+
+      I->setOperand(1, ConstantInt::get(I->getType(), NewShAmt));
+      return I;
+    }
+
+    // We turn shl(c)+lshr(c) -> and(c2), in case the input doesn't already
+    // have zeros in the high bits.
+    if (CI->getValue() == NumBits) {
+      APInt Mask(APInt::getLowBitsSet(TypeWidth, TypeWidth - NumBits));
+      V = IC.Builder->CreateAnd(I->getOperand(0),
+                                ConstantInt::get(I->getContext(), Mask));
+      if (Instruction *VI = dyn_cast<Instruction>(V)) {
+        VI->moveBefore(I);
+        VI->takeName(I);
+      }
+      return V;
+    }
+
+    // We turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but only when we know that
+    // the and won't be needed.
+    assert(CI->getZExtValue() > NumBits);
+    I->setOperand(1, ConstantInt::get(I->getType(),
+                                      CI->getZExtValue() - NumBits));
+    return I;
+  }
+  case Instruction::LShr: {
+    unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+    // We only accept shifts-by-a-constant in CanEvaluateShifted.
+    ConstantInt *CI = cast<ConstantInt>(I->getOperand(1));
+
+    // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
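+    // (For example, an lshr by 3 whose result must itself be shifted right
+    // by 5 simply becomes an lshr by 8; the oversized case is handled below
+    // by returning zero.)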
+    if (!isLeftShift) {
+      // If this is an oversized composite shift, then unsigned shifts get 0.
+      unsigned NewShAmt = NumBits+CI->getZExtValue();
+      if (NewShAmt >= TypeWidth)
+        return Constant::getNullValue(I->getType());
+
+      I->setOperand(1, ConstantInt::get(I->getType(), NewShAmt));
+      return I;
+    }
+
+    // We turn lshr(c)+shl(c) -> and(c2), in case the input doesn't already
+    // have zeros in the low bits.
+    if (CI->getValue() == NumBits) {
+      APInt Mask(APInt::getHighBitsSet(TypeWidth, TypeWidth - NumBits));
+      V = IC.Builder->CreateAnd(I->getOperand(0),
+                                ConstantInt::get(I->getContext(), Mask));
+      if (Instruction *VI = dyn_cast<Instruction>(V)) {
+        VI->moveBefore(I);
+        VI->takeName(I);
+      }
+      return V;
+    }
+
+    // We turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but only when we know
+    // that the and won't be needed.
+    assert(CI->getZExtValue() > NumBits);
+    I->setOperand(1, ConstantInt::get(I->getType(),
+                                      CI->getZExtValue() - NumBits));
+    return I;
+  }
+
+  case Instruction::Select:
+    I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC));
+    I->setOperand(2, GetShiftedValue(I->getOperand(2), NumBits,isLeftShift,IC));
+    return I;
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.  Note that we never
+    // get into trouble with cyclic PHIs here because we only consider
+    // instructions with a single use.
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      PN->setIncomingValue(i, GetShiftedValue(PN->getIncomingValue(i),
+                                              NumBits, isLeftShift, IC));
+    return PN;
+  }
+  }
+}
+
 Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
                                                BinaryOperator &I) {
   bool isLeftShift = I.getOpcode() == Instruction::Shl;
-
+
+  // See if we can propagate this shift into the input; this covers the trivial
+  // case of lshr(shl(x,c1),c2) as well as other more complex cases.
+  if (I.getOpcode() != Instruction::AShr &&
+      CanEvaluateShifted(Op0, Op1->getZExtValue(), isLeftShift, *this)) {
+    DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression"
+              " to eliminate shift:\n  IN: " << *Op0 << "\n  SH: " << I <<"\n");
+
+    return ReplaceInstUsesWith(I,
+                 GetShiftedValue(Op0, Op1->getZExtValue(), isLeftShift, *this));
+  }
+
   // See if we can simplify any instructions used by the instruction whose sole
   // purpose is to compute bits we don't care about.
   uint32_t TypeBits = Op0->getType()->getScalarSizeInBits();
@@ -288,39 +548,17 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
                                             ConstantInt::get(Ty, AmtSum));
     }
-
-    if (ShiftOp->getOpcode() == Instruction::LShr &&
-        I.getOpcode() == Instruction::AShr) {
-      if (AmtSum >= TypeBits)
-        return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
-
-      // ((X >>u C1) >>s C2) -> (X >>u (C1+C2))  since C1 != 0.
-      return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
-    }
-
-    if (ShiftOp->getOpcode() == Instruction::AShr &&
-        I.getOpcode() == Instruction::LShr) {
-      // ((X >>s C1) >>u C2) -> ((X >>s (C1+C2)) & mask)  since C1 != 0.
-      if (AmtSum >= TypeBits)
-        AmtSum = TypeBits-1;
-
-      Value *Shift = Builder->CreateAShr(X, ConstantInt::get(Ty, AmtSum));
-
-      APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
-      return BinaryOperator::CreateAnd(Shift,
-                                       ConstantInt::get(I.getContext(), Mask));
-    }
-
-    // Okay, if we get here, one shift must be left, and the other shift must be
-    // right.  See if the amounts are equal.
   if (ShiftAmt1 == ShiftAmt2) {
     // If we have ((X >>? C) << C), turn this into X & (-1 << C).
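     // (For example, ((X >>u 4) << 4) on i32 becomes X & 0xFFFFFFF0; the same
     // holds when the inner shift is an ashr.)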
- if (I.getOpcode() == Instruction::Shl) { + if (I.getOpcode() == Instruction::Shl && + ShiftOp->getOpcode() != Instruction::Shl) { APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt1)); return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(),Mask)); } // If we have ((X << C) >>u C), turn this into X & (-1 >>u C). - if (I.getOpcode() == Instruction::LShr) { + if (I.getOpcode() == Instruction::LShr && + ShiftOp->getOpcode() == Instruction::Shl) { APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt1)); return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(), Mask)); @@ -329,7 +567,8 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, uint32_t ShiftDiff = ShiftAmt2-ShiftAmt1; // (X >>? C1) << C2 --> X << (C2-C1) & (-1 << C2) - if (I.getOpcode() == Instruction::Shl) { + if (I.getOpcode() == Instruction::Shl && + ShiftOp->getOpcode() != Instruction::Shl) { assert(ShiftOp->getOpcode() == Instruction::LShr || ShiftOp->getOpcode() == Instruction::AShr); Value *Shift = Builder->CreateShl(X, ConstantInt::get(Ty, ShiftDiff)); @@ -340,7 +579,8 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, } // (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2) - if (I.getOpcode() == Instruction::LShr) { + if (I.getOpcode() == Instruction::LShr && + ShiftOp->getOpcode() == Instruction::Shl) { assert(ShiftOp->getOpcode() == Instruction::Shl); Value *Shift = Builder->CreateLShr(X, ConstantInt::get(Ty, ShiftDiff)); @@ -355,9 +595,8 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, uint32_t ShiftDiff = ShiftAmt1-ShiftAmt2; // (X >>? C1) << C2 --> X >>? (C1-C2) & (-1 << C2) - if (I.getOpcode() == Instruction::Shl) { - assert(ShiftOp->getOpcode() == Instruction::LShr || - ShiftOp->getOpcode() == Instruction::AShr); + if (I.getOpcode() == Instruction::Shl && + ShiftOp->getOpcode() != Instruction::Shl) { Value *Shift = Builder->CreateBinOp(ShiftOp->getOpcode(), X, ConstantInt::get(Ty, ShiftDiff)); @@ -367,8 +606,8 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, } // (X << C1) >>u C2 --> X << (C1-C2) & (-1 >> C2) - if (I.getOpcode() == Instruction::LShr) { - assert(ShiftOp->getOpcode() == Instruction::Shl); + if (I.getOpcode() == Instruction::LShr && + ShiftOp->getOpcode() == Instruction::Shl) { Value *Shift = Builder->CreateShl(X, ConstantInt::get(Ty, ShiftDiff)); APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2)); diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index af2958fe3d910..e46c67994e2b0 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -60,8 +60,8 @@ STATISTIC(NumSunkInst , "Number of instructions sunk"); char InstCombiner::ID = 0; -static RegisterPass<InstCombiner> -X("instcombine", "Combine redundant instructions"); +INITIALIZE_PASS(InstCombiner, "instcombine", + "Combine redundant instructions", false, false); void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreservedID(LCSSAID); diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp index 9ae3786707153..a77d70cd1c1bc 100644 --- a/lib/Transforms/Instrumentation/EdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/EdgeProfiling.cpp @@ -34,7 +34,7 @@ namespace { bool runOnModule(Module &M); public: static char ID; // Pass identification, replacement for typeid - 
EdgeProfiler() : ModulePass(&ID) {} + EdgeProfiler() : ModulePass(ID) {} virtual const char *getPassName() const { return "Edge Profiler"; @@ -43,8 +43,8 @@ namespace { } char EdgeProfiler::ID = 0; -static RegisterPass<EdgeProfiler> -X("insert-edge-profiling", "Insert instrumentation for edge profiling"); +INITIALIZE_PASS(EdgeProfiler, "insert-edge-profiling", + "Insert instrumentation for edge profiling", false, false); ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); } diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp index 41e3a39f2685b..8eec9872812dc 100644 --- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp @@ -36,7 +36,7 @@ namespace { bool runOnModule(Module &M); public: static char ID; // Pass identification, replacement for typeid - OptimalEdgeProfiler() : ModulePass(&ID) {} + OptimalEdgeProfiler() : ModulePass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredID(ProfileEstimatorPassID); @@ -50,9 +50,9 @@ namespace { } char OptimalEdgeProfiler::ID = 0; -static RegisterPass<OptimalEdgeProfiler> -X("insert-optimal-edge-profiling", - "Insert optimal instrumentation for edge profiling"); +INITIALIZE_PASS(OptimalEdgeProfiler, "insert-optimal-edge-profiling", + "Insert optimal instrumentation for edge profiling", + false, false); ModulePass *llvm::createOptimalEdgeProfilerPass() { return new OptimalEdgeProfiler(); diff --git a/lib/Transforms/Scalar/ABCD.cpp b/lib/Transforms/Scalar/ABCD.cpp deleted file mode 100644 index dcf14a6860da5..0000000000000 --- a/lib/Transforms/Scalar/ABCD.cpp +++ /dev/null @@ -1,1112 +0,0 @@ -//===------- ABCD.cpp - Removes redundant conditional branches ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass removes redundant branch instructions. This algorithm was -// described by Rastislav Bodik, Rajiv Gupta and Vivek Sarkar in their paper -// "ABCD: Eliminating Array Bounds Checks on Demand (2000)". The original -// Algorithm was created to remove array bound checks for strongly typed -// languages. This implementation expands the idea and removes any conditional -// branches that can be proved redundant, not only those used in array bound -// checks. With the SSI representation, each variable has a -// constraint. By analyzing these constraints we can prove that a branch is -// redundant. When a branch is proved redundant it means that -// one direction will always be taken; thus, we can change this branch into an -// unconditional jump. -// It is advisable to run SimplifyCFG and Aggressive Dead Code Elimination -// after ABCD to clean up the code. -// This implementation was created based on the implementation of the ABCD -// algorithm implemented for the compiler Jitrino. 
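-// For example, in
-//   if (i < n) { ...  if (i < n) { ... } }
-// the inner test is implied by the outer one, so ABCD can prove the inner
-// branch redundant and turn it into an unconditional jump.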
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "abcd" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/OwningPtr.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/Pass.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Debug.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/SSI.h" - -using namespace llvm; - -STATISTIC(NumBranchTested, "Number of conditional branches analyzed"); -STATISTIC(NumBranchRemoved, "Number of conditional branches removed"); - -namespace { - -class ABCD : public FunctionPass { - public: - static char ID; // Pass identification, replacement for typeid. - ABCD() : FunctionPass(&ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<SSI>(); - } - - bool runOnFunction(Function &F); - - private: - /// Keep track of whether we've modified the program yet. - bool modified; - - enum ProveResult { - False = 0, - Reduced = 1, - True = 2 - }; - - typedef ProveResult (*meet_function)(ProveResult, ProveResult); - static ProveResult max(ProveResult res1, ProveResult res2) { - return (ProveResult) std::max(res1, res2); - } - static ProveResult min(ProveResult res1, ProveResult res2) { - return (ProveResult) std::min(res1, res2); - } - - class Bound { - public: - Bound(APInt v, bool upper) : value(v), upper_bound(upper) {} - Bound(const Bound &b, int cnst) - : value(b.value - cnst), upper_bound(b.upper_bound) {} - Bound(const Bound &b, const APInt &cnst) - : value(b.value - cnst), upper_bound(b.upper_bound) {} - - /// Test if Bound is an upper bound - bool isUpperBound() const { return upper_bound; } - - /// Get the bitwidth of this bound - int32_t getBitWidth() const { return value.getBitWidth(); } - - /// Creates a Bound incrementing the one received - static Bound createIncrement(const Bound &b) { - return Bound(b.isUpperBound() ? b.value+1 : b.value-1, - b.upper_bound); - } - - /// Creates a Bound decrementing the one received - static Bound createDecrement(const Bound &b) { - return Bound(b.isUpperBound() ? b.value-1 : b.value+1, - b.upper_bound); - } - - /// Test if two bounds are equal - static bool eq(const Bound *a, const Bound *b) { - if (!a || !b) return false; - - assert(a->isUpperBound() == b->isUpperBound()); - return a->value == b->value; - } - - /// Test if val is less than or equal to Bound b - static bool leq(APInt val, const Bound &b) { - return b.isUpperBound() ? val.sle(b.value) : val.sge(b.value); - } - - /// Test if Bound a is less then or equal to Bound - static bool leq(const Bound &a, const Bound &b) { - assert(a.isUpperBound() == b.isUpperBound()); - return a.isUpperBound() ? a.value.sle(b.value) : - a.value.sge(b.value); - } - - /// Test if Bound a is less then Bound b - static bool lt(const Bound &a, const Bound &b) { - assert(a.isUpperBound() == b.isUpperBound()); - return a.isUpperBound() ? a.value.slt(b.value) : - a.value.sgt(b.value); - } - - /// Test if Bound b is greater then or equal val - static bool geq(const Bound &b, APInt val) { - return leq(val, b); - } - - /// Test if Bound a is greater then or equal Bound b - static bool geq(const Bound &a, const Bound &b) { - return leq(b, a); - } - - private: - APInt value; - bool upper_bound; - }; - - /// This class is used to store results some parts of the graph, - /// so information does not need to be recalculated. 
The maximum false, - /// minimum true and minimum reduced results are stored - class MemoizedResultChart { - public: - MemoizedResultChart() {} - MemoizedResultChart(const MemoizedResultChart &other) { - if (other.max_false) - max_false.reset(new Bound(*other.max_false)); - if (other.min_true) - min_true.reset(new Bound(*other.min_true)); - if (other.min_reduced) - min_reduced.reset(new Bound(*other.min_reduced)); - } - - /// Returns the max false - const Bound *getFalse() const { return max_false.get(); } - - /// Returns the min true - const Bound *getTrue() const { return min_true.get(); } - - /// Returns the min reduced - const Bound *getReduced() const { return min_reduced.get(); } - - /// Return the stored result for this bound - ProveResult getResult(const Bound &bound) const; - - /// Stores a false found - void addFalse(const Bound &bound); - - /// Stores a true found - void addTrue(const Bound &bound); - - /// Stores a Reduced found - void addReduced(const Bound &bound); - - /// Clears redundant reduced - /// If a min_true is smaller than a min_reduced then the min_reduced - /// is unnecessary and then removed. It also works for min_reduced - /// begin smaller than max_false. - void clearRedundantReduced(); - - void clear() { - max_false.reset(); - min_true.reset(); - min_reduced.reset(); - } - - private: - OwningPtr<Bound> max_false, min_true, min_reduced; - }; - - /// This class stores the result found for a node of the graph, - /// so these results do not need to be recalculated, only searched for. - class MemoizedResult { - public: - /// Test if there is true result stored from b to a - /// that is less then the bound - bool hasTrue(Value *b, const Bound &bound) const { - const Bound *trueBound = map.lookup(b).getTrue(); - return trueBound && Bound::leq(*trueBound, bound); - } - - /// Test if there is false result stored from b to a - /// that is less then the bound - bool hasFalse(Value *b, const Bound &bound) const { - const Bound *falseBound = map.lookup(b).getFalse(); - return falseBound && Bound::leq(*falseBound, bound); - } - - /// Test if there is reduced result stored from b to a - /// that is less then the bound - bool hasReduced(Value *b, const Bound &bound) const { - const Bound *reducedBound = map.lookup(b).getReduced(); - return reducedBound && Bound::leq(*reducedBound, bound); - } - - /// Returns the stored bound for b - ProveResult getBoundResult(Value *b, const Bound &bound) { - return map[b].getResult(bound); - } - - /// Clears the map - void clear() { - DenseMapIterator<Value*, MemoizedResultChart> begin = map.begin(); - DenseMapIterator<Value*, MemoizedResultChart> end = map.end(); - for (; begin != end; ++begin) { - begin->second.clear(); - } - map.clear(); - } - - /// Stores the bound found - void updateBound(Value *b, const Bound &bound, const ProveResult res); - - private: - // Maps a nod in the graph with its results found. - DenseMap<Value*, MemoizedResultChart> map; - }; - - /// This class represents an edge in the inequality graph used by the - /// ABCD algorithm. An edge connects node v to node u with a value c if - /// we could infer a constraint v <= u + c in the source program. 
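-  /// (For instance, inferring "x <= y + 3" produces an edge between y and
-  /// x carrying the weight 3.)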
- class Edge { - public: - Edge(Value *V, APInt val, bool upper) - : vertex(V), value(val), upper_bound(upper) {} - - Value *getVertex() const { return vertex; } - const APInt &getValue() const { return value; } - bool isUpperBound() const { return upper_bound; } - - private: - Value *vertex; - APInt value; - bool upper_bound; - }; - - /// Weighted and Directed graph to represent constraints. - /// There is one type of constraint, a <= b + X, which will generate an - /// edge from b to a with weight X. - class InequalityGraph { - public: - - /// Adds an edge from V_from to V_to with weight value - void addEdge(Value *V_from, Value *V_to, APInt value, bool upper); - - /// Test if there is a node V - bool hasNode(Value *V) const { return graph.count(V); } - - /// Test if there is any edge from V in the upper direction - bool hasEdge(Value *V, bool upper) const; - - /// Returns all edges pointed by vertex V - SmallVector<Edge, 16> getEdges(Value *V) const { - return graph.lookup(V); - } - - /// Prints the graph in dot format. - /// Blue edges represent upper bound and Red lower bound. - void printGraph(raw_ostream &OS, Function &F) const { - printHeader(OS, F); - printBody(OS); - printFooter(OS); - } - - /// Clear the graph - void clear() { - graph.clear(); - } - - private: - DenseMap<Value *, SmallVector<Edge, 16> > graph; - - /// Prints the header of the dot file - void printHeader(raw_ostream &OS, Function &F) const; - - /// Prints the footer of the dot file - void printFooter(raw_ostream &OS) const { - OS << "}\n"; - } - - /// Prints the body of the dot file - void printBody(raw_ostream &OS) const; - - /// Prints vertex source to the dot file - void printVertex(raw_ostream &OS, Value *source) const; - - /// Prints the edge to the dot file - void printEdge(raw_ostream &OS, Value *source, const Edge &edge) const; - - void printName(raw_ostream &OS, Value *info) const; - }; - - /// Iterates through all BasicBlocks, if the Terminator Instruction - /// uses an Comparator Instruction, all operands of this comparator - /// are sent to be transformed to SSI. Only Instruction operands are - /// transformed. - void createSSI(Function &F); - - /// Creates the graphs for this function. - /// It will look for all comparators used in branches, and create them. - /// These comparators will create constraints for any instruction as an - /// operand. - void executeABCD(Function &F); - - /// Seeks redundancies in the comparator instruction CI. - /// If the ABCD algorithm can prove that the comparator CI always - /// takes one way, then the Terminator Instruction TI is substituted from - /// a conditional branch to a unconditional one. - /// This code basically receives a comparator, and verifies which kind of - /// instruction it is. Depending on the kind of instruction, we use different - /// strategies to prove its redundancy. - void seekRedundancy(ICmpInst *ICI, TerminatorInst *TI); - - /// Substitutes Terminator Instruction TI, that is a conditional branch, - /// with one unconditional branch. Succ_edge determines if the new - /// unconditional edge will be the first or second edge of the former TI - /// instruction. - void removeRedundancy(TerminatorInst *TI, bool Succ_edge); - - /// When an conditional branch is removed, the BasicBlock that is no longer - /// reachable will have problems in phi functions. This method fixes these - /// phis removing the former BasicBlock from the list of incoming BasicBlocks - /// of all phis. 
In case the phi remains with no predecessor it will be - /// marked to be removed later. - void fixPhi(BasicBlock *BB, BasicBlock *Succ); - - /// Removes phis that have no predecessor - void removePhis(); - - /// Creates constraints for Instructions. - /// If the constraint for this instruction has already been created - /// nothing is done. - void createConstraintInstruction(Instruction *I); - - /// Creates constraints for Binary Operators. - /// It will create constraints only for addition and subtraction, - /// the other binary operations are not treated by ABCD. - /// For additions in the form a = b + X and a = X + b, where X is a constant, - /// the constraint a <= b + X can be obtained. For this constraint, an edge - /// a->b with weight X is added to the lower bound graph, and an edge - /// b->a with weight -X is added to the upper bound graph. - /// Only subtractions in the format a = b - X is used by ABCD. - /// Edges are created using the same semantic as addition. - void createConstraintBinaryOperator(BinaryOperator *BO); - - /// Creates constraints for Comparator Instructions. - /// Only comparators that have any of the following operators - /// are used to create constraints: >=, >, <=, <. And only if - /// at least one operand is an Instruction. In a Comparator Instruction - /// a op b, there will be 4 sigma functions a_t, a_f, b_t and b_f. Where - /// t and f represent sigma for operands in true and false branches. The - /// following constraints can be obtained. a_t <= a, a_f <= a, b_t <= b and - /// b_f <= b. There are two more constraints that depend on the operator. - /// For the operator <= : a_t <= b_t and b_f <= a_f-1 - /// For the operator < : a_t <= b_t-1 and b_f <= a_f - /// For the operator >= : b_t <= a_t and a_f <= b_f-1 - /// For the operator > : b_t <= a_t-1 and a_f <= b_f - void createConstraintCmpInst(ICmpInst *ICI, TerminatorInst *TI); - - /// Creates constraints for PHI nodes. - /// In a PHI node a = phi(b,c) we can create the constraint - /// a<= max(b,c). With this constraint there will be the edges, - /// b->a and c->a with weight 0 in the lower bound graph, and the edges - /// a->b and a->c with weight 0 in the upper bound graph. - void createConstraintPHINode(PHINode *PN); - - /// Given a binary operator, we are only interest in the case - /// that one operand is an Instruction and the other is a ConstantInt. In - /// this case the method returns true, otherwise false. It also obtains the - /// Instruction and ConstantInt from the BinaryOperator and returns it. - bool createBinaryOperatorInfo(BinaryOperator *BO, Instruction **I1, - Instruction **I2, ConstantInt **C1, - ConstantInt **C2); - - /// This method creates a constraint between a Sigma and an Instruction. - /// These constraints are created as soon as we find a comparator that uses a - /// SSI variable. - void createConstraintSigInst(Instruction *I_op, BasicBlock *BB_succ_t, - BasicBlock *BB_succ_f, PHINode **SIG_op_t, - PHINode **SIG_op_f); - - /// If PN_op1 and PN_o2 are different from NULL, create a constraint - /// PN_op2 -> PN_op1 with value. In case any of them is NULL, replace - /// with the respective V_op#, if V_op# is a ConstantInt. - void createConstraintSigSig(PHINode *SIG_op1, PHINode *SIG_op2, - ConstantInt *V_op1, ConstantInt *V_op2, - APInt value); - - /// Returns the sigma representing the Instruction I in BasicBlock BB. - /// Returns NULL in case there is no sigma for this Instruction in this - /// Basic Block. 
This methods assume that sigmas are the first instructions - /// in a block, and that there can be only two sigmas in a block. So it will - /// only look on the first two instructions of BasicBlock BB. - PHINode *findSigma(BasicBlock *BB, Instruction *I); - - /// Original ABCD algorithm to prove redundant checks. - /// This implementation works on any kind of inequality branch. - bool demandProve(Value *a, Value *b, int c, bool upper_bound); - - /// Prove that distance between b and a is <= bound - ProveResult prove(Value *a, Value *b, const Bound &bound, unsigned level); - - /// Updates the distance value for a and b - void updateMemDistance(Value *a, Value *b, const Bound &bound, unsigned level, - meet_function meet); - - InequalityGraph inequality_graph; - MemoizedResult mem_result; - DenseMap<Value*, const Bound*> active; - SmallPtrSet<Value*, 16> created; - SmallVector<PHINode *, 16> phis_to_remove; -}; - -} // end anonymous namespace. - -char ABCD::ID = 0; -static RegisterPass<ABCD> X("abcd", "ABCD: Eliminating Array Bounds Checks on Demand"); - - -bool ABCD::runOnFunction(Function &F) { - modified = false; - createSSI(F); - executeABCD(F); - DEBUG(inequality_graph.printGraph(dbgs(), F)); - removePhis(); - - inequality_graph.clear(); - mem_result.clear(); - active.clear(); - created.clear(); - phis_to_remove.clear(); - return modified; -} - -/// Iterates through all BasicBlocks, if the Terminator Instruction -/// uses an Comparator Instruction, all operands of this comparator -/// are sent to be transformed to SSI. Only Instruction operands are -/// transformed. -void ABCD::createSSI(Function &F) { - SSI *ssi = &getAnalysis<SSI>(); - - SmallVector<Instruction *, 16> Insts; - - for (Function::iterator begin = F.begin(), end = F.end(); - begin != end; ++begin) { - BasicBlock *BB = begin; - TerminatorInst *TI = BB->getTerminator(); - if (TI->getNumOperands() == 0) - continue; - - if (ICmpInst *ICI = dyn_cast<ICmpInst>(TI->getOperand(0))) { - if (Instruction *I = dyn_cast<Instruction>(ICI->getOperand(0))) { - modified = true; // XXX: but yet createSSI might do nothing - Insts.push_back(I); - } - if (Instruction *I = dyn_cast<Instruction>(ICI->getOperand(1))) { - modified = true; - Insts.push_back(I); - } - } - } - ssi->createSSI(Insts); -} - -/// Creates the graphs for this function. -/// It will look for all comparators used in branches, and create them. -/// These comparators will create constraints for any instruction as an -/// operand. -void ABCD::executeABCD(Function &F) { - for (Function::iterator begin = F.begin(), end = F.end(); - begin != end; ++begin) { - BasicBlock *BB = begin; - TerminatorInst *TI = BB->getTerminator(); - if (TI->getNumOperands() == 0) - continue; - - ICmpInst *ICI = dyn_cast<ICmpInst>(TI->getOperand(0)); - if (!ICI || !ICI->getOperand(0)->getType()->isIntegerTy()) - continue; - - createConstraintCmpInst(ICI, TI); - seekRedundancy(ICI, TI); - } -} - -/// Seeks redundancies in the comparator instruction CI. -/// If the ABCD algorithm can prove that the comparator CI always -/// takes one way, then the Terminator Instruction TI is substituted from -/// a conditional branch to a unconditional one. -/// This code basically receives a comparator, and verifies which kind of -/// instruction it is. Depending on the kind of instruction, we use different -/// strategies to prove its redundancy. 
-void ABCD::seekRedundancy(ICmpInst *ICI, TerminatorInst *TI) { - CmpInst::Predicate Pred = ICI->getPredicate(); - - Value *source, *dest; - int distance1, distance2; - bool upper; - - switch(Pred) { - case CmpInst::ICMP_SGT: // signed greater than - upper = false; - distance1 = 1; - distance2 = 0; - break; - - case CmpInst::ICMP_SGE: // signed greater or equal - upper = false; - distance1 = 0; - distance2 = -1; - break; - - case CmpInst::ICMP_SLT: // signed less than - upper = true; - distance1 = -1; - distance2 = 0; - break; - - case CmpInst::ICMP_SLE: // signed less or equal - upper = true; - distance1 = 0; - distance2 = 1; - break; - - default: - return; - } - - ++NumBranchTested; - source = ICI->getOperand(0); - dest = ICI->getOperand(1); - if (demandProve(dest, source, distance1, upper)) { - removeRedundancy(TI, true); - } else if (demandProve(dest, source, distance2, !upper)) { - removeRedundancy(TI, false); - } -} - -/// Substitutes Terminator Instruction TI, that is a conditional branch, -/// with one unconditional branch. Succ_edge determines if the new -/// unconditional edge will be the first or second edge of the former TI -/// instruction. -void ABCD::removeRedundancy(TerminatorInst *TI, bool Succ_edge) { - BasicBlock *Succ; - if (Succ_edge) { - Succ = TI->getSuccessor(0); - fixPhi(TI->getParent(), TI->getSuccessor(1)); - } else { - Succ = TI->getSuccessor(1); - fixPhi(TI->getParent(), TI->getSuccessor(0)); - } - - BranchInst::Create(Succ, TI); - TI->eraseFromParent(); // XXX: invoke - ++NumBranchRemoved; - modified = true; -} - -/// When an conditional branch is removed, the BasicBlock that is no longer -/// reachable will have problems in phi functions. This method fixes these -/// phis removing the former BasicBlock from the list of incoming BasicBlocks -/// of all phis. In case the phi remains with no predecessor it will be -/// marked to be removed later. -void ABCD::fixPhi(BasicBlock *BB, BasicBlock *Succ) { - BasicBlock::iterator begin = Succ->begin(); - while (PHINode *PN = dyn_cast<PHINode>(begin++)) { - PN->removeIncomingValue(BB, false); - if (PN->getNumIncomingValues() == 0) - phis_to_remove.push_back(PN); - } -} - -/// Removes phis that have no predecessor -void ABCD::removePhis() { - for (unsigned i = 0, e = phis_to_remove.size(); i != e; ++i) { - PHINode *PN = phis_to_remove[i]; - PN->replaceAllUsesWith(UndefValue::get(PN->getType())); - PN->eraseFromParent(); - } -} - -/// Creates constraints for Instructions. -/// If the constraint for this instruction has already been created -/// nothing is done. -void ABCD::createConstraintInstruction(Instruction *I) { - // Test if this instruction has not been created before - if (created.insert(I)) { - if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { - createConstraintBinaryOperator(BO); - } else if (PHINode *PN = dyn_cast<PHINode>(I)) { - createConstraintPHINode(PN); - } - } -} - -/// Creates constraints for Binary Operators. -/// It will create constraints only for addition and subtraction, -/// the other binary operations are not treated by ABCD. -/// For additions in the form a = b + X and a = X + b, where X is a constant, -/// the constraint a <= b + X can be obtained. For this constraint, an edge -/// a->b with weight X is added to the lower bound graph, and an edge -/// b->a with weight -X is added to the upper bound graph. -/// Only subtractions in the format a = b - X is used by ABCD. -/// Edges are created using the same semantic as addition. 
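-/// For instance, "a = add i32 %b, 5" yields the edge a->b with weight 5 in
-/// the lower bound graph and the edge b->a with weight -5 in the upper
-/// bound graph, both encoding a <= b + 5.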
-void ABCD::createConstraintBinaryOperator(BinaryOperator *BO) { - Instruction *I1 = NULL, *I2 = NULL; - ConstantInt *CI1 = NULL, *CI2 = NULL; - - // Test if an operand is an Instruction and the other is a Constant - if (!createBinaryOperatorInfo(BO, &I1, &I2, &CI1, &CI2)) - return; - - Instruction *I = 0; - APInt value; - - switch (BO->getOpcode()) { - case Instruction::Add: - if (I1) { - I = I1; - value = CI2->getValue(); - } else if (I2) { - I = I2; - value = CI1->getValue(); - } - break; - - case Instruction::Sub: - // Instructions like a = X-b, where X is a constant are not represented - // in the graph. - if (!I1) - return; - - I = I1; - value = -CI2->getValue(); - break; - - default: - return; - } - - inequality_graph.addEdge(I, BO, value, true); - inequality_graph.addEdge(BO, I, -value, false); - createConstraintInstruction(I); -} - -/// Given a binary operator, we are only interest in the case -/// that one operand is an Instruction and the other is a ConstantInt. In -/// this case the method returns true, otherwise false. It also obtains the -/// Instruction and ConstantInt from the BinaryOperator and returns it. -bool ABCD::createBinaryOperatorInfo(BinaryOperator *BO, Instruction **I1, - Instruction **I2, ConstantInt **C1, - ConstantInt **C2) { - Value *op1 = BO->getOperand(0); - Value *op2 = BO->getOperand(1); - - if ((*I1 = dyn_cast<Instruction>(op1))) { - if ((*C2 = dyn_cast<ConstantInt>(op2))) - return true; // First is Instruction and second ConstantInt - - return false; // Both are Instruction - } else { - if ((*C1 = dyn_cast<ConstantInt>(op1)) && - (*I2 = dyn_cast<Instruction>(op2))) - return true; // First is ConstantInt and second Instruction - - return false; // Both are not Instruction - } -} - -/// Creates constraints for Comparator Instructions. -/// Only comparators that have any of the following operators -/// are used to create constraints: >=, >, <=, <. And only if -/// at least one operand is an Instruction. In a Comparator Instruction -/// a op b, there will be 4 sigma functions a_t, a_f, b_t and b_f. Where -/// t and f represent sigma for operands in true and false branches. The -/// following constraints can be obtained. a_t <= a, a_f <= a, b_t <= b and -/// b_f <= b. There are two more constraints that depend on the operator. 
-/// For the operator <= : a_t <= b_t and b_f <= a_f-1 -/// For the operator < : a_t <= b_t-1 and b_f <= a_f -/// For the operator >= : b_t <= a_t and a_f <= b_f-1 -/// For the operator > : b_t <= a_t-1 and a_f <= b_f -void ABCD::createConstraintCmpInst(ICmpInst *ICI, TerminatorInst *TI) { - Value *V_op1 = ICI->getOperand(0); - Value *V_op2 = ICI->getOperand(1); - - if (!V_op1->getType()->isIntegerTy()) - return; - - Instruction *I_op1 = dyn_cast<Instruction>(V_op1); - Instruction *I_op2 = dyn_cast<Instruction>(V_op2); - - // Test if at least one operand is an Instruction - if (!I_op1 && !I_op2) - return; - - BasicBlock *BB_succ_t = TI->getSuccessor(0); - BasicBlock *BB_succ_f = TI->getSuccessor(1); - - PHINode *SIG_op1_t = NULL, *SIG_op1_f = NULL, - *SIG_op2_t = NULL, *SIG_op2_f = NULL; - - createConstraintSigInst(I_op1, BB_succ_t, BB_succ_f, &SIG_op1_t, &SIG_op1_f); - createConstraintSigInst(I_op2, BB_succ_t, BB_succ_f, &SIG_op2_t, &SIG_op2_f); - - int32_t width = cast<IntegerType>(V_op1->getType())->getBitWidth(); - APInt MinusOne = APInt::getAllOnesValue(width); - APInt Zero = APInt::getNullValue(width); - - CmpInst::Predicate Pred = ICI->getPredicate(); - ConstantInt *CI1 = dyn_cast<ConstantInt>(V_op1); - ConstantInt *CI2 = dyn_cast<ConstantInt>(V_op2); - switch (Pred) { - case CmpInst::ICMP_SGT: // signed greater than - createConstraintSigSig(SIG_op2_t, SIG_op1_t, CI2, CI1, MinusOne); - createConstraintSigSig(SIG_op1_f, SIG_op2_f, CI1, CI2, Zero); - break; - - case CmpInst::ICMP_SGE: // signed greater or equal - createConstraintSigSig(SIG_op2_t, SIG_op1_t, CI2, CI1, Zero); - createConstraintSigSig(SIG_op1_f, SIG_op2_f, CI1, CI2, MinusOne); - break; - - case CmpInst::ICMP_SLT: // signed less than - createConstraintSigSig(SIG_op1_t, SIG_op2_t, CI1, CI2, MinusOne); - createConstraintSigSig(SIG_op2_f, SIG_op1_f, CI2, CI1, Zero); - break; - - case CmpInst::ICMP_SLE: // signed less or equal - createConstraintSigSig(SIG_op1_t, SIG_op2_t, CI1, CI2, Zero); - createConstraintSigSig(SIG_op2_f, SIG_op1_f, CI2, CI1, MinusOne); - break; - - default: - break; - } - - if (I_op1) - createConstraintInstruction(I_op1); - if (I_op2) - createConstraintInstruction(I_op2); -} - -/// Creates constraints for PHI nodes. -/// In a PHI node a = phi(b,c) we can create the constraint -/// a<= max(b,c). With this constraint there will be the edges, -/// b->a and c->a with weight 0 in the lower bound graph, and the edges -/// a->b and a->c with weight 0 in the upper bound graph. -void ABCD::createConstraintPHINode(PHINode *PN) { - // FIXME: We really want to disallow sigma nodes, but I don't know the best - // way to detect the other than this. - if (PN->getNumOperands() == 2) return; - - int32_t width = cast<IntegerType>(PN->getType())->getBitWidth(); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *V = PN->getIncomingValue(i); - if (Instruction *I = dyn_cast<Instruction>(V)) { - createConstraintInstruction(I); - } - inequality_graph.addEdge(V, PN, APInt(width, 0), true); - inequality_graph.addEdge(V, PN, APInt(width, 0), false); - } -} - -/// This method creates a constraint between a Sigma and an Instruction. -/// These constraints are created as soon as we find a comparator that uses a -/// SSI variable. 
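-/// (For instance, if %a has sigmas %a_t and %a_f in the true and false
-/// successors, this adds zero-weight edges encoding a_t <= a and a_f <= a.)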
-void ABCD::createConstraintSigInst(Instruction *I_op, BasicBlock *BB_succ_t, - BasicBlock *BB_succ_f, PHINode **SIG_op_t, - PHINode **SIG_op_f) { - *SIG_op_t = findSigma(BB_succ_t, I_op); - *SIG_op_f = findSigma(BB_succ_f, I_op); - - if (*SIG_op_t) { - int32_t width = cast<IntegerType>((*SIG_op_t)->getType())->getBitWidth(); - inequality_graph.addEdge(I_op, *SIG_op_t, APInt(width, 0), true); - inequality_graph.addEdge(*SIG_op_t, I_op, APInt(width, 0), false); - } - if (*SIG_op_f) { - int32_t width = cast<IntegerType>((*SIG_op_f)->getType())->getBitWidth(); - inequality_graph.addEdge(I_op, *SIG_op_f, APInt(width, 0), true); - inequality_graph.addEdge(*SIG_op_f, I_op, APInt(width, 0), false); - } -} - -/// If PN_op1 and PN_o2 are different from NULL, create a constraint -/// PN_op2 -> PN_op1 with value. In case any of them is NULL, replace -/// with the respective V_op#, if V_op# is a ConstantInt. -void ABCD::createConstraintSigSig(PHINode *SIG_op1, PHINode *SIG_op2, - ConstantInt *V_op1, ConstantInt *V_op2, - APInt value) { - if (SIG_op1 && SIG_op2) { - inequality_graph.addEdge(SIG_op2, SIG_op1, value, true); - inequality_graph.addEdge(SIG_op1, SIG_op2, -value, false); - } else if (SIG_op1 && V_op2) { - inequality_graph.addEdge(V_op2, SIG_op1, value, true); - inequality_graph.addEdge(SIG_op1, V_op2, -value, false); - } else if (SIG_op2 && V_op1) { - inequality_graph.addEdge(SIG_op2, V_op1, value, true); - inequality_graph.addEdge(V_op1, SIG_op2, -value, false); - } -} - -/// Returns the sigma representing the Instruction I in BasicBlock BB. -/// Returns NULL in case there is no sigma for this Instruction in this -/// Basic Block. This methods assume that sigmas are the first instructions -/// in a block, and that there can be only two sigmas in a block. So it will -/// only look on the first two instructions of BasicBlock BB. -PHINode *ABCD::findSigma(BasicBlock *BB, Instruction *I) { - // BB has more than one predecessor, BB cannot have sigmas. - if (I == NULL || BB->getSinglePredecessor() == NULL) - return NULL; - - BasicBlock::iterator begin = BB->begin(); - BasicBlock::iterator end = BB->end(); - - for (unsigned i = 0; i < 2 && begin != end; ++i, ++begin) { - Instruction *I_succ = begin; - if (PHINode *PN = dyn_cast<PHINode>(I_succ)) - if (PN->getIncomingValue(0) == I) - return PN; - } - - return NULL; -} - -/// Original ABCD algorithm to prove redundant checks. -/// This implementation works on any kind of inequality branch. 
-bool ABCD::demandProve(Value *a, Value *b, int c, bool upper_bound) { - int32_t width = cast<IntegerType>(a->getType())->getBitWidth(); - Bound bound(APInt(width, c), upper_bound); - - mem_result.clear(); - active.clear(); - - ProveResult res = prove(a, b, bound, 0); - return res != False; -} - -/// Prove that distance between b and a is <= bound -ABCD::ProveResult ABCD::prove(Value *a, Value *b, const Bound &bound, - unsigned level) { - // if (C[b-a<=e] == True for some e <= bound - // Same or stronger difference was already proven - if (mem_result.hasTrue(b, bound)) - return True; - - // if (C[b-a<=e] == False for some e >= bound - // Same or weaker difference was already disproved - if (mem_result.hasFalse(b, bound)) - return False; - - // if (C[b-a<=e] == Reduced for some e <= bound - // b is on a cycle that was reduced for same or stronger difference - if (mem_result.hasReduced(b, bound)) - return Reduced; - - // traversal reached the source vertex - if (a == b && Bound::geq(bound, APInt(bound.getBitWidth(), 0, true))) - return True; - - // if b has no predecessor then fail - if (!inequality_graph.hasEdge(b, bound.isUpperBound())) - return False; - - // a cycle was encountered - if (active.count(b)) { - if (Bound::leq(*active.lookup(b), bound)) - return Reduced; // a "harmless" cycle - - return False; // an amplifying cycle - } - - active[b] = &bound; - PHINode *PN = dyn_cast<PHINode>(b); - - // Test if a Value is a Phi. If it is a PHINode with more than 1 incoming - // value, then it is a phi, if it has 1 incoming value it is a sigma. - if (PN && PN->getNumIncomingValues() > 1) - updateMemDistance(a, b, bound, level, min); - else - updateMemDistance(a, b, bound, level, max); - - active.erase(b); - - ABCD::ProveResult res = mem_result.getBoundResult(b, bound); - return res; -} - -/// Updates the distance value for a and b -void ABCD::updateMemDistance(Value *a, Value *b, const Bound &bound, - unsigned level, meet_function meet) { - ABCD::ProveResult res = (meet == max) ? 
False : True; - - SmallVector<Edge, 16> Edges = inequality_graph.getEdges(b); - SmallVector<Edge, 16>::iterator begin = Edges.begin(), end = Edges.end(); - - for (; begin != end ; ++begin) { - if (((res >= Reduced) && (meet == max)) || - ((res == False) && (meet == min))) { - break; - } - const Edge &in = *begin; - if (in.isUpperBound() == bound.isUpperBound()) { - Value *succ = in.getVertex(); - res = meet(res, prove(a, succ, Bound(bound, in.getValue()), - level+1)); - } - } - - mem_result.updateBound(b, bound, res); -} - -/// Return the stored result for this bound -ABCD::ProveResult ABCD::MemoizedResultChart::getResult(const Bound &bound)const{ - if (max_false && Bound::leq(bound, *max_false)) - return False; - if (min_true && Bound::leq(*min_true, bound)) - return True; - if (min_reduced && Bound::leq(*min_reduced, bound)) - return Reduced; - return False; -} - -/// Stores a false found -void ABCD::MemoizedResultChart::addFalse(const Bound &bound) { - if (!max_false || Bound::leq(*max_false, bound)) - max_false.reset(new Bound(bound)); - - if (Bound::eq(max_false.get(), min_reduced.get())) - min_reduced.reset(new Bound(Bound::createIncrement(*min_reduced))); - if (Bound::eq(max_false.get(), min_true.get())) - min_true.reset(new Bound(Bound::createIncrement(*min_true))); - if (Bound::eq(min_reduced.get(), min_true.get())) - min_reduced.reset(); - clearRedundantReduced(); -} - -/// Stores a true found -void ABCD::MemoizedResultChart::addTrue(const Bound &bound) { - if (!min_true || Bound::leq(bound, *min_true)) - min_true.reset(new Bound(bound)); - - if (Bound::eq(min_true.get(), min_reduced.get())) - min_reduced.reset(new Bound(Bound::createDecrement(*min_reduced))); - if (Bound::eq(min_true.get(), max_false.get())) - max_false.reset(new Bound(Bound::createDecrement(*max_false))); - if (Bound::eq(max_false.get(), min_reduced.get())) - min_reduced.reset(); - clearRedundantReduced(); -} - -/// Stores a Reduced found -void ABCD::MemoizedResultChart::addReduced(const Bound &bound) { - if (!min_reduced || Bound::leq(bound, *min_reduced)) - min_reduced.reset(new Bound(bound)); - - if (Bound::eq(min_reduced.get(), min_true.get())) - min_true.reset(new Bound(Bound::createIncrement(*min_true))); - if (Bound::eq(min_reduced.get(), max_false.get())) - max_false.reset(new Bound(Bound::createDecrement(*max_false))); -} - -/// Clears redundant reduced -/// If a min_true is smaller than a min_reduced then the min_reduced -/// is unnecessary and then removed. It also works for min_reduced -/// begin smaller than max_false. 
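// The prover above folds child results through a three-valued lattice with
// False < Reduced < True: phi nodes take the min over their incoming proofs,
// sigma nodes the max, and each fold starts from the meet's identity element
// ("res = (meet == max) ? False : True"). A standalone sketch of that
// lattice, reconstructed from the code above rather than quoted from it:
//
//   enum ProveResult { False = 0, Reduced = 1, True = 2 };
//   typedef ProveResult (*meet_function)(ProveResult, ProveResult);
//   static ProveResult max(ProveResult a, ProveResult b) { return a > b ? a : b; }
//   static ProveResult min(ProveResult a, ProveResult b) { return a < b ? a : b; }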
-void ABCD::MemoizedResultChart::clearRedundantReduced() { - if (min_true && min_reduced && Bound::lt(*min_true, *min_reduced)) - min_reduced.reset(); - if (max_false && min_reduced && Bound::lt(*min_reduced, *max_false)) - min_reduced.reset(); -} - -/// Stores the bound found -void ABCD::MemoizedResult::updateBound(Value *b, const Bound &bound, - const ProveResult res) { - if (res == False) { - map[b].addFalse(bound); - } else if (res == True) { - map[b].addTrue(bound); - } else { - map[b].addReduced(bound); - } -} - -/// Adds an edge from V_from to V_to with weight value -void ABCD::InequalityGraph::addEdge(Value *V_to, Value *V_from, - APInt value, bool upper) { - assert(V_from->getType() == V_to->getType()); - assert(cast<IntegerType>(V_from->getType())->getBitWidth() == - value.getBitWidth()); - - graph[V_from].push_back(Edge(V_to, value, upper)); -} - -/// Test if there is any edge from V in the upper direction -bool ABCD::InequalityGraph::hasEdge(Value *V, bool upper) const { - SmallVector<Edge, 16> it = graph.lookup(V); - - SmallVector<Edge, 16>::iterator begin = it.begin(); - SmallVector<Edge, 16>::iterator end = it.end(); - for (; begin != end; ++begin) { - if (begin->isUpperBound() == upper) { - return true; - } - } - return false; -} - -/// Prints the header of the dot file -void ABCD::InequalityGraph::printHeader(raw_ostream &OS, Function &F) const { - OS << "digraph dotgraph {\n"; - OS << "label=\"Inequality Graph for \'"; - OS << F.getNameStr() << "\' function\";\n"; - OS << "node [shape=record,fontname=\"Times-Roman\",fontsize=14];\n"; -} - -/// Prints the body of the dot file -void ABCD::InequalityGraph::printBody(raw_ostream &OS) const { - DenseMap<Value *, SmallVector<Edge, 16> >::const_iterator begin = - graph.begin(), end = graph.end(); - - for (; begin != end ; ++begin) { - SmallVector<Edge, 16>::const_iterator begin_par = - begin->second.begin(), end_par = begin->second.end(); - Value *source = begin->first; - - printVertex(OS, source); - - for (; begin_par != end_par ; ++begin_par) { - const Edge &edge = *begin_par; - printEdge(OS, source, edge); - } - } -} - -/// Prints vertex source to the dot file -/// -void ABCD::InequalityGraph::printVertex(raw_ostream &OS, Value *source) const { - OS << "\""; - printName(OS, source); - OS << "\""; - OS << " [label=\"{"; - printName(OS, source); - OS << "}\"];\n"; -} - -/// Prints the edge to the dot file -void ABCD::InequalityGraph::printEdge(raw_ostream &OS, Value *source, - const Edge &edge) const { - Value *dest = edge.getVertex(); - APInt value = edge.getValue(); - bool upper = edge.isUpperBound(); - - OS << "\""; - printName(OS, source); - OS << "\""; - OS << " -> "; - OS << "\""; - printName(OS, dest); - OS << "\""; - OS << " [label=\"" << value << "\""; - if (upper) { - OS << "color=\"blue\""; - } else { - OS << "color=\"red\""; - } - OS << "];\n"; -} - -void ABCD::InequalityGraph::printName(raw_ostream &OS, Value *info) const { - if (ConstantInt *CI = dyn_cast<ConstantInt>(info)) { - OS << *CI; - } else { - if (!info->hasName()) { - info->setName("V"); - } - OS << info->getNameStr(); - } -} - -/// createABCDPass - The public interface to this file... 
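// For reference, the printHeader/printVertex/printEdge routines above emit
// GraphViz text along these lines for a vertex "a" with an upper-bound edge
// of weight 0 to "b" (an assumed example, not output captured from a run;
// note that label and color are written back to back, with no separator,
// exactly as printEdge streams them):
//
//   digraph dotgraph {
//   label="Inequality Graph for 'foo' function";
//   node [shape=record,fontname="Times-Roman",fontsize=14];
//   "a" [label="{a}"];
//   "a" -> "b" [label="0"color="blue"];
//   }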
-FunctionPass *llvm::createABCDPass() { - return new ABCD(); -} diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index 2d19467ce7462..ada086e9db766 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -33,7 +33,7 @@ STATISTIC(NumRemoved, "Number of instructions removed"); namespace { struct ADCE : public FunctionPass { static char ID; // Pass identification, replacement for typeid - ADCE() : FunctionPass(&ID) {} + ADCE() : FunctionPass(ID) {} virtual bool runOnFunction(Function& F); @@ -45,7 +45,7 @@ namespace { } char ADCE::ID = 0; -static RegisterPass<ADCE> X("adce", "Aggressive Dead Code Elimination"); +INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false); bool ADCE::runOnFunction(Function& F) { SmallPtrSet<Instruction*, 128> alive; diff --git a/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/lib/Transforms/Scalar/BasicBlockPlacement.cpp index 54533f50405f3..b144678c6a0ed 100644 --- a/lib/Transforms/Scalar/BasicBlockPlacement.cpp +++ b/lib/Transforms/Scalar/BasicBlockPlacement.cpp @@ -41,7 +41,7 @@ STATISTIC(NumMoved, "Number of basic blocks moved"); namespace { struct BlockPlacement : public FunctionPass { static char ID; // Pass identification, replacement for typeid - BlockPlacement() : FunctionPass(&ID) {} + BlockPlacement() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F); @@ -74,8 +74,8 @@ namespace { } char BlockPlacement::ID = 0; -static RegisterPass<BlockPlacement> -X("block-placement", "Profile Guided Basic Block Placement"); +INITIALIZE_PASS(BlockPlacement, "block-placement", + "Profile Guided Basic Block Placement", false, false); FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); } diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 1a3b10cc9baaf..b7598eace536b 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -1,9 +1,9 @@ add_llvm_library(LLVMScalarOpts - ABCD.cpp ADCE.cpp BasicBlockPlacement.cpp CodeGenPrepare.cpp ConstantProp.cpp + CorrelatedValuePropagation.cpp DCE.cpp DeadStoreElimination.cpp GEPSplitter.cpp @@ -17,6 +17,7 @@ add_llvm_library(LLVMScalarOpts LoopStrengthReduce.cpp LoopUnrollPass.cpp LoopUnswitch.cpp + LowerAtomic.cpp MemCpyOptimizer.cpp Reassociate.cpp Reg2Mem.cpp diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 272066c8c0c4b..e07b761e589c1 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/Assembly/Writer.h" #include "llvm/Support/CallSite.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/PatternMatch.h" @@ -41,6 +42,11 @@ using namespace llvm; using namespace llvm::PatternMatch; +static cl::opt<bool> +CriticalEdgeSplit("cgp-critical-edge-splitting", + cl::desc("Split critical edges during codegen prepare"), + cl::init(true), cl::Hidden); + namespace { class CodeGenPrepare : public FunctionPass { /// TLI - Keep a pointer of a TargetLowering to consult for determining @@ -54,7 +60,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit CodeGenPrepare(const TargetLowering *tli = 0) - : FunctionPass(&ID), TLI(tli) {} + : FunctionPass(ID), TLI(tli) {} bool runOnFunction(Function &F); virtual void getAnalysisUsage(AnalysisUsage &AU) const { @@ 
-82,8 +88,8 @@ namespace { } char CodeGenPrepare::ID = 0; -static RegisterPass<CodeGenPrepare> X("codegenprepare", - "Optimize for code generation"); +INITIALIZE_PASS(CodeGenPrepare, "codegenprepare", + "Optimize for code generation", false, false); FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) { return new CodeGenPrepare(TLI); @@ -427,9 +433,9 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){ // If these values will be promoted, find out what they will be promoted // to. This helps us consider truncates on PPC as noop copies when they // are. - if (TLI.getTypeAction(CI->getContext(), SrcVT) == TargetLowering::Promote) + if (TLI.getTypeAction(SrcVT) == TargetLowering::Promote) SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT); - if (TLI.getTypeAction(CI->getContext(), DstVT) == TargetLowering::Promote) + if (TLI.getTypeAction(DstVT) == TargetLowering::Promote) DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT); // If, after promotion, these are the same types, this is a noop copy. @@ -548,9 +554,9 @@ protected: CI->eraseFromParent(); } bool isFoldable(unsigned SizeCIOp, unsigned, bool) const { - if (ConstantInt *SizeCI = dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp - - CallInst::ArgOffset))) - return SizeCI->isAllOnesValue(); + if (ConstantInt *SizeCI = + dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) + return SizeCI->isAllOnesValue(); return false; } }; @@ -891,12 +897,14 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { bool MadeChange = false; // Split all critical edges where the dest block has a PHI. - TerminatorInst *BBTI = BB.getTerminator(); - if (BBTI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(BBTI)) { - for (unsigned i = 0, e = BBTI->getNumSuccessors(); i != e; ++i) { - BasicBlock *SuccBB = BBTI->getSuccessor(i); - if (isa<PHINode>(SuccBB->begin()) && isCriticalEdge(BBTI, i, true)) - SplitEdgeNicely(BBTI, i, BackEdges, this); + if (CriticalEdgeSplit) { + TerminatorInst *BBTI = BB.getTerminator(); + if (BBTI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(BBTI)) { + for (unsigned i = 0, e = BBTI->getNumSuccessors(); i != e; ++i) { + BasicBlock *SuccBB = BBTI->getSuccessor(i); + if (isa<PHINode>(SuccBB->begin()) && isCriticalEdge(BBTI, i, true)) + SplitEdgeNicely(BBTI, i, BackEdges, this); + } } } diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index ea208135739d5..a0ea369d0cadd 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -34,7 +34,7 @@ STATISTIC(NumInstKilled, "Number of instructions killed"); namespace { struct ConstantPropagation : public FunctionPass { static char ID; // Pass identification, replacement for typeid - ConstantPropagation() : FunctionPass(&ID) {} + ConstantPropagation() : FunctionPass(ID) {} bool runOnFunction(Function &F); @@ -45,8 +45,8 @@ namespace { } char ConstantPropagation::ID = 0; -static RegisterPass<ConstantPropagation> -X("constprop", "Simple constant propagation"); +INITIALIZE_PASS(ConstantPropagation, "constprop", + "Simple constant propagation", false, false); FunctionPass *llvm::createConstantPropagationPass() { return new ConstantPropagation(); diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp new file mode 100644 index 0000000000000..0d4e45de34664 --- /dev/null +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -0,0 +1,200 @@ +//===- CorrelatedValuePropagation.cpp - 
Propagate CFG-derived info --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Correlated Value Propagation pass. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "correlated-value-propagation" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumPhis, "Number of phis propagated"); +STATISTIC(NumSelects, "Number of selects propagated"); +STATISTIC(NumMemAccess, "Number of memory access targets propagated"); +STATISTIC(NumCmps, "Number of comparisons propagated"); + +namespace { + class CorrelatedValuePropagation : public FunctionPass { + LazyValueInfo *LVI; + + bool processSelect(SelectInst *SI); + bool processPHI(PHINode *P); + bool processMemAccess(Instruction *I); + bool processCmp(CmpInst *C); + + public: + static char ID; + CorrelatedValuePropagation(): FunctionPass(ID) { } + + bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LazyValueInfo>(); + } + }; +} + +char CorrelatedValuePropagation::ID = 0; +INITIALIZE_PASS(CorrelatedValuePropagation, "correlated-propagation", + "Value Propagation", false, false); + +// Public interface to the Value Propagation pass +Pass *llvm::createCorrelatedValuePropagationPass() { + return new CorrelatedValuePropagation(); +} + +bool CorrelatedValuePropagation::processSelect(SelectInst *S) { + if (S->getType()->isVectorTy()) return false; + if (isa<Constant>(S->getOperand(0))) return false; + + Constant *C = LVI->getConstant(S->getOperand(0), S->getParent()); + if (!C) return false; + + ConstantInt *CI = dyn_cast<ConstantInt>(C); + if (!CI) return false; + + S->replaceAllUsesWith(S->getOperand(CI->isOne() ? 1 : 2)); + S->eraseFromParent(); + + ++NumSelects; + + return true; +} + +bool CorrelatedValuePropagation::processPHI(PHINode *P) { + bool Changed = false; + + BasicBlock *BB = P->getParent(); + for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) { + Value *Incoming = P->getIncomingValue(i); + if (isa<Constant>(Incoming)) continue; + + Constant *C = LVI->getConstantOnEdge(P->getIncomingValue(i), + P->getIncomingBlock(i), + BB); + if (!C) continue; + + P->setIncomingValue(i, C); + Changed = true; + } + + if (Value *ConstVal = P->hasConstantValue()) { + P->replaceAllUsesWith(ConstVal); + P->eraseFromParent(); + Changed = true; + } + + ++NumPhis; + + return Changed; +} + +bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { + Value *Pointer = 0; + if (LoadInst *L = dyn_cast<LoadInst>(I)) + Pointer = L->getPointerOperand(); + else + Pointer = cast<StoreInst>(I)->getPointerOperand(); + + if (isa<Constant>(Pointer)) return false; + + Constant *C = LVI->getConstant(Pointer, I->getParent()); + if (!C) return false; + + ++NumMemAccess; + I->replaceUsesOfWith(Pointer, C); + return true; +} + +/// processCmp - If the value of this comparison could be determined locally, +/// constant propagation would already have figured it out. 
Instead, walk +/// the predecessors and statically evaluate the comparison based on information +/// available on that edge. If a given static evaluation is true on ALL +/// incoming edges, then it's true universally and we can simplify the compare. +bool CorrelatedValuePropagation::processCmp(CmpInst *C) { + Value *Op0 = C->getOperand(0); + if (isa<Instruction>(Op0) && + cast<Instruction>(Op0)->getParent() == C->getParent()) + return false; + + Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); + if (!Op1) return false; + + pred_iterator PI = pred_begin(C->getParent()), PE = pred_end(C->getParent()); + if (PI == PE) return false; + + LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(), + C->getOperand(0), Op1, *PI, C->getParent()); + if (Result == LazyValueInfo::Unknown) return false; + + ++PI; + while (PI != PE) { + LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(), + C->getOperand(0), Op1, *PI, C->getParent()); + if (Res != Result) return false; + ++PI; + } + + ++NumCmps; + + if (Result == LazyValueInfo::True) + C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext())); + else + C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext())); + + C->eraseFromParent(); + + return true; +} + +bool CorrelatedValuePropagation::runOnFunction(Function &F) { + LVI = &getAnalysis<LazyValueInfo>(); + + bool FnChanged = false; + + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { + bool BBChanged = false; + for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) { + Instruction *II = BI++; + switch (II->getOpcode()) { + case Instruction::Select: + BBChanged |= processSelect(cast<SelectInst>(II)); + break; + case Instruction::PHI: + BBChanged |= processPHI(cast<PHINode>(II)); + break; + case Instruction::ICmp: + case Instruction::FCmp: + BBChanged |= processCmp(cast<CmpInst>(II)); + break; + case Instruction::Load: + case Instruction::Store: + BBChanged |= processMemAccess(II); + break; + } + } + + // Propagating correlated values might leave cruft around. + // Try to clean it up before we continue. 
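// The pass's power comes from two LazyValueInfo queries: getConstant(V, BB)
// for facts that hold throughout a block, and getConstantOnEdge(V, Pred, BB)
// for facts implied by taking a particular CFG edge. A hypothetical
// illustration of the edge form used by processPHI and processCmp above:
//
//   // given:  br i1 %cmp, label %then, label %else   with %cmp = (x == 7)
//   Constant *C = LVI->getConstantOnEdge(X, PredBB, ThenBB);
//   // C is the ConstantInt 7 along the %then edge, and null on edges where
//   // nothing is known; null means "undetermined", not "undef".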
+ if (BBChanged) + SimplifyInstructionsInBlock(FI); + + FnChanged |= BBChanged; + } + + return FnChanged; +} diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index 39940c35da5d5..87ea8038356ae 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -35,7 +35,7 @@ namespace { // struct DeadInstElimination : public BasicBlockPass { static char ID; // Pass identification, replacement for typeid - DeadInstElimination() : BasicBlockPass(&ID) {} + DeadInstElimination() : BasicBlockPass(ID) {} virtual bool runOnBasicBlock(BasicBlock &BB) { bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { @@ -56,8 +56,8 @@ namespace { } char DeadInstElimination::ID = 0; -static RegisterPass<DeadInstElimination> -X("die", "Dead Instruction Elimination"); +INITIALIZE_PASS(DeadInstElimination, "die", + "Dead Instruction Elimination", false, false); Pass *llvm::createDeadInstEliminationPass() { return new DeadInstElimination(); @@ -70,7 +70,7 @@ namespace { // struct DCE : public FunctionPass { static char ID; // Pass identification, replacement for typeid - DCE() : FunctionPass(&ID) {} + DCE() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F); @@ -81,7 +81,7 @@ namespace { } char DCE::ID = 0; -static RegisterPass<DCE> Y("dce", "Dead Code Elimination"); +INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false); bool DCE::runOnFunction(Function &F) { // Start out with all of the instructions in the worklist... diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index e047e4ffa151c..c8fd9d9fa5561 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -40,7 +40,7 @@ namespace { TargetData *TD; static char ID; // Pass identification, replacement for typeid - DSE() : FunctionPass(&ID) {} + DSE() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F) { bool Changed = false; @@ -82,7 +82,7 @@ namespace { } char DSE::ID = 0; -static RegisterPass<DSE> X("dse", "Dead Store Elimination"); +INITIALIZE_PASS(DSE, "dse", "Dead Store Elimination", false, false); FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } @@ -401,10 +401,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } continue; - } else if (CallSite::get(BBI).getInstruction() != 0) { + } else if (CallSite CS = cast<Value>(BBI)) { // If this call does not access memory, it can't // be undeadifying any of our pointers. 
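// A pattern that repeats across every file in this section: pass IDs are now
// handed to the FunctionPass constructor by value rather than by address, and
// the static RegisterPass<> objects become INITIALIZE_PASS macros. A minimal
// pass written against the new convention (a hypothetical example, not a pass
// from this commit):

namespace {
  struct ExamplePass : public FunctionPass {
    static char ID;  // Pass identification, replacement for typeid
    ExamplePass() : FunctionPass(ID) {}
    virtual bool runOnFunction(Function &F) { return false; }
  };
}

char ExamplePass::ID = 0;
INITIALIZE_PASS(ExamplePass, "example-pass", "Example Pass", false, false);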
- CallSite CS = CallSite::get(BBI); if (AA.doesNotAccessMemory(CS)) continue; diff --git a/lib/Transforms/Scalar/GEPSplitter.cpp b/lib/Transforms/Scalar/GEPSplitter.cpp index 610a41dae44b1..53dd06d24bb5b 100644 --- a/lib/Transforms/Scalar/GEPSplitter.cpp +++ b/lib/Transforms/Scalar/GEPSplitter.cpp @@ -27,13 +27,13 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const; public: static char ID; // Pass identification, replacement for typeid - explicit GEPSplitter() : FunctionPass(&ID) {} + explicit GEPSplitter() : FunctionPass(ID) {} }; } char GEPSplitter::ID = 0; -static RegisterPass<GEPSplitter> X("split-geps", - "split complex GEPs into simple GEPs"); +INITIALIZE_PASS(GEPSplitter, "split-geps", + "split complex GEPs into simple GEPs", false, false); FunctionPass *llvm::createGEPSplitterPass() { return new GEPSplitter(); diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 88b67768fa5df..c62ce1f27f647 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -165,7 +165,6 @@ namespace { Expression create_expression(CastInst* C); Expression create_expression(GetElementPtrInst* G); Expression create_expression(CallInst* C); - Expression create_expression(Constant* C); Expression create_expression(ExtractValueInst* C); Expression create_expression(InsertValueInst* C); @@ -665,7 +664,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit GVN(bool noloads = false) - : FunctionPass(&ID), NoLoads(noloads), MD(0) { } + : FunctionPass(ID), NoLoads(noloads), MD(0) { } private: bool NoLoads; @@ -716,8 +715,7 @@ FunctionPass *llvm::createGVNPass(bool NoLoads) { return new GVN(NoLoads); } -static RegisterPass<GVN> X("gvn", - "Global Value Numbering"); +INITIALIZE_PASS(GVN, "gvn", "Global Value Numbering", false, false); void GVN::dump(DenseMap<uint32_t, Value*>& d) { errs() << "{\n"; @@ -735,7 +733,7 @@ static bool isSafeReplacement(PHINode* p, Instruction *inst) { for (Instruction::use_iterator UI = p->use_begin(), E = p->use_end(); UI != E; ++UI) - if (PHINode* use_phi = dyn_cast<PHINode>(UI)) + if (PHINode* use_phi = dyn_cast<PHINode>(*UI)) if (use_phi->getParent() == inst->getParent()) return false; @@ -1312,7 +1310,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, // Otherwise, we have to construct SSA form. SmallVector<PHINode*, 8> NewPHIs; SSAUpdater SSAUpdate(&NewPHIs); - SSAUpdate.Initialize(LI); + SSAUpdate.Initialize(LI->getType(), LI->getName()); const Type *LoadTy = LI->getType(); @@ -2112,6 +2110,11 @@ bool GVN::performPRE(Function &F) { CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || isa<DbgInfoIntrinsic>(CurInst)) continue; + + // We don't currently value number ANY inline asm calls. 
+ if (CallInst *CallI = dyn_cast<CallInst>(CurInst)) + if (CallI->isInlineAsm()) + continue; uint32_t ValNo = VN.lookup(CurInst); diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index b5c9dd881df86..af2eafc47cbf3 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -77,7 +77,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - IndVarSimplify() : LoopPass(&ID) {} + IndVarSimplify() : LoopPass(ID) {} virtual bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -102,7 +102,7 @@ namespace { void RewriteNonIntegerIVs(Loop *L); ICmpInst *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, - Value *IndVar, + PHINode *IndVar, BasicBlock *ExitingBlock, BranchInst *BI, SCEVExpander &Rewriter); @@ -117,8 +117,8 @@ namespace { } char IndVarSimplify::ID = 0; -static RegisterPass<IndVarSimplify> -X("indvars", "Canonicalize Induction Variables"); +INITIALIZE_PASS(IndVarSimplify, "indvars", + "Canonicalize Induction Variables", false, false); Pass *llvm::createIndVarSimplifyPass() { return new IndVarSimplify(); @@ -131,7 +131,7 @@ Pass *llvm::createIndVarSimplifyPass() { /// is actually a much broader range than just linear tests. ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, - Value *IndVar, + PHINode *IndVar, BasicBlock *ExitingBlock, BranchInst *BI, SCEVExpander &Rewriter) { @@ -181,7 +181,7 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. - CmpIndVar = L->getCanonicalInductionVariableIncrement(); + CmpIndVar = IndVar->getIncomingValueForBlock(ExitingBlock); } else { // We have to use the preincremented value... RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount, @@ -534,7 +534,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Now that we know the largest of the induction variable expressions // in this loop, insert a canonical induction variable of the largest size. - Value *IndVar = 0; + PHINode *IndVar = 0; if (NeedCannIV) { // Check to see if the loop already has any canonical-looking induction // variables. If any are present and wider than the planned canonical @@ -862,9 +862,9 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Check Incr uses. One user is PN and the other user is an exit condition // used by the conditional terminator. Value::use_iterator IncrUse = Incr->use_begin(); - Instruction *U1 = cast<Instruction>(IncrUse++); + Instruction *U1 = cast<Instruction>(*IncrUse++); if (IncrUse == Incr->use_end()) return; - Instruction *U2 = cast<Instruction>(IncrUse++); + Instruction *U2 = cast<Instruction>(*IncrUse++); if (IncrUse != Incr->use_end()) return; // Find exit condition, which is an fcmp. 
If it doesn't exist, or if it isn't diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index edce14cd92eaf..104d5aecbdd32 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -24,6 +24,7 @@ #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Target/TargetData.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -45,7 +46,10 @@ Threshold("jump-threading-threshold", // Turn on use of LazyValueInfo. static cl::opt<bool> -EnableLVI("enable-jump-threading-lvi", cl::ReallyHidden); +EnableLVI("enable-jump-threading-lvi", + cl::desc("Use LVI for jump threading"), + cl::init(true), + cl::ReallyHidden); @@ -74,15 +78,32 @@ namespace { #else SmallSet<AssertingVH<BasicBlock>, 16> LoopHeaders; #endif + DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet; + + // RAII helper for updating the recursion stack. + struct RecursionSetRemover { + DenseSet<std::pair<Value*, BasicBlock*> > &TheSet; + std::pair<Value*, BasicBlock*> ThePair; + + RecursionSetRemover(DenseSet<std::pair<Value*, BasicBlock*> > &S, + std::pair<Value*, BasicBlock*> P) + : TheSet(S), ThePair(P) { } + + ~RecursionSetRemover() { + TheSet.erase(ThePair); + } + }; public: static char ID; // Pass identification - JumpThreading() : FunctionPass(&ID) {} + JumpThreading() : FunctionPass(ID) {} bool runOnFunction(Function &F); virtual void getAnalysisUsage(AnalysisUsage &AU) const { - if (EnableLVI) + if (EnableLVI) { AU.addRequired<LazyValueInfo>(); + AU.addPreserved<LazyValueInfo>(); + } } void FindLoopHeaders(Function &F); @@ -111,8 +132,8 @@ namespace { } char JumpThreading::ID = 0; -static RegisterPass<JumpThreading> -X("jump-threading", "Jump Threading"); +INITIALIZE_PASS(JumpThreading, "jump-threading", + "Jump Threading", false, false); // Public interface to the Jump Threading pass FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); } @@ -144,6 +165,7 @@ bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName() << "' with terminator: " << *BB->getTerminator() << '\n'); LoopHeaders.erase(BB); + if (LVI) LVI->eraseBlock(BB); DeleteDeadBlock(BB); Changed = true; } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { @@ -164,6 +186,11 @@ bool JumpThreading::runOnFunction(Function &F) { bool ErasedFromLoopHeaders = LoopHeaders.erase(BB); BasicBlock *Succ = BI->getSuccessor(0); + // FIXME: It is always conservatively correct to drop the info + // for a block even if it doesn't get erased. This isn't totally + // awesome, but it allows us to use AssertingVH to prevent nasty + // dangling pointer issues within LazyValueInfo. + if (LVI) LVI->eraseBlock(BB); if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) { Changed = true; // If we deleted BB and BB was the header of a loop, then the @@ -251,6 +278,17 @@ void JumpThreading::FindLoopHeaders(Function &F) { LoopHeaders.insert(const_cast<BasicBlock*>(Edges[i].second)); } +// Helper method for ComputeValueKnownInPredecessors. If Value is a +// ConstantInt, push it. If it's an undef, push 0. Otherwise, do nothing. 
+static void PushConstantIntOrUndef(SmallVectorImpl<std::pair<ConstantInt*,
+                                                  BasicBlock*> > &Result,
+                                   Constant *Value, BasicBlock* BB){
+  if (ConstantInt *FoldedCInt = dyn_cast<ConstantInt>(Value))
+    Result.push_back(std::make_pair(FoldedCInt, BB));
+  else if (isa<UndefValue>(Value))
+    Result.push_back(std::make_pair((ConstantInt*)0, BB));
+}
+
 /// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see
 /// if we can infer that the value is a known ConstantInt in any of our
 /// predecessors. If so, return the known list of value and pred BB in the
 ///
 bool JumpThreading::
 ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
+  // This method walks up use-def chains recursively. Because of this, we could
+  // get into an infinite loop going around loops in the use-def chain. To
+  // prevent this, keep track of what (value, block) pairs we've already visited
+  // and terminate the search if we loop back to them.
+  if (!RecursionSet.insert(std::make_pair(V, BB)).second)
+    return false;
+
+  // An RAII helper to remove this pair from the recursion set once the
+  // recursion stack pops back out again.
+  RecursionSetRemover remover(RecursionSet, std::make_pair(V, BB));
+
   // If V is a constantint, then it is known in all predecessors.
   if (isa<ConstantInt>(V) || isa<UndefValue>(V)) {
     ConstantInt *CI = dyn_cast<ConstantInt>(V);
     for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
       Result.push_back(std::make_pair(CI, *PI));
+
     return true;
   }
@@ -313,8 +363,15 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
       if (isa<ConstantInt>(InVal) || isa<UndefValue>(InVal)) {
         ConstantInt *CI = dyn_cast<ConstantInt>(InVal);
         Result.push_back(std::make_pair(CI, PN->getIncomingBlock(i)));
+      } else if (LVI) {
+        Constant *CI = LVI->getConstantOnEdge(InVal,
+                                              PN->getIncomingBlock(i), BB);
+        // LVI returns null if no value could be determined.
+        if (!CI) continue;
+        PushConstantIntOrUndef(Result, CI, PN->getIncomingBlock(i));
       }
     }
+
     return !Result.empty();
   }
@@ -338,29 +395,26 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
     else
       InterestingVal = ConstantInt::getFalse(I->getContext());
+
+    SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;
+
     // Scan for the sentinel. If we find an undef, force it to the
     // interesting value: x|undef -> true and x&undef -> false.
     for (unsigned i = 0, e = LHSVals.size(); i != e; ++i)
       if (LHSVals[i].first == InterestingVal || LHSVals[i].first == 0) {
         Result.push_back(LHSVals[i]);
         Result.back().first = InterestingVal;
+        LHSKnownBBs.insert(LHSVals[i].second);
       }
     for (unsigned i = 0, e = RHSVals.size(); i != e; ++i)
       if (RHSVals[i].first == InterestingVal || RHSVals[i].first == 0) {
         // If we already inferred a value for this block on the LHS, don't
         // re-add it.
-        bool HasValue = false;
-        for (unsigned r = 0, e = Result.size(); r != e; ++r)
-          if (Result[r].second == RHSVals[i].second) {
-            HasValue = true;
-            break;
-          }
-
-        if (!HasValue) {
+        if (!LHSKnownBBs.count(RHSVals[i].second)) {
           Result.push_back(RHSVals[i]);
           Result.back().first = InterestingVal;
         }
       }
+
     return !Result.empty();
   }
@@ -377,8 +431,27 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
       if (Result[i].first)
         Result[i].first =
           cast<ConstantInt>(ConstantExpr::getNot(Result[i].first));
+
       return true;
     }
+
+    // Try to simplify some other binary operator values.
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { + if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) { + SmallVector<std::pair<ConstantInt*, BasicBlock*>, 8> LHSVals; + ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals); + + // Try to use constant folding to simplify the binary operator. + for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) { + Constant *V = LHSVals[i].first ? LHSVals[i].first : + cast<Constant>(UndefValue::get(BO->getType())); + Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI); + + PushConstantIntOrUndef(Result, Folded, LHSVals[i].second); + } + } + + return !Result.empty(); } // Handle compare with phi operand, where the PHI is defined in this block. @@ -405,10 +478,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){ Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT); } - if (isa<UndefValue>(Res)) - Result.push_back(std::make_pair((ConstantInt*)0, PredBB)); - else if (ConstantInt *CI = dyn_cast<ConstantInt>(Res)) - Result.push_back(std::make_pair(CI, PredBB)); + if (Constant *ConstRes = dyn_cast<Constant>(Res)) + PushConstantIntOrUndef(Result, ConstRes, PredBB); } return !Result.empty(); @@ -418,28 +489,59 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){ // If comparing a live-in value against a constant, see if we know the // live-in value on any predecessors. if (LVI && isa<Constant>(Cmp->getOperand(1)) && - Cmp->getType()->isIntegerTy() && // Not vector compare. - (!isa<Instruction>(Cmp->getOperand(0)) || - cast<Instruction>(Cmp->getOperand(0))->getParent() != BB)) { - Constant *RHSCst = cast<Constant>(Cmp->getOperand(1)); + Cmp->getType()->isIntegerTy()) { + if (!isa<Instruction>(Cmp->getOperand(0)) || + cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) { + Constant *RHSCst = cast<Constant>(Cmp->getOperand(1)); + + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB);PI != E; ++PI){ + BasicBlock *P = *PI; + // If the value is known by LazyValueInfo to be a constant in a + // predecessor, use that information to try to thread this block. + LazyValueInfo::Tristate Res = + LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0), + RHSCst, P, BB); + if (Res == LazyValueInfo::Unknown) + continue; - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { - BasicBlock *P = *PI; - // If the value is known by LazyValueInfo to be a constant in a - // predecessor, use that information to try to thread this block. - LazyValueInfo::Tristate - Res = LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0), - RHSCst, P, BB); - if (Res == LazyValueInfo::Unknown) - continue; + Constant *ResC = ConstantInt::get(Cmp->getType(), Res); + Result.push_back(std::make_pair(cast<ConstantInt>(ResC), P)); + } - Constant *ResC = ConstantInt::get(Cmp->getType(), Res); - Result.push_back(std::make_pair(cast<ConstantInt>(ResC), P)); + return !Result.empty(); } - - return !Result.empty(); + + // Try to find a constant value for the LHS of a comparison, + // and evaluate it statically if we can. + if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) { + SmallVector<std::pair<ConstantInt*, BasicBlock*>, 8> LHSVals; + ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals); + + for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) { + Constant *V = LHSVals[i].first ? 
LHSVals[i].first : + cast<Constant>(UndefValue::get(CmpConst->getType())); + Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(), + V, CmpConst); + PushConstantIntOrUndef(Result, Folded, LHSVals[i].second); + } + + return !Result.empty(); + } + } + } + + if (LVI) { + // If all else fails, see if LVI can figure out a constant value for us. + Constant *CI = LVI->getConstant(V, BB); + ConstantInt *CInt = dyn_cast_or_null<ConstantInt>(CI); + if (CInt) { + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + Result.push_back(std::make_pair(CInt, *PI)); } + + return !Result.empty(); } + return false; } @@ -490,6 +592,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // Remember if SinglePred was the entry block of the function. If so, we // will need to move BB back to the entry position. bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); + if (LVI) LVI->eraseBlock(SinglePred); MergeBasicBlockIntoOnlyPred(BB); if (isEntry && BB != &BB->getParent()->getEntryBlock()) @@ -603,6 +706,44 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { } } } + + // For a comparison where the LHS is outside this block, it's possible + // that we've branched on it before. Used LVI to see if we can simplify + // the branch based on that. + BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); + Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1)); + pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + if (LVI && CondBr && CondConst && CondBr->isConditional() && PI != PE && + (!isa<Instruction>(CondCmp->getOperand(0)) || + cast<Instruction>(CondCmp->getOperand(0))->getParent() != BB)) { + // For predecessor edge, determine if the comparison is true or false + // on that edge. If they're all true or all false, we can simplify the + // branch. + // FIXME: We could handle mixed true/false by duplicating code. + LazyValueInfo::Tristate Baseline = + LVI->getPredicateOnEdge(CondCmp->getPredicate(), CondCmp->getOperand(0), + CondConst, *PI, BB); + if (Baseline != LazyValueInfo::Unknown) { + // Check that all remaining incoming values match the first one. + while (++PI != PE) { + LazyValueInfo::Tristate Ret = LVI->getPredicateOnEdge( + CondCmp->getPredicate(), + CondCmp->getOperand(0), + CondConst, *PI, BB); + if (Ret != Baseline) break; + } + + // If we terminated early, then one of the values didn't match. + if (PI == PE) { + unsigned ToRemove = Baseline == LazyValueInfo::True ? 1 : 0; + unsigned ToKeep = Baseline == LazyValueInfo::True ? 0 : 1; + RemovePredecessorAndSimplify(CondBr->getSuccessor(ToRemove), BB, TD); + BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr); + CondBr->eraseFromParent(); + return true; + } + } + } } // Check for some cases that are worth simplifying. Right now we want to look @@ -1020,6 +1161,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB) { SmallVector<std::pair<ConstantInt*, BasicBlock*>, 8> PredValues; if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues)) return false; + assert(!PredValues.empty() && "ComputeValueKnownInPredecessors returned true with no values"); @@ -1314,6 +1456,9 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, << ", across block:\n " << *BB << "\n"); + if (LVI) + LVI->threadEdge(PredBB, BB, SuccBB); + // We are going to have to map operands from the original BB block to the new // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to // account for entry from PredBB. 
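// The block above folds a conditional branch when every predecessor edge
// yields the same known comparison result. The agreement test, condensed into
// a sketch (a hypothetical helper distilled from the code above, not part of
// the commit):

static LazyValueInfo::Tristate
getAgreedPredicateOnEdges(LazyValueInfo &LVI, CmpInst *Cmp, Constant *RHS,
                          BasicBlock *BB) {
  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
  if (PI == PE)
    return LazyValueInfo::Unknown;

  // Establish a baseline from the first predecessor edge.
  LazyValueInfo::Tristate Baseline =
    LVI.getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0), RHS,
                           *PI, BB);

  // Every remaining edge must agree, otherwise the branch is not foldable.
  for (++PI; PI != PE; ++PI)
    if (LVI.getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0), RHS,
                               *PI, BB) != Baseline)
      return LazyValueInfo::Unknown;

  return Baseline;
}

// A True or False result picks which successor to keep; Unknown means the
// branch is left alone.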
@@ -1383,7 +1528,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // We found a use of I outside of BB. Rename all uses of I that are outside // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks // with the two values we know. - SSAUpdate.Initialize(I); + SSAUpdate.Initialize(I->getType(), I->getName()); SSAUpdate.AddAvailableValue(BB, I); SSAUpdate.AddAvailableValue(NewBB, ValueMapping[I]); @@ -1538,7 +1683,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // We found a use of I outside of BB. Rename all uses of I that are outside // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks // with the two values we know. - SSAUpdate.Initialize(I); + SSAUpdate.Initialize(I->getType(), I->getName()); SSAUpdate.AddAvailableValue(BB, I); SSAUpdate.AddAvailableValue(PredBB, ValueMapping[I]); diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 73473952912ee..2ef85446bd9ba 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -26,8 +26,7 @@ // pointer. There are no calls in the loop which mod/ref the pointer. // If these conditions are true, we can promote the loads and stores in the // loop of the pointer to use a temporary alloca'd variable. We then use -// the mem2reg functionality to construct the appropriate SSA form for the -// variable. +// the SSAUpdater to construct the appropriate SSA form for the value. // //===----------------------------------------------------------------------===// @@ -37,14 +36,15 @@ #include "llvm/DerivedTypes.h" #include "llvm/IntrinsicInst.h" #include "llvm/Instructions.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" @@ -66,7 +66,7 @@ DisablePromotion("disable-licm-promotion", cl::Hidden, namespace { struct LICM : public LoopPass { static char ID; // Pass identification, replacement for typeid - LICM() : LoopPass(&ID) {} + LICM() : LoopPass(ID) {} virtual bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -75,39 +75,31 @@ namespace { /// virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequiredID(LoopSimplifyID); - AU.addRequired<LoopInfo>(); AU.addRequired<DominatorTree>(); - AU.addRequired<DominanceFrontier>(); // For scalar promotion (mem2reg) + AU.addRequired<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); AU.addRequired<AliasAnalysis>(); + AU.addPreserved<AliasAnalysis>(); AU.addPreserved<ScalarEvolution>(); - AU.addPreserved<DominanceFrontier>(); AU.addPreservedID(LoopSimplifyID); } bool doFinalization() { - // Free the values stored in the map - for (std::map<Loop *, AliasSetTracker *>::iterator - I = LoopToAliasMap.begin(), E = LoopToAliasMap.end(); I != E; ++I) - delete I->second; - - LoopToAliasMap.clear(); + assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets"); return false; } private: - // Various analyses that we use... 
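// Both JumpThreading (above) and the rewritten LICM (below) now rename
// cross-block values with SSAUpdater rather than demoting them through stack
// slots. The minimal usage pattern, sketched with assumed values V1/V2
// defined in blocks BB1/BB2 and used in UseBB:

SmallVector<PHINode*, 8> NewPHIs;
SSAUpdater SSA(&NewPHIs);                      // collects any PHIs it creates
SSA.Initialize(V1->getType(), V1->getName());  // the new two-argument form
SSA.AddAvailableValue(BB1, V1);
SSA.AddAvailableValue(BB2, V2);
Value *Live = SSA.GetValueInMiddleOfBlock(UseBB); // inserts PHIs on demand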
AliasAnalysis *AA; // Current AliasAnalysis information LoopInfo *LI; // Current LoopInfo - DominatorTree *DT; // Dominator Tree for the current Loop... - DominanceFrontier *DF; // Current Dominance Frontier + DominatorTree *DT; // Dominator Tree for the current Loop. - // State that is updated as we process loops + // State that is updated as we process loops. bool Changed; // Set to true when we change anything. BasicBlock *Preheader; // The preheader block of the current loop... Loop *CurLoop; // The current loop we are working on... AliasSetTracker *CurAST; // AliasSet information for the current loop... - std::map<Loop *, AliasSetTracker *> LoopToAliasMap; + DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap; /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L); @@ -204,25 +196,12 @@ namespace { bool isLoopInvariantInst(Instruction &I); bool isNotUsedInLoop(Instruction &I); - /// PromoteValuesInLoop - Look at the stores in the loop and promote as many - /// to scalars as we can. - /// - void PromoteValuesInLoop(); - - /// FindPromotableValuesInLoop - Check the current loop for stores to - /// definite pointers, which are not loaded and stored through may aliases. - /// If these are found, create an alloca for the value, add it to the - /// PromotedValues list, and keep track of the mapping from value to - /// alloca... - /// - void FindPromotableValuesInLoop( - std::vector<std::pair<AllocaInst*, Value*> > &PromotedValues, - std::map<Value*, AllocaInst*> &Val2AlMap); + void PromoteAliasSet(AliasSet &AS); }; } char LICM::ID = 0; -static RegisterPass<LICM> X("licm", "Loop Invariant Code Motion"); +INITIALIZE_PASS(LICM, "licm", "Loop Invariant Code Motion", false, false); Pass *llvm::createLICMPass() { return new LICM(); } @@ -236,19 +215,23 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Get our Loop and Alias Analysis information... LI = &getAnalysis<LoopInfo>(); AA = &getAnalysis<AliasAnalysis>(); - DF = &getAnalysis<DominanceFrontier>(); DT = &getAnalysis<DominatorTree>(); CurAST = new AliasSetTracker(*AA); - // Collect Alias info from subloops + // Collect Alias info from subloops. for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end(); LoopItr != LoopItrE; ++LoopItr) { Loop *InnerL = *LoopItr; - AliasSetTracker *InnerAST = LoopToAliasMap[InnerL]; - assert (InnerAST && "Where is my AST?"); + AliasSetTracker *InnerAST = LoopToAliasSetMap[InnerL]; + assert(InnerAST && "Where is my AST?"); // What if InnerLoop was modified by other passes ? CurAST->add(*InnerAST); + + // Once we've incorporated the inner loop's AST into ours, we don't need the + // subloop's anymore. + delete InnerAST; + LoopToAliasSetMap.erase(InnerL); } CurLoop = L; @@ -263,7 +246,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) { BasicBlock *BB = *I; - if (LI->getLoopFor(BB) == L) // Ignore blocks in subloops... + if (LI->getLoopFor(BB) == L) // Ignore blocks in subloops. CurAST->add(*BB); // Incorporate the specified basic block } @@ -283,15 +266,24 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { HoistRegion(DT->getNode(L->getHeader())); // Now that all loop invariants have been removed from the loop, promote any - // memory references to scalars that we can... - if (!DisablePromotion && Preheader && L->hasDedicatedExits()) - PromoteValuesInLoop(); - + // memory references to scalars that we can. 
+ if (!DisablePromotion && Preheader && L->hasDedicatedExits()) { + // Loop over all of the alias sets in the tracker object. + for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); + I != E; ++I) + PromoteAliasSet(*I); + } + // Clear out loops state information for the next iteration CurLoop = 0; Preheader = 0; - LoopToAliasMap[L] = CurAST; + // If this loop is nested inside of another one, save the alias information + // for when we process the outer loop. + if (L->getParentLoop()) + LoopToAliasSetMap[L] = CurAST; + else + delete CurAST; return Changed; } @@ -308,7 +300,7 @@ void LICM::SinkRegion(DomTreeNode *N) { // If this subregion is not in the top level loop at all, exit. if (!CurLoop->contains(BB)) return; - // We are processing blocks in reverse dfo, so process children first... + // We are processing blocks in reverse dfo, so process children first. const std::vector<DomTreeNode*> &Children = N->getChildren(); for (unsigned i = 0, e = Children.size(); i != e; ++i) SinkRegion(Children[i]); @@ -319,6 +311,17 @@ void LICM::SinkRegion(DomTreeNode *N) { for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) { Instruction &I = *--II; + + // If the instruction is dead, we would try to sink it because it isn't used + // in the loop, instead, just delete it. + if (isInstructionTriviallyDead(&I)) { + DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n'); + ++II; + CurAST->deleteValue(&I); + I.eraseFromParent(); + Changed = true; + continue; + } // Check to see if we can sink this instruction to the exit blocks // of the loop. We can do this if the all users of the instruction are @@ -350,6 +353,18 @@ void LICM::HoistRegion(DomTreeNode *N) { for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) { Instruction &I = *II++; + // Try constant folding this instruction. If all the operands are + // constants, it is technically hoistable, but it would be better to just + // fold it. + if (Constant *C = ConstantFoldInstruction(&I)) { + DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); + CurAST->copyValue(&I, C); + CurAST->deleteValue(&I); + I.replaceAllUsesWith(C); + I.eraseFromParent(); + continue; + } + // Try hoisting the instruction out to the preheader. We can only do this // if all of the operands of the instruction are loop invariant and if it // is safe to hoist the instruction. @@ -357,7 +372,7 @@ void LICM::HoistRegion(DomTreeNode *N) { if (isLoopInvariantInst(I) && canSinkOrHoistInst(I) && isSafeToExecuteUnconditionally(I)) hoist(I); - } + } const std::vector<DomTreeNode*> &Children = N->getChildren(); for (unsigned i = 0, e = Children.size(); i != e; ++i) @@ -457,10 +472,10 @@ bool LICM::isLoopInvariantInst(Instruction &I) { /// position, and may either delete it or move it to outside of the loop. /// void LICM::sink(Instruction &I) { - DEBUG(dbgs() << "LICM sinking instruction: " << I); + DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); SmallVector<BasicBlock*, 8> ExitBlocks; - CurLoop->getExitBlocks(ExitBlocks); + CurLoop->getUniqueExitBlocks(ExitBlocks); if (isa<LoadInst>(I)) ++NumMovedLoads; else if (isa<CallInst>(I)) ++NumMovedCalls; @@ -477,122 +492,101 @@ void LICM::sink(Instruction &I) { // If I has users in unreachable blocks, eliminate. // If I is not void type then replaceAllUsesWith undef. // This allows ValueHandlers and custom metadata to adjust itself. 
- if (!I.getType()->isVoidTy()) + if (!I.use_empty()) I.replaceAllUsesWith(UndefValue::get(I.getType())); I.eraseFromParent(); } else { // Move the instruction to the start of the exit block, after any PHI // nodes in it. - I.removeFromParent(); - BasicBlock::iterator InsertPt = ExitBlocks[0]->getFirstNonPHI(); - ExitBlocks[0]->getInstList().insert(InsertPt, &I); + I.moveBefore(ExitBlocks[0]->getFirstNonPHI()); + + // This instruction is no longer in the AST for the current loop, because + // we just sunk it out of the loop. If we just sunk it into an outer + // loop, we will rediscover the operation when we process it. + CurAST->deleteValue(&I); } - } else if (ExitBlocks.empty()) { + return; + } + + if (ExitBlocks.empty()) { // The instruction is actually dead if there ARE NO exit blocks. CurAST->deleteValue(&I); // If I has users in unreachable blocks, eliminate. // If I is not void type then replaceAllUsesWith undef. // This allows ValueHandlers and custom metadata to adjust itself. - if (!I.getType()->isVoidTy()) + if (!I.use_empty()) I.replaceAllUsesWith(UndefValue::get(I.getType())); I.eraseFromParent(); - } else { - // Otherwise, if we have multiple exits, use the PromoteMem2Reg function to - // do all of the hard work of inserting PHI nodes as necessary. We convert - // the value into a stack object to get it to do this. - - // Firstly, we create a stack object to hold the value... - AllocaInst *AI = 0; - - if (!I.getType()->isVoidTy()) { - AI = new AllocaInst(I.getType(), 0, I.getName(), - I.getParent()->getParent()->getEntryBlock().begin()); - CurAST->add(AI); - } - - // Secondly, insert load instructions for each use of the instruction - // outside of the loop. - while (!I.use_empty()) { - Instruction *U = cast<Instruction>(I.use_back()); - - // If the user is a PHI Node, we actually have to insert load instructions - // in all predecessor blocks, not in the PHI block itself! - if (PHINode *UPN = dyn_cast<PHINode>(U)) { - // Only insert into each predecessor once, so that we don't have - // different incoming values from the same block! - std::map<BasicBlock*, Value*> InsertedBlocks; - for (unsigned i = 0, e = UPN->getNumIncomingValues(); i != e; ++i) - if (UPN->getIncomingValue(i) == &I) { - BasicBlock *Pred = UPN->getIncomingBlock(i); - Value *&PredVal = InsertedBlocks[Pred]; - if (!PredVal) { - // Insert a new load instruction right before the terminator in - // the predecessor block. - PredVal = new LoadInst(AI, "", Pred->getTerminator()); - CurAST->add(cast<LoadInst>(PredVal)); - } - - UPN->setIncomingValue(i, PredVal); - } - - } else { - LoadInst *L = new LoadInst(AI, "", U); - U->replaceUsesOfWith(&I, L); - CurAST->add(L); - } - } - - // Thirdly, insert a copy of the instruction in each exit block of the loop - // that is dominated by the instruction, storing the result into the memory - // location. Be careful not to insert the instruction into any particular - // basic block more than once. - std::set<BasicBlock*> InsertedBlocks; - BasicBlock *InstOrigBB = I.getParent(); - - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { - BasicBlock *ExitBlock = ExitBlocks[i]; - - if (isExitBlockDominatedByBlockInLoop(ExitBlock, InstOrigBB)) { - // If we haven't already processed this exit block, do so now. - if (InsertedBlocks.insert(ExitBlock).second) { - // Insert the code after the last PHI node... 
- BasicBlock::iterator InsertPt = ExitBlock->getFirstNonPHI(); - - // If this is the first exit block processed, just move the original - // instruction, otherwise clone the original instruction and insert - // the copy. - Instruction *New; - if (InsertedBlocks.size() == 1) { - I.removeFromParent(); - ExitBlock->getInstList().insert(InsertPt, &I); - New = &I; - } else { - New = I.clone(); - CurAST->copyValue(&I, New); - if (!I.getName().empty()) - New->setName(I.getName()+".le"); - ExitBlock->getInstList().insert(InsertPt, New); - } - - // Now that we have inserted the instruction, store it into the alloca - if (AI) new StoreInst(New, AI, InsertPt); - } - } - } - - // If the instruction doesn't dominate any exit blocks, it must be dead. - if (InsertedBlocks.empty()) { - CurAST->deleteValue(&I); - I.eraseFromParent(); - } - - // Finally, promote the fine value to SSA form. - if (AI) { - std::vector<AllocaInst*> Allocas; - Allocas.push_back(AI); - PromoteMemToReg(Allocas, *DT, *DF, CurAST); + return; + } + + // Otherwise, if we have multiple exits, use the SSAUpdater to do all of the + // hard work of inserting PHI nodes as necessary. + SmallVector<PHINode*, 8> NewPHIs; + SSAUpdater SSA(&NewPHIs); + + if (!I.use_empty()) + SSA.Initialize(I.getType(), I.getName()); + + // Insert a copy of the instruction in each exit block of the loop that is + // dominated by the instruction. Each exit block is known to only be in the + // ExitBlocks list once. + BasicBlock *InstOrigBB = I.getParent(); + unsigned NumInserted = 0; + + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBlock = ExitBlocks[i]; + + if (!isExitBlockDominatedByBlockInLoop(ExitBlock, InstOrigBB)) + continue; + + // Insert the code after the last PHI node. + BasicBlock::iterator InsertPt = ExitBlock->getFirstNonPHI(); + + // If this is the first exit block processed, just move the original + // instruction, otherwise clone the original instruction and insert + // the copy. + Instruction *New; + if (NumInserted++ == 0) { + I.moveBefore(InsertPt); + New = &I; + } else { + New = I.clone(); + if (!I.getName().empty()) + New->setName(I.getName()+".le"); + ExitBlock->getInstList().insert(InsertPt, New); } + + // Now that we have inserted the instruction, inform SSAUpdater. + if (!I.use_empty()) + SSA.AddAvailableValue(ExitBlock, New); } + + // If the instruction doesn't dominate any exit blocks, it must be dead. + if (NumInserted == 0) { + CurAST->deleteValue(&I); + if (!I.use_empty()) + I.replaceAllUsesWith(UndefValue::get(I.getType())); + I.eraseFromParent(); + return; + } + + // Next, rewrite uses of the instruction, inserting PHI nodes as needed. + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; ) { + // Grab the use before incrementing the iterator. + Use &U = UI.getUse(); + // Increment the iterator before removing the use from the list. + ++UI; + SSA.RewriteUseAfterInsertions(U); + } + + // Update CurAST for NewPHIs if I had pointer type. + if (I.getType()->isPointerTy()) + for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) + CurAST->copyValue(&I, NewPHIs[i]); + + // Finally, remove the instruction from CurAST. It is no longer in the loop. + CurAST->deleteValue(&I); } /// hoist - When an instruction is found to only use loop invariant operands @@ -602,12 +596,8 @@ void LICM::hoist(Instruction &I) { DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I << "\n"); - // Remove the instruction from its current basic block... but don't delete the - // instruction. 
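// The removed two-step idiom visible here (removeFromParent followed by
// getInstList().insert) is exactly what Instruction::moveBefore bundles:
//
//   I.removeFromParent();
//   Preheader->getInstList().insert(Preheader->getTerminator(), &I);
//
// collapses to
//
//   I.moveBefore(Preheader->getTerminator());
//
// which is the form the rewritten hoist() and sink() use.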
- I.removeFromParent(); - - // Insert the new node in Preheader, before the terminator. - Preheader->getInstList().insert(Preheader->getTerminator(), &I); + // Move the new node to the Preheader, before its terminator. + I.moveBefore(Preheader->getTerminator()); if (isa<LoadInst>(I)) ++NumMovedLoads; else if (isa<CallInst>(I)) ++NumMovedCalls; @@ -647,223 +637,269 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { return true; } - -/// PromoteValuesInLoop - Try to promote memory values to scalars by sinking +/// PromoteAliasSet - Try to promote memory values to scalars by sinking /// stores out of the loop and moving loads to before the loop. We do this by /// looping over the stores in the loop, looking for stores to Must pointers -/// which are loop invariant. We promote these memory locations to use allocas -/// instead. These allocas can easily be raised to register values by the -/// PromoteMem2Reg functionality. +/// which are loop invariant. /// -void LICM::PromoteValuesInLoop() { - // PromotedValues - List of values that are promoted out of the loop. Each - // value has an alloca instruction for it, and a canonical version of the - // pointer. - std::vector<std::pair<AllocaInst*, Value*> > PromotedValues; - std::map<Value*, AllocaInst*> ValueToAllocaMap; // Map of ptr to alloca - - FindPromotableValuesInLoop(PromotedValues, ValueToAllocaMap); - if (ValueToAllocaMap.empty()) return; // If there are values to promote. - - Changed = true; - NumPromoted += PromotedValues.size(); - - std::vector<Value*> PointerValueNumbers; - - // Emit a copy from the value into the alloca'd value in the loop preheader - TerminatorInst *LoopPredInst = Preheader->getTerminator(); - for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i) { - Value *Ptr = PromotedValues[i].second; - - // If we are promoting a pointer value, update alias information for the - // inserted load. - Value *LoadValue = 0; - if (cast<PointerType>(Ptr->getType())->getElementType()->isPointerTy()) { - // Locate a load or store through the pointer, and assign the same value - // to LI as we are loading or storing. Since we know that the value is - // stored in this loop, this will always succeed. - for (Value::use_iterator UI = Ptr->use_begin(), E = Ptr->use_end(); - UI != E; ++UI) { - User *U = *UI; - if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - LoadValue = LI; - break; - } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - if (SI->getOperand(1) == Ptr) { - LoadValue = SI->getOperand(0); - break; - } - } - } - assert(LoadValue && "No store through the pointer found!"); - PointerValueNumbers.push_back(LoadValue); // Remember this for later. - } - - // Load from the memory we are promoting. - LoadInst *LI = new LoadInst(Ptr, Ptr->getName()+".promoted", LoopPredInst); - - if (LoadValue) CurAST->copyValue(LoadValue, LI); - - // Store into the temporary alloca. - new StoreInst(LI, PromotedValues[i].first, LoopPredInst); - } +void LICM::PromoteAliasSet(AliasSet &AS) { + // We can promote this alias set if it has a store, if it is a "Must" alias + // set, if the pointer is loop invariant, and if we are not eliminating any + // volatile loads or stores. 
+ if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() || + AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue())) + return; + + assert(!AS.empty() && + "Must alias set should have at least one pointer element in it!"); + Value *SomePtr = AS.begin()->getValue(); - // Scan the basic blocks in the loop, replacing uses of our pointers with - // uses of the allocas in question. + // It isn't safe to promote a load/store from the loop if the load/store is + // conditional. For example, turning: // - for (Loop::block_iterator I = CurLoop->block_begin(), - E = CurLoop->block_end(); I != E; ++I) { - BasicBlock *BB = *I; - // Rewrite all loads and stores in the block of the pointer... - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) { - if (LoadInst *L = dyn_cast<LoadInst>(II)) { - std::map<Value*, AllocaInst*>::iterator - I = ValueToAllocaMap.find(L->getOperand(0)); - if (I != ValueToAllocaMap.end()) - L->setOperand(0, I->second); // Rewrite load instruction... - } else if (StoreInst *S = dyn_cast<StoreInst>(II)) { - std::map<Value*, AllocaInst*>::iterator - I = ValueToAllocaMap.find(S->getOperand(1)); - if (I != ValueToAllocaMap.end()) - S->setOperand(1, I->second); // Rewrite store instruction... - } - } - } - - // Now that the body of the loop uses the allocas instead of the original - // memory locations, insert code to copy the alloca value back into the - // original memory location on all exits from the loop. Note that we only - // want to insert one copy of the code in each exit block, though the loop may - // exit to the same block more than once. + // for () { if (c) *P += 1; } // - SmallPtrSet<BasicBlock*, 16> ProcessedBlocks; - - SmallVector<BasicBlock*, 8> ExitBlocks; - CurLoop->getExitBlocks(ExitBlocks); - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { - if (!ProcessedBlocks.insert(ExitBlocks[i])) - continue; - - // Copy all of the allocas into their memory locations. - BasicBlock::iterator BI = ExitBlocks[i]->getFirstNonPHI(); - Instruction *InsertPos = BI; - unsigned PVN = 0; - for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i) { - // Load from the alloca. - LoadInst *LI = new LoadInst(PromotedValues[i].first, "", InsertPos); - - // If this is a pointer type, update alias info appropriately. - if (LI->getType()->isPointerTy()) - CurAST->copyValue(PointerValueNumbers[PVN++], LI); - - // Store into the memory we promoted. - new StoreInst(LI, PromotedValues[i].second, InsertPos); - } - } - - // Now that we have done the deed, use the mem2reg functionality to promote - // all of the new allocas we just created into real SSA registers. + // into: // - std::vector<AllocaInst*> PromotedAllocas; - PromotedAllocas.reserve(PromotedValues.size()); - for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i) - PromotedAllocas.push_back(PromotedValues[i].first); - PromoteMemToReg(PromotedAllocas, *DT, *DF, CurAST); -} - -/// FindPromotableValuesInLoop - Check the current loop for stores to definite -/// pointers, which are not loaded and stored through may aliases and are safe -/// for promotion. If these are found, create an alloca for the value, add it -/// to the PromotedValues list, and keep track of the mapping from value to -/// alloca. 
-void LICM::FindPromotableValuesInLoop(
-    std::vector<std::pair<AllocaInst*, Value*> > &PromotedValues,
-    std::map<Value*, AllocaInst*> &ValueToAllocaMap) {
-  Instruction *FnStart = CurLoop->getHeader()->getParent()->begin()->begin();
-
-  // Loop over all of the alias sets in the tracker object.
-  for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
-       I != E; ++I) {
-    AliasSet &AS = *I;
-    // We can promote this alias set if it has a store, if it is a "Must" alias
-    // set, if the pointer is loop invariant, and if we are not eliminating any
-    // volatile loads or stores.
-    if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
-        AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue()))
-      continue;
+  //    tmp = *P;  for () { if (c) tmp +=1; } *P = tmp;
+  //
+  // is not safe, because *P may only be valid to access if 'c' is true.
+  //
+  // It is safe to promote P if all uses are direct load/stores and if at
+  // least one is guaranteed to be executed.
+  bool GuaranteedToExecute = false;
+
+  SmallVector<Instruction*, 64> LoopUses;
+  SmallPtrSet<Value*, 4> PointerMustAliases;
+
+  // Check that all of the pointers in the alias set have the same type.  We
+  // cannot (yet) promote a memory location that is loaded and stored in
+  // different sizes.
+  for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
+    Value *ASIV = ASI->getValue();
+    PointerMustAliases.insert(ASIV);
 
-    assert(!AS.empty() &&
-           "Must alias set should have at least one pointer element in it!");
-    Value *V = AS.begin()->getValue();
-
     // Check that all of the pointers in the alias set have the same type.  We
     // cannot (yet) promote a memory location that is loaded and stored in
     // different sizes.
-    {
-      bool PointerOk = true;
-      for (AliasSet::iterator I = AS.begin(), E = AS.end(); I != E; ++I)
-        if (V->getType() != I->getValue()->getType()) {
-          PointerOk = false;
-          break;
-        }
-      if (!PointerOk)
-        continue;
-    }
-
-    // It isn't safe to promote a load/store from the loop if the load/store is
-    // conditional.  For example, turning:
-    //
-    //    for () { if (c) *P += 1; }
-    //
-    // into:
-    //
-    //    tmp = *P;  for () { if (c) tmp +=1; } *P = tmp;
-    //
-    // is not safe, because *P may only be valid to access if 'c' is true.
-    //
-    // It is safe to promote P if all uses are direct load/stores and if at
-    // least one is guaranteed to be executed.
-    bool GuaranteedToExecute = false;
-    bool InvalidInst = false;
-    for (Value::use_iterator UI = V->use_begin(), UE = V->use_end();
+    if (SomePtr->getType() != ASIV->getType())
+      return;
+
+    for (Value::use_iterator UI = ASIV->use_begin(), UE = ASIV->use_end();
          UI != UE; ++UI) {
-      // Ignore instructions not in this loop.
+      // Ignore instructions that are outside the loop.
       Instruction *Use = dyn_cast<Instruction>(*UI);
       if (!Use || !CurLoop->contains(Use))
         continue;
-
-      if (!isa<LoadInst>(Use) && !isa<StoreInst>(Use)) {
-        InvalidInst = true;
-        break;
-      }
+
+      // If there is a non-load/store instruction in the loop, we can't promote
+      // it.
+      if (isa<LoadInst>(Use))
+        assert(!cast<LoadInst>(Use)->isVolatile() && "AST broken");
+      else if (isa<StoreInst>(Use)) {
+        assert(!cast<StoreInst>(Use)->isVolatile() && "AST broken");
+        if (Use->getOperand(0) == ASIV) return;
+      } else
+        return; // Not a load or store.
 
       if (!GuaranteedToExecute)
         GuaranteedToExecute = isSafeToExecuteUnconditionally(*Use);
+
+      LoopUses.push_back(Use);
     }
+  }
+
+  // If there isn't a guaranteed-to-execute instruction, we can't promote.
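// The hazard described above, spelled out in plain C++ (an illustration only,
// mirroring the `for () { if (c) *P += 1; }` example in the comment earlier
// in this function):
static void loopBeforePromotion(int *P, int N, bool C) {
  for (int I = 0; I != N; ++I)
    if (C) *P += 1;   // *P is accessed only when C is true
}
static void unsafePromotion(int *P, int N, bool C) {
  int Tmp = *P;       // unconditional load: may trap when P is not valid
  for (int I = 0; I != N; ++I)
    if (C) Tmp += 1;
  *P = Tmp;           // unconditional store: same problem
}
// Once some load or store of P is guaranteed to execute, the unconditional
// preheader load and exit-block store cannot introduce a trap the original
// loop did not already have.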
+  if (!GuaranteedToExecute)
+    return;
+
+  // Otherwise, this is safe to promote, let's do it!
+  DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n');
+  Changed = true;
+  ++NumPromoted;
 
-  // If there is an non-load/store instruction in the loop, we can't promote
-  // it.  If there isn't a guaranteed-to-execute instruction, we can't
-  // promote.
-  if (InvalidInst || !GuaranteedToExecute)
+  // We use the SSAUpdater interface to insert phi nodes as required.
+  SmallVector<PHINode*, 16> NewPHIs;
+  SSAUpdater SSA(&NewPHIs);
+
+  // It wants to know some value of the same type as what we'll be inserting.
+  Value *SomeValue;
+  if (isa<LoadInst>(LoopUses[0]))
+    SomeValue = LoopUses[0];
+  else
+    SomeValue = cast<StoreInst>(LoopUses[0])->getOperand(0);
+  SSA.Initialize(SomeValue->getType(), SomeValue->getName());
+
+  // First step: bucket up uses of the pointers by the block they occur in.
+  // This is important because we have to handle multiple defs/uses in a block
+  // ourselves: SSAUpdater is purely for cross-block references.
+  // FIXME: Want a TinyVector<Instruction*> since there is usually 0/1 element.
+  DenseMap<BasicBlock*, std::vector<Instruction*> > UsesByBlock;
+  for (unsigned i = 0, e = LoopUses.size(); i != e; ++i) {
+    Instruction *User = LoopUses[i];
+    UsesByBlock[User->getParent()].push_back(User);
+  }
+
+  // Okay, now we can iterate over all the blocks in the loop with uses,
+  // processing them.  Keep track of which loads are loading a live-in value.
+  SmallVector<LoadInst*, 32> LiveInLoads;
+  DenseMap<Value*, Value*> ReplacedLoads;
+
+  for (unsigned LoopUse = 0, e = LoopUses.size(); LoopUse != e; ++LoopUse) {
+    Instruction *User = LoopUses[LoopUse];
+    std::vector<Instruction*> &BlockUses = UsesByBlock[User->getParent()];
+
+    // If this block has already been processed, ignore this repeat use.
+    if (BlockUses.empty()) continue;
+
+    // Okay, this is the first use in the block.  If this block just has a
+    // single user in it, we can rewrite it trivially.
+    if (BlockUses.size() == 1) {
+      // If it is a store, it is a trivial def of the value in the block.
+      if (isa<StoreInst>(User)) {
+        SSA.AddAvailableValue(User->getParent(),
+                              cast<StoreInst>(User)->getOperand(0));
+      } else {
+        // Otherwise it is a load, queue it to rewrite as a live-in load.
+        LiveInLoads.push_back(cast<LoadInst>(User));
+      }
+      BlockUses.clear(); continue;
+    }
 
-    const Type *Ty = cast<PointerType>(V->getType())->getElementType();
-    AllocaInst *AI = new AllocaInst(Ty, 0, V->getName()+".tmp", FnStart);
-    PromotedValues.push_back(std::make_pair(AI, V));
+    // Otherwise, check to see if this block is all loads.  If so, we can queue
+    // them all as live in loads.
+    bool HasStore = false;
+    for (unsigned i = 0, e = BlockUses.size(); i != e; ++i) {
+      if (isa<StoreInst>(BlockUses[i])) {
+        HasStore = true;
+        break;
+      }
+    }
+
+    if (!HasStore) {
+      for (unsigned i = 0, e = BlockUses.size(); i != e; ++i)
+        LiveInLoads.push_back(cast<LoadInst>(BlockUses[i]));
+      BlockUses.clear();
+      continue;
+    }
 
-    // Update the AST and alias analysis.
-    CurAST->copyValue(V, AI);
+    // Otherwise, we have mixed loads and stores (or just a bunch of stores).
+    // Since SSAUpdater is purely for cross-block values, we need to determine
+    // the order of these instructions in the block.  If the first use in the
+    // block is a load, then it uses the live in value.  The last store defines
+    // the live out value.  We handle this by doing a linear scan of the block.
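// The scan below implements a simple block-local rule; a toy model
// (hypothetical types, not LLVM's) of the same logic: walking the block top
// to bottom, a load seen before any store reads the live-in value, a load
// after a store reads the most recent stored value, and the last store
// defines the block's live-out.
struct MemAccess { bool IsStore; long Val; }; // hypothetical stand-in
static long blockLiveOut(const MemAccess *Accs, int N, long LiveIn) {
  long Cur = LiveIn;        // what a load would observe at this point
  for (int I = 0; I != N; ++I)
    if (Accs[I].IsStore)
      Cur = Accs[I].Val;    // a store forwards its operand to later loads
  return Cur;               // the value live out of the block
}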
+ BasicBlock *BB = User->getParent(); + Value *StoredValue = 0; + for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) { + if (LoadInst *L = dyn_cast<LoadInst>(II)) { + // If this is a load from an unrelated pointer, ignore it. + if (!PointerMustAliases.count(L->getOperand(0))) continue; + + // If we haven't seen a store yet, this is a live in use, otherwise + // use the stored value. + if (StoredValue) { + L->replaceAllUsesWith(StoredValue); + ReplacedLoads[L] = StoredValue; + } else { + LiveInLoads.push_back(L); + } + continue; + } + + if (StoreInst *S = dyn_cast<StoreInst>(II)) { + // If this is a store to an unrelated pointer, ignore it. + if (!PointerMustAliases.count(S->getOperand(1))) continue; - for (AliasSet::iterator I = AS.begin(), E = AS.end(); I != E; ++I) - ValueToAllocaMap.insert(std::make_pair(I->getValue(), AI)); + // Remember that this is the active value in the block. + StoredValue = S->getOperand(0); + } + } + + // The last stored value that happened is the live-out for the block. + assert(StoredValue && "Already checked that there is a store in block"); + SSA.AddAvailableValue(BB, StoredValue); + BlockUses.clear(); + } + + // Now that all the intra-loop values are classified, set up the preheader. + // It gets a load of the pointer we're promoting, and it is the live-out value + // from the preheader. + LoadInst *PreheaderLoad = new LoadInst(SomePtr,SomePtr->getName()+".promoted", + Preheader->getTerminator()); + SSA.AddAvailableValue(Preheader, PreheaderLoad); + + // Now that the preheader is good to go, set up the exit blocks. Each exit + // block gets a store of the live-out values that feed them. Since we've + // already told the SSA updater about the defs in the loop and the preheader + // definition, it is all set and we can start using it. + SmallVector<BasicBlock*, 8> ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBlock = ExitBlocks[i]; + Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); + Instruction *InsertPos = ExitBlock->getFirstNonPHI(); + new StoreInst(LiveInValue, SomePtr, InsertPos); + } - DEBUG(dbgs() << "LICM: Promoting value: " << *V << "\n"); + // Okay, now we rewrite all loads that use live-in values in the loop, + // inserting PHI nodes as necessary. + for (unsigned i = 0, e = LiveInLoads.size(); i != e; ++i) { + LoadInst *ALoad = LiveInLoads[i]; + Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent()); + ALoad->replaceAllUsesWith(NewVal); + CurAST->copyValue(ALoad, NewVal); + ReplacedLoads[ALoad] = NewVal; + } + + // If the preheader load is itself a pointer, we need to tell alias analysis + // about the new pointer we created in the preheader block and about any PHI + // nodes that just got inserted. + if (PreheaderLoad->getType()->isPointerTy()) { + // Copy any value stored to or loaded from a must-alias of the pointer. + CurAST->copyValue(SomeValue, PreheaderLoad); + + for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) + CurAST->copyValue(SomeValue, NewPHIs[i]); } + + // Now that everything is rewritten, delete the old instructions from the body + // of the loop. They should all be dead now. + for (unsigned i = 0, e = LoopUses.size(); i != e; ++i) { + Instruction *User = LoopUses[i]; + + // If this is a load that still has uses, then the load must have been added + // as a live value in the SSAUpdate data structure for a block (e.g. because + // the loaded value was stored later). 
In this case, we need to recursively
+    // propagate the updates until we get to the real value.
+    if (!User->use_empty()) {
+      Value *NewVal = ReplacedLoads[User];
+      assert(NewVal && "not a replaced load?");
+
+      // Propagate down to the ultimate replacee.  The intermediate loads
+      // could theoretically already have been deleted, so we don't want to
+      // dereference the Value*'s.
+      DenseMap<Value*, Value*>::iterator RLI = ReplacedLoads.find(NewVal);
+      while (RLI != ReplacedLoads.end()) {
+        NewVal = RLI->second;
+        RLI = ReplacedLoads.find(NewVal);
+      }
+
+      User->replaceAllUsesWith(NewVal);
+      CurAST->copyValue(User, NewVal);
+    }
+
+    CurAST->deleteValue(User);
+    User->eraseFromParent();
+  }
+
+  // whew, we're done!
 }
+
 /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
 void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
-  AliasSetTracker *AST = LoopToAliasMap[L];
+  AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
   if (!AST)
     return;
@@ -873,7 +909,7 @@ void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
 /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
 /// set.
 void LICM::deleteAnalysisValue(Value *V, Loop *L) {
-  AliasSetTracker *AST = LoopToAliasMap[L];
+  AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
   if (!AST)
     return;
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index e4894e99b68f1..543dfc1cba096 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -28,7 +28,7 @@ namespace {
   class LoopDeletion : public LoopPass {
   public:
     static char ID; // Pass ID, replacement for typeid
-    LoopDeletion() : LoopPass(&ID) {}
+    LoopDeletion() : LoopPass(ID) {}
 
     // Possibly eliminate loop L if it is dead.
     bool runOnLoop(Loop* L, LPPassManager& LPM);
@@ -38,9 +38,9 @@ namespace {
                    bool &Changed, BasicBlock *Preheader);
 
     virtual void getAnalysisUsage(AnalysisUsage& AU) const {
-      AU.addRequired<ScalarEvolution>();
       AU.addRequired<DominatorTree>();
       AU.addRequired<LoopInfo>();
+      AU.addRequired<ScalarEvolution>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addRequiredID(LCSSAID);
@@ -55,7 +55,8 @@ namespace {
 }
 
 char LoopDeletion::ID = 0;
-static RegisterPass<LoopDeletion> X("loop-deletion", "Delete dead loops");
+INITIALIZE_PASS(LoopDeletion, "loop-deletion",
+                "Delete dead loops", false, false);
 
 Pass* llvm::createLoopDeletionPass() {
   return new LoopDeletion();
diff --git a/lib/Transforms/Scalar/LoopIndexSplit.cpp b/lib/Transforms/Scalar/LoopIndexSplit.cpp
index 31058e5759a4a..a4336743a8f01 100644
--- a/lib/Transforms/Scalar/LoopIndexSplit.cpp
+++ b/lib/Transforms/Scalar/LoopIndexSplit.cpp
@@ -74,7 +74,7 @@ namespace {
   class LoopIndexSplit : public LoopPass {
   public:
     static char ID; // Pass ID, replacement for typeid
-    LoopIndexSplit() : LoopPass(&ID) {}
+    LoopIndexSplit() : LoopPass(ID) {}
 
     // Index split Loop L. Return true if loop is split.
bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -197,8 +197,8 @@ namespace { } char LoopIndexSplit::ID = 0; -static RegisterPass<LoopIndexSplit> -X("loop-index-split", "Index Split Loops"); +INITIALIZE_PASS(LoopIndexSplit, "loop-index-split", + "Index Split Loops", false, false); Pass *llvm::createLoopIndexSplitPass() { return new LoopIndexSplit(); @@ -677,7 +677,7 @@ void LoopIndexSplit::removeBlocks(BasicBlock *DeadBB, Loop *LP, for(pred_iterator PI = pred_begin(FrontierBB), PE = pred_end(FrontierBB); PI != PE; ++PI) { BasicBlock *P = *PI; - if (P == DeadBB || DT->dominates(DeadBB, P)) + if (DT->dominates(DeadBB, P)) PredBlocks.push_back(P); } @@ -799,7 +799,7 @@ void LoopIndexSplit::moveExitCondition(BasicBlock *CondBB, BasicBlock *ActiveBB, // the dominance frontiers. for (Loop::block_iterator I = LP->block_begin(), E = LP->block_end(); I != E; ++I) { - if (*I == CondBB || !DT->dominates(CondBB, *I)) continue; + if (!DT->properlyDominates(CondBB, *I)) continue; DominanceFrontier::iterator BBDF = DF->find(*I); DominanceFrontier::DomSetType::iterator DomSetI = BBDF->second.begin(); DominanceFrontier::DomSetType::iterator DomSetE = BBDF->second.end(); @@ -1183,7 +1183,7 @@ bool LoopIndexSplit::cleanBlock(BasicBlock *BB) { bool usedOutsideBB = false; for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; ++UI) { - Instruction *U = cast<Instruction>(UI); + Instruction *U = cast<Instruction>(*UI); if (U->getParent() != BB) usedOutsideBB = true; } diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 16c4a15d3550a..65acc1d9257ad 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -35,7 +35,7 @@ namespace { class LoopRotate : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopRotate() : LoopPass(&ID) {} + LoopRotate() : LoopPass(ID) {} // Rotate Loop L as many times as possible. Return true if // loop is rotated at least once. @@ -43,15 +43,15 @@ namespace { // LCSSA form makes instruction renaming easier. virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<DominatorTree>(); + AU.addPreserved<DominanceFrontier>(); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); AU.addPreserved<ScalarEvolution>(); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); - AU.addPreserved<DominatorTree>(); - AU.addPreserved<DominanceFrontier>(); } // Helper functions @@ -79,7 +79,7 @@ namespace { } char LoopRotate::ID = 0; -static RegisterPass<LoopRotate> X("loop-rotate", "Rotate Loops"); +INITIALIZE_PASS(LoopRotate, "loop-rotate", "Rotate Loops", false, false); Pass *llvm::createLoopRotatePass() { return new LoopRotate(); } @@ -221,7 +221,7 @@ bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) { // The value now exits in two versions: the initial value in the preheader // and the loop "next" value in the original header. - SSA.Initialize(OrigHeaderVal); + SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName()); SSA.AddAvailableValue(OrigHeader, OrigHeaderVal); SSA.AddAvailableValue(OrigPreHeader, OrigPreHeaderVal); @@ -261,6 +261,26 @@ bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) { // NewHeader is now the header of the loop. L->moveToHeader(NewHeader); + // Move the original header to the bottom of the loop, where it now more + // naturally belongs. 
This isn't necessary for correctness, and CodeGen can + // usually reorder blocks on its own to fix things like this up, but it's + // still nice to keep the IR readable. + // + // The original header should have only one predecessor at this point, since + // we checked that the loop had a proper preheader and unique backedge before + // we started. + assert(OrigHeader->getSinglePredecessor() && + "Original loop header has too many predecessors after loop rotation!"); + OrigHeader->moveAfter(OrigHeader->getSinglePredecessor()); + + // Also, since this original header only has one predecessor, zap its + // PHI nodes, which are now trivial. + FoldSingleEntryPHINodes(OrigHeader); + + // TODO: We could just go ahead and merge OrigHeader into its predecessor + // at this point, if we don't mind updating dominator info. + + // Establish a new preheader, update dominators, etc. preserveCanonicalLoopForm(LPM); ++NumRotated; diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 1f9b4156b9cd6..e8dc5d3a640e6 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -161,9 +161,10 @@ RegUseTracker::DropUse(size_t LUIdx) { bool RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const { - if (!RegUsesMap.count(Reg)) return false; - const SmallBitVector &UsedByIndices = - RegUsesMap.find(Reg)->second.UsedByIndices; + RegUsesTy::const_iterator I = RegUsesMap.find(Reg); + if (I == RegUsesMap.end()) + return false; + const SmallBitVector &UsedByIndices = I->second.UsedByIndices; int i = UsedByIndices.find_first(); if (i == -1) return false; if ((size_t)i != LUIdx) return true; @@ -441,12 +442,12 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, // Distribute the sdiv over addrec operands, if the addrec doesn't overflow. 
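// A worked instance of the rule in the comment above (plain arithmetic, not
// SCEV API): dividing the add recurrence {8,+,4} exactly by 2 yields {4,+,2},
// because (8 + 4*i) / 2 == 4 + 2*i on every iteration i. Note that the hunk
// below reorders the two checks so that a step which does not divide exactly
// bails out before the start is even computed.
static long addRecValue(long Start, long Step, long I) {
  return Start + Step * I; // value of {Start,+,Step} on iteration I
}
// addRecValue(8, 4, I) / 2 == addRecValue(4, 2, I) for all I >= 0.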
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) { if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) { - const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE, - IgnoreSignificantBits); - if (!Start) return 0; const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE, IgnoreSignificantBits); if (!Step) return 0; + const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE, + IgnoreSignificantBits); + if (!Start) return 0; return SE.getAddRecExpr(Start, Step, AR->getLoop()); } return 0; @@ -505,12 +506,14 @@ static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end()); int64_t Result = ExtractImmediate(NewOps.front(), SE); - S = SE.getAddExpr(NewOps); + if (Result != 0) + S = SE.getAddExpr(NewOps); return Result; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end()); int64_t Result = ExtractImmediate(NewOps.front(), SE); - S = SE.getAddRecExpr(NewOps, AR->getLoop()); + if (Result != 0) + S = SE.getAddRecExpr(NewOps, AR->getLoop()); return Result; } return 0; @@ -528,12 +531,14 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end()); GlobalValue *Result = ExtractSymbol(NewOps.back(), SE); - S = SE.getAddExpr(NewOps); + if (Result) + S = SE.getAddExpr(NewOps); return Result; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end()); GlobalValue *Result = ExtractSymbol(NewOps.front(), SE); - S = SE.getAddRecExpr(NewOps, AR->getLoop()); + if (Result) + S = SE.getAddRecExpr(NewOps, AR->getLoop()); return Result; } return 0; @@ -965,6 +970,12 @@ public: /// may be used. bool AllFixupsOutsideLoop; + /// WidestFixupType - This records the widest use type for any fixup using + /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different + /// max fixup widths to be equivalent, because the narrower one may be relying + /// on the implicit truncation to truncate away bogus bits. + const Type *WidestFixupType; + /// Formulae - A list of ways to build a value that can satisfy this user. /// After the list is populated, one of these is selected heuristically and /// used to formulate a replacement for OperandValToReplace in UserInst. 
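// Why WidestFixupType matters, as a plain C++ analogue (an illustration, not
// patch code): a narrow fixup may depend on implicit truncation to discard
// high bits, so two uses whose widest fixup widths differ cannot be treated
// as equivalent even when their formulae use the same registers.
#include <stdint.h>
static uint32_t narrowFixup(uint64_t Wide) {
  // 0x100000001 truncates to 1: the "bogus bits" vanish only because this
  // use is narrow, so a formula proven good for a wide use may expose them.
  return static_cast<uint32_t>(Wide);
}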
@@ -976,15 +987,14 @@ public:
   LSRUse(KindType K, const Type *T) : Kind(K), AccessTy(T),
                                       MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
-                                      AllFixupsOutsideLoop(true) {}
+                                      AllFixupsOutsideLoop(true),
+                                      WidestFixupType(0) {}
 
   bool HasFormulaWithSameRegs(const Formula &F) const;
   bool InsertFormula(const Formula &F);
   void DeleteFormula(Formula &F);
   void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
 
-  void check() const;
-
   void print(raw_ostream &OS) const;
   void dump() const;
 };
@@ -1076,13 +1086,16 @@ void LSRUse::print(raw_ostream &OS) const {
   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
        E = Offsets.end(); I != E; ++I) {
     OS << *I;
-    if (next(I) != E)
+    if (llvm::next(I) != E)
       OS << ',';
   }
   OS << '}';
 
   if (AllFixupsOutsideLoop)
     OS << ", all-fixups-outside-loop";
+
+  if (WidestFixupType)
+    OS << ", widest fixup type: " << *WidestFixupType;
 }
 
 void LSRUse::dump() const {
@@ -1354,6 +1367,10 @@ public:
   void FilterOutUndesirableDedicatedRegisters();
 
   size_t EstimateSearchSpaceComplexity() const;
+  void NarrowSearchSpaceByDetectingSupersets();
+  void NarrowSearchSpaceByCollapsingUnrolledCode();
+  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  void NarrowSearchSpaceByPickingWinnerRegs();
   void NarrowSearchSpaceUsingHeuristics();
 
   void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
@@ -1587,7 +1604,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
   const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
 
   // Add one to the backedge-taken count to get the trip count.
-  const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One);
+  const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
   if (IterationCount != SE.getSCEV(Sel)) return Cond;
 
   // Check for a max calculation that matches the pattern. There's no check
@@ -1919,32 +1936,41 @@ void LSRInstance::DeleteUse(LSRUse &LU) {
 LSRUse *
 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
                                        const LSRUse &OrigLU) {
-  // Search all uses for the formula. This could be more clever. Ignore
-  // ICmpZero uses because they may contain formulae generated by
-  // GenerateICmpZeroScales, in which case adding fixup offsets may
-  // be invalid.
+  // Search all uses for the formula. This could be more clever.
   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
     LSRUse &LU = Uses[LUIdx];
+    // Check whether this use is close enough to OrigLU, to see whether it's
+    // worthwhile looking through its formulae.
+    // Ignore ICmpZero uses because they may contain formulae generated by
+    // GenerateICmpZeroScales, in which case adding fixup offsets may
+    // be invalid.
     if (&LU != &OrigLU &&
         LU.Kind != LSRUse::ICmpZero &&
         LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
+        LU.WidestFixupType == OrigLU.WidestFixupType &&
         LU.HasFormulaWithSameRegs(OrigF)) {
+      // Scan through this use's formulae.
       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
            E = LU.Formulae.end(); I != E; ++I) {
         const Formula &F = *I;
+        // Check to see if this formula has the same registers and symbols
+        // as OrigF.
         if (F.BaseRegs == OrigF.BaseRegs &&
             F.ScaledReg == OrigF.ScaledReg &&
            F.AM.BaseGV == OrigF.AM.BaseGV &&
-            F.AM.Scale == OrigF.AM.Scale &&
-            LU.Kind) {
+            F.AM.Scale == OrigF.AM.Scale) {
           if (F.AM.BaseOffs == 0)
             return &LU;
+          // This is the formula where all the registers and symbols matched;
+          // there aren't going to be any others. Since we declined it, we
+          // can skip the rest of the formulae and proceed to the next LSRUse.
break; } } } } + // Nothing looked good. return 0; } @@ -1976,7 +2002,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() { for (SmallSetVector<const SCEV *, 4>::const_iterator I = Strides.begin(), E = Strides.end(); I != E; ++I) for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter = - next(I); NewStrideIter != E; ++NewStrideIter) { + llvm::next(I); NewStrideIter != E; ++NewStrideIter) { const SCEV *OldStride = *I; const SCEV *NewStride = *NewStrideIter; @@ -2066,6 +2092,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + if (!LU.WidestFixupType || + SE.getTypeSizeInBits(LU.WidestFixupType) < + SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) + LU.WidestFixupType = LF.OperandValToReplace->getType(); // If this is the first use of this LSRUse, give it a formula. if (LU.Formulae.empty()) { @@ -2195,6 +2225,10 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + if (!LU.WidestFixupType || + SE.getTypeSizeInBits(LU.WidestFixupType) < + SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) + LU.WidestFixupType = LF.OperandValToReplace->getType(); InsertSupplementalFormula(U, LU, LF.LUIdx); CountRegisters(LU.Formulae.back(), Uses.size() - 1); break; @@ -2207,14 +2241,13 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { /// separate registers. If C is non-null, multiply each subexpression by C. static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl<const SCEV *> &Ops, - SmallVectorImpl<const SCEV *> &UninterestingOps, const Loop *L, ScalarEvolution &SE) { if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { // Break out add operands. for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); I != E; ++I) - CollectSubexprs(*I, C, Ops, UninterestingOps, L, SE); + CollectSubexprs(*I, C, Ops, L, SE); return; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Split a non-zero base out of an addrec. @@ -2222,8 +2255,8 @@ static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), AR->getStepRecurrence(SE), AR->getLoop()), - C, Ops, UninterestingOps, L, SE); - CollectSubexprs(AR->getStart(), C, Ops, UninterestingOps, L, SE); + C, Ops, L, SE); + CollectSubexprs(AR->getStart(), C, Ops, L, SE); return; } } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { @@ -2233,17 +2266,13 @@ static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, dyn_cast<SCEVConstant>(Mul->getOperand(0))) { CollectSubexprs(Mul->getOperand(1), C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0, - Ops, UninterestingOps, L, SE); + Ops, L, SE); return; } } - // Otherwise use the value itself. Loop-variant "unknown" values are - // uninteresting; we won't be able to do anything meaningful with them. - if (!C && isa<SCEVUnknown>(S) && !S->isLoopInvariant(L)) - UninterestingOps.push_back(S); - else - Ops.push_back(C ? SE.getMulExpr(C, S) : S); + // Otherwise use the value itself, optionally with a scale applied. + Ops.push_back(C ? 
SE.getMulExpr(C, S) : S);
 }
 
 /// GenerateReassociations - Split out subexpressions from adds and the bases of
@@ -2257,19 +2286,19 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
     const SCEV *BaseReg = Base.BaseRegs[i];
 
-    SmallVector<const SCEV *, 8> AddOps, UninterestingAddOps;
-    CollectSubexprs(BaseReg, 0, AddOps, UninterestingAddOps, L, SE);
-
-    // Add any uninteresting values as one register, as we won't be able to
-    // form any interesting reassociation opportunities with them. They'll
-    // just have to be added inside the loop no matter what we do.
-    if (!UninterestingAddOps.empty())
-      AddOps.push_back(SE.getAddExpr(UninterestingAddOps));
+    SmallVector<const SCEV *, 8> AddOps;
+    CollectSubexprs(BaseReg, 0, AddOps, L, SE);
 
     if (AddOps.size() == 1) continue;
 
     for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
          JE = AddOps.end(); J != JE; ++J) {
+
+      // Loop-variant "unknown" values are uninteresting; we won't be able to
+      // do anything meaningful with them.
+      if (isa<SCEVUnknown>(*J) && !(*J)->isLoopInvariant(L))
+        continue;
+
       // Don't pull a constant into a register if the constant could be folded
       // into an immediate field.
       if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset,
@@ -2279,9 +2308,9 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
       // Collect all operands except *J.
       SmallVector<const SCEV *, 8> InnerAddOps
-        ( ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
+        (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
       InnerAddOps.append
-        (next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end());
+        (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end());
 
       // Don't leave just a constant behind in a register if the constant could
       // be folded into an immediate field.
@@ -2377,7 +2406,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
     if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
                    LU.Kind, LU.AccessTy, TLI)) {
       // Add the offset to the base register.
-      const SCEV *NewG = SE.getAddExpr(G, SE.getConstant(G->getType(), *I));
+      const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
       // If it cancelled out, drop the base register, otherwise update it.
       if (NewG->isZero()) {
         std::swap(F.BaseRegs[i], F.BaseRegs.back());
@@ -2778,6 +2807,10 @@ LSRInstance::GenerateAllReuseFormulae() {
   }
 
   GenerateCrossUseConstantOffsets();
+
+  DEBUG(dbgs() << "\n"
+                  "After generating reuse formulae:\n";
+        print_uses(dbgs()));
 }
 
 /// If there are multiple formulae with the same set of registers used
@@ -2876,11 +2909,11 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const {
   return Power;
 }
 
-/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
-/// formulae to choose from, use some rough heuristics to prune down the number
-/// of formulae. This keeps the main solver from taking an extraordinary amount
-/// of time in some worst-case scenarios.
-void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
+/// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset
+/// of the registers of another formula, it won't help reduce register
+/// pressure (though it may not necessarily hurt register pressure); remove
+/// it to simplify the system.
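// A toy rendering of the superset rule stated above (hypothetical containers,
// not LSR's data structures): a formula whose register set strictly contains
// another formula's register set cannot reduce register pressure, so it is
// safe to drop from the search space.
#include <algorithm>
#include <set>
static bool isPrunableSuperset(const std::set<unsigned> &Regs,
                               const std::set<unsigned> &OtherRegs) {
  return Regs.size() > OtherRegs.size() &&
         std::includes(Regs.begin(), Regs.end(),
                       OtherRegs.begin(), OtherRegs.end());
}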
+void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { DEBUG(dbgs() << "The search space is too complex.\n"); @@ -2938,7 +2971,12 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() { DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } +} +/// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers +/// for expressions like A, A+1, A+2, etc., allocate a single register for +/// them. +void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { DEBUG(dbgs() << "The search space is too complex.\n"); @@ -2988,7 +3026,7 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() { if (Fixup.LUIdx == LUIdx) { Fixup.LUIdx = LUThatHas - &Uses.front(); Fixup.Offset += F.AM.BaseOffs; - DEBUG(errs() << "New fixup has offset " + DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n'); } if (Fixup.LUIdx == NumUses-1) @@ -3009,7 +3047,30 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() { DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } +} + +/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call +/// FilterOutUndesirableDedicatedRegisters again, if necessary, now that +/// we've done more filtering, as it may be able to find more formulae to +/// eliminate. +void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ + if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { + DEBUG(dbgs() << "The search space is too complex.\n"); + + DEBUG(dbgs() << "Narrowing the search space by re-filtering out " + "undesirable dedicated registers.\n"); + + FilterOutUndesirableDedicatedRegisters(); + + DEBUG(dbgs() << "After pre-selection:\n"; + print_uses(dbgs())); + } +} +/// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely +/// to be profitable, and then in any use which has any reference to that +/// register, delete all formulae which do not reference that register. +void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { // With all other options exhausted, loop until the system is simple // enough to handle. SmallPtrSet<const SCEV *, 4> Taken; @@ -3071,6 +3132,17 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() { } } +/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of +/// formulae to choose from, use some rough heuristics to prune down the number +/// of formulae. This keeps the main solver from taking an extraordinary amount +/// of time in some worst-case scenarios. +void LSRInstance::NarrowSearchSpaceUsingHeuristics() { + NarrowSearchSpaceByDetectingSupersets(); + NarrowSearchSpaceByCollapsingUnrolledCode(); + NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); + NarrowSearchSpaceByPickingWinnerRegs(); +} + /// SolveRecurse - This is the recursive solver. void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, Cost &SolutionCost, @@ -3614,10 +3686,6 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) // to formulate the values needed for the uses. 
GenerateAllReuseFormulae(); - DEBUG(dbgs() << "\n" - "After generating reuse formulae:\n"; - print_uses(dbgs())); - FilterOutUndesirableDedicatedRegisters(); NarrowSearchSpaceUsingHeuristics(); @@ -3724,15 +3792,15 @@ private: } char LoopStrengthReduce::ID = 0; -static RegisterPass<LoopStrengthReduce> -X("loop-reduce", "Loop Strength Reduction"); +INITIALIZE_PASS(LoopStrengthReduce, "loop-reduce", + "Loop Strength Reduction", false, false); Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { return new LoopStrengthReduce(TLI); } LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli) - : LoopPass(&ID), TLI(tli) {} + : LoopPass(ID), TLI(tli) {} void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { // We split critical edges, so we change the CFG. However, we do update diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 4ad41ae4b59f7..d0edfa2200513 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -26,7 +27,7 @@ using namespace llvm; static cl::opt<unsigned> -UnrollThreshold("unroll-threshold", cl::init(100), cl::Hidden, +UnrollThreshold("unroll-threshold", cl::init(200), cl::Hidden, cl::desc("The cut-off point for automatic loop unrolling")); static cl::opt<unsigned> @@ -42,7 +43,7 @@ namespace { class LoopUnroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopUnroll() : LoopPass(&ID) {} + LoopUnroll() : LoopPass(ID) {} /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much @@ -55,23 +56,24 @@ namespace { /// loop preheaders be inserted into the CFG... /// virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addRequired<LoopInfo>(); AU.addPreservedID(LCSSAID); - AU.addPreserved<LoopInfo>(); + AU.addPreserved<ScalarEvolution>(); // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next // loop will receive invalid dom info. // For now, recreate dom info, if loop is unrolled. AU.addPreserved<DominatorTree>(); - AU.addPreserved<DominanceFrontier>(); } }; } char LoopUnroll::ID = 0; -static RegisterPass<LoopUnroll> X("loop-unroll", "Unroll loops"); +INITIALIZE_PASS(LoopUnroll, "loop-unroll", "Unroll loops", false, false); Pass *llvm::createLoopUnrollPass() { return new LoopUnroll(); } @@ -145,12 +147,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; // FIXME: Reconstruct dom info, because it is not preserved properly. 
-  DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>();
-  if (DT) {
+  if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>())
     DT->runOnFunction(*F);
-    DominanceFrontier *DF = getAnalysisIfAvailable<DominanceFrontier>();
-    if (DF)
-      DF->runOnFunction(*F);
-  }
 
   return true;
 }
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 0c900ffc40277..9afe428ba5691 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -77,7 +77,6 @@ namespace {
     bool redoLoop;
 
     Loop *currentLoop;
-    DominanceFrontier *DF;
     DominatorTree *DT;
     BasicBlock *loopHeader;
     BasicBlock *loopPreheader;
@@ -92,15 +91,15 @@ namespace {
   public:
     static char ID; // Pass ID, replacement for typeid
     explicit LoopUnswitch(bool Os = false) :
-      LoopPass(&ID), OptimizeForSize(Os), redoLoop(false),
-      currentLoop(NULL), DF(NULL), DT(NULL), loopHeader(NULL),
+      LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
+      currentLoop(NULL), DT(NULL), loopHeader(NULL),
       loopPreheader(NULL) {}
 
     bool runOnLoop(Loop *L, LPPassManager &LPM);
     bool processCurrentLoop();
 
     /// This transformation requires natural loop information & requires that
-    /// loop preheaders be inserted into the CFG...
+    /// loop preheaders be inserted into the CFG.
    ///
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addRequiredID(LoopSimplifyID);
@@ -110,7 +109,6 @@ namespace {
       AU.addRequiredID(LCSSAID);
       AU.addPreservedID(LCSSAID);
       AU.addPreserved<DominatorTree>();
-      AU.addPreserved<DominanceFrontier>();
     }
 
   private:
@@ -160,7 +158,7 @@ namespace {
   };
 }
 char LoopUnswitch::ID = 0;
-static RegisterPass<LoopUnswitch> X("loop-unswitch", "Unswitch loops");
+INITIALIZE_PASS(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false);
 
 Pass *llvm::createLoopUnswitchPass(bool Os) {
   return new LoopUnswitch(Os);
@@ -201,7 +199,6 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
 bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
   LI = &getAnalysis<LoopInfo>();
   LPM = &LPM_Ref;
-  DF = getAnalysisIfAvailable<DominanceFrontier>();
   DT = getAnalysisIfAvailable<DominatorTree>();
   currentLoop = L;
   Function *F = currentLoop->getHeader()->getParent();
@@ -216,8 +213,6 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
     // FIXME: Reconstruct dom info, because it is not preserved properly.
     if (DT)
       DT->runOnFunction(*F);
-    if (DF)
-      DF->runOnFunction(*F);
   }
   return Changed;
 }
@@ -282,19 +277,18 @@ bool LoopUnswitch::processCurrentLoop() {
   return Changed;
 }
 
-/// isTrivialLoopExitBlock - Check to see if all paths from BB either:
-///   1. Exit the loop with no side effects.
-///   2. Branch to the latch block with no side-effects.
+/// isTrivialLoopExitBlock - Check to see if all paths from BB exit the
+/// loop with no side effects (including infinite loops).
 ///
-/// If these conditions are true, we return true and set ExitBB to the block we
+/// If so, we return true and set ExitBB to the block we
 /// exit through.
 ///
 static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
                                          BasicBlock *&ExitBB,
                                          std::set<BasicBlock*> &Visited) {
   if (!Visited.insert(BB).second) {
-    // Already visited and Ok, end of recursion.
-    return true;
+    // Already visited. Without more analysis, this could indicate an infinite loop.
+    return false;
   } else if (!L->contains(BB)) {
     // Otherwise, this is a loop exit, this is fine so long as this is the
     // first exit.
@@ -324,7 +318,7 @@
 /// process.
If so, return the block that is exited to, otherwise return null. static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { std::set<BasicBlock*> Visited; - Visited.insert(L->getHeader()); // Branches to header are ok. + Visited.insert(L->getHeader()); // Branches to header make infinite loops. BasicBlock *ExitBB = 0; if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited)) return ExitBB; @@ -356,8 +350,8 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, if (!BI->isConditional() || BI->getCondition() != Cond) return false; - // Check to see if a successor of the branch is guaranteed to go to the - // latch block or exit through a one exit block without having any + // Check to see if a successor of the branch is guaranteed to + // exit through a unique exit block without having any // side-effects. If so, determine the value of Cond that causes it to do // this. if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp new file mode 100644 index 0000000000000..973ffe7e6a40f --- /dev/null +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -0,0 +1,161 @@ +//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers atomic intrinsics to non-atomic form for use in a known +// non-preemptible environment. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loweratomic" +#include "llvm/Transforms/Scalar.h" +#include "llvm/BasicBlock.h" +#include "llvm/Function.h" +#include "llvm/Instruction.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/IRBuilder.h" + +using namespace llvm; + +namespace { + +bool LowerAtomicIntrinsic(CallInst *CI) { + IRBuilder<> Builder(CI->getParent(), CI); + + Function *Callee = CI->getCalledFunction(); + if (!Callee) + return false; + + unsigned IID = Callee->getIntrinsicID(); + switch (IID) { + case Intrinsic::memory_barrier: + break; + + case Intrinsic::atomic_load_add: + case Intrinsic::atomic_load_sub: + case Intrinsic::atomic_load_and: + case Intrinsic::atomic_load_nand: + case Intrinsic::atomic_load_or: + case Intrinsic::atomic_load_xor: + case Intrinsic::atomic_load_max: + case Intrinsic::atomic_load_min: + case Intrinsic::atomic_load_umax: + case Intrinsic::atomic_load_umin: { + Value *Ptr = CI->getArgOperand(0); + Value *Delta = CI->getArgOperand(1); + + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Res = NULL; + switch (IID) { + default: assert(0 && "Unrecognized atomic modify operation"); + case Intrinsic::atomic_load_add: + Res = Builder.CreateAdd(Orig, Delta); + break; + case Intrinsic::atomic_load_sub: + Res = Builder.CreateSub(Orig, Delta); + break; + case Intrinsic::atomic_load_and: + Res = Builder.CreateAnd(Orig, Delta); + break; + case Intrinsic::atomic_load_nand: + Res = Builder.CreateNot(Builder.CreateAnd(Orig, Delta)); + break; + case Intrinsic::atomic_load_or: + Res = Builder.CreateOr(Orig, Delta); + break; + case Intrinsic::atomic_load_xor: + Res = Builder.CreateXor(Orig, Delta); + break; + case Intrinsic::atomic_load_max: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), + Delta, + Orig); + break; + case 
Intrinsic::atomic_load_min: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), + Orig, + Delta); + break; + case Intrinsic::atomic_load_umax: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), + Delta, + Orig); + break; + case Intrinsic::atomic_load_umin: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), + Orig, + Delta); + break; + } + Builder.CreateStore(Res, Ptr); + + CI->replaceAllUsesWith(Orig); + break; + } + + case Intrinsic::atomic_swap: { + Value *Ptr = CI->getArgOperand(0); + Value *Val = CI->getArgOperand(1); + + LoadInst *Orig = Builder.CreateLoad(Ptr); + Builder.CreateStore(Val, Ptr); + + CI->replaceAllUsesWith(Orig); + break; + } + + case Intrinsic::atomic_cmp_swap: { + Value *Ptr = CI->getArgOperand(0); + Value *Cmp = CI->getArgOperand(1); + Value *Val = CI->getArgOperand(2); + + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); + Value *Res = Builder.CreateSelect(Equal, Val, Orig); + Builder.CreateStore(Res, Ptr); + + CI->replaceAllUsesWith(Orig); + break; + } + + default: + return false; + } + + assert(CI->use_empty() && + "Lowering should have eliminated any uses of the intrinsic call!"); + CI->eraseFromParent(); + + return true; +} + +struct LowerAtomic : public BasicBlockPass { + static char ID; + LowerAtomic() : BasicBlockPass(ID) {} + bool runOnBasicBlock(BasicBlock &BB) { + bool Changed = false; + for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { + Instruction *Inst = DI++; + if (CallInst *CI = dyn_cast<CallInst>(Inst)) + Changed |= LowerAtomicIntrinsic(CI); + } + return Changed; + } + +}; + +} + +char LowerAtomic::ID = 0; +INITIALIZE_PASS(LowerAtomic, "loweratomic", + "Lower atomic intrinsics to non-atomic form", + false, false); + +Pass *llvm::createLowerAtomicPass() { return new LowerAtomic(); } diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 0e566c5bd9be2..24fae423d2f70 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -304,7 +304,7 @@ namespace { bool runOnFunction(Function &F); public: static char ID; // Pass identification, replacement for typeid - MemCpyOpt() : FunctionPass(&ID) {} + MemCpyOpt() : FunctionPass(ID) {} private: // This transformation requires dominator postdominator info @@ -331,8 +331,7 @@ namespace { // createMemCpyOptPass - The public interface to this file... FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } -static RegisterPass<MemCpyOpt> X("memcpyopt", - "MemCpy Optimization"); +INITIALIZE_PASS(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false); @@ -374,7 +373,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // If the call is readnone, ignore it, otherwise bail out. We don't even // allow readonly here because we don't want something like: // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A). - if (AA.getModRefBehavior(CallSite::get(BI)) == + if (AA.getModRefBehavior(CallSite(BI)) == AliasAnalysis::DoesNotAccessMemory) continue; @@ -509,7 +508,7 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) { // because we'll need to do type comparisons based on the underlying type. Value *cpyDest = cpy->getDest(); Value *cpySrc = cpy->getSource(); - CallSite CS = CallSite::get(C); + CallSite CS(C); // We need to be able to reason about the size of the memcpy, so we require // that it be a constant. 
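// For context, the transformation performCallSlotOptzn is after, sketched in
// plain C++ (illustrative only; fillBuffer is a hypothetical callee): when a
// call fills a temporary whose only purpose is to be memcpy'd into Dest, the
// call can often be redirected to write Dest directly. The constant-size
// requirement noted above is what lets the pass prove the copy covers the
// whole temporary.
#include <cstring>
void fillBuffer(char *Out); // hypothetical
static void beforeCallSlotOptzn(char *Dest) {
  char Tmp[64];
  fillBuffer(Tmp);
  std::memcpy(Dest, Tmp, 64); // constant length, as the check requires
}
static void afterCallSlotOptzn(char *Dest) {
  fillBuffer(Dest);           // the temporary and the copy disappear
}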
@@ -637,10 +636,11 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) { return true; } -/// processMemCpy - perform simplication of memcpy's. If we have memcpy A which -/// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be -/// a memcpy from X to Z (or potentially a memmove, depending on circumstances). -/// This allows later passes to remove the first memcpy altogether. +/// processMemCpy - perform simplification of memcpy's. If we have memcpy A +/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite +/// B to be a memcpy from X to Z (or potentially a memmove, depending on +/// circumstances). This allows later passes to remove the first memcpy +/// altogether. bool MemCpyOpt::processMemCpy(MemCpyInst *M) { MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>(); @@ -744,7 +744,8 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { const Type *ArgTys[3] = { M->getRawDest()->getType(), M->getRawSource()->getType(), M->getLength()->getType() }; - M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, ArgTys, 3)); + M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, + ArgTys, 3)); // MemDep may have over conservative information about this instruction, just // conservatively flush it from the cache. diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 98452f5d82c47..b8afcc12d927d 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -77,7 +77,7 @@ namespace { bool MadeChange; public: static char ID; // Pass identification, replacement for typeid - Reassociate() : FunctionPass(&ID) {} + Reassociate() : FunctionPass(ID) {} bool runOnFunction(Function &F); @@ -103,7 +103,8 @@ namespace { } char Reassociate::ID = 0; -static RegisterPass<Reassociate> X("reassociate", "Reassociate expressions"); +INITIALIZE_PASS(Reassociate, "reassociate", + "Reassociate expressions", false, false); // Public interface to the Reassociate pass FunctionPass *llvm::createReassociatePass() { return new Reassociate(); } diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index 13222ac22004e..506b72ac34e0d 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -36,7 +36,7 @@ STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted"); namespace { struct RegToMem : public FunctionPass { static char ID; // Pass identification, replacement for typeid - RegToMem() : FunctionPass(&ID) {} + RegToMem() : FunctionPass(ID) {} virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredID(BreakCriticalEdgesID); @@ -59,8 +59,8 @@ namespace { } char RegToMem::ID = 0; -static RegisterPass<RegToMem> -X("reg2mem", "Demote all values to stack slots"); +INITIALIZE_PASS(RegToMem, "reg2mem", "Demote all values to stack slots", + false, false); bool RegToMem::runOnFunction(Function &F) { @@ -124,7 +124,7 @@ bool RegToMem::runOnFunction(Function &F) { // createDemoteRegisterToMemory - Provide an entry point to create this pass. 
// -const PassInfo *const llvm::DemoteRegisterToMemoryID = &X; +char &llvm::DemoteRegisterToMemoryID = RegToMem::ID; FunctionPass *llvm::createDemoteRegisterToMemoryPass() { return new RegToMem(); } diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 907ece8fcce97..6115c05c20ac4 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -275,12 +275,12 @@ public: return I->second; } - LatticeVal getStructLatticeValueFor(Value *V, unsigned i) const { + /*LatticeVal getStructLatticeValueFor(Value *V, unsigned i) const { DenseMap<std::pair<Value*, unsigned>, LatticeVal>::const_iterator I = StructValueState.find(std::make_pair(V, i)); assert(I != StructValueState.end() && "V is not in valuemap!"); return I->second; - } + }*/ /// getTrackedRetVals - Get the inferred return value map. /// @@ -508,17 +508,16 @@ private: void visitLoadInst (LoadInst &I); void visitGetElementPtrInst(GetElementPtrInst &I); void visitCallInst (CallInst &I) { - visitCallSite(CallSite::get(&I)); + visitCallSite(&I); } void visitInvokeInst (InvokeInst &II) { - visitCallSite(CallSite::get(&II)); + visitCallSite(&II); visitTerminatorInst(II); } void visitCallSite (CallSite CS); void visitUnwindInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } void visitAllocaInst (Instruction &I) { markOverdefined(&I); } - void visitVANextInst (Instruction &I) { markOverdefined(&I); } void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); } void visitInstruction(Instruction &I) { @@ -1586,7 +1585,7 @@ namespace { /// struct SCCP : public FunctionPass { static char ID; // Pass identification, replacement for typeid - SCCP() : FunctionPass(&ID) {} + SCCP() : FunctionPass(ID) {} // runOnFunction - Run the Sparse Conditional Constant Propagation // algorithm, and return true if the function was modified. @@ -1600,8 +1599,8 @@ namespace { } // end anonymous namespace char SCCP::ID = 0; -static RegisterPass<SCCP> -X("sccp", "Sparse Conditional Constant Propagation"); +INITIALIZE_PASS(SCCP, "sccp", + "Sparse Conditional Constant Propagation", false, false); // createSCCPPass - This is the public interface to this file. FunctionPass *llvm::createSCCPPass() { @@ -1702,14 +1701,15 @@ namespace { /// struct IPSCCP : public ModulePass { static char ID; - IPSCCP() : ModulePass(&ID) {} + IPSCCP() : ModulePass(ID) {} bool runOnModule(Module &M); }; } // end anonymous namespace char IPSCCP::ID = 0; -static RegisterPass<IPSCCP> -Y("ipsccp", "Interprocedural Sparse Conditional Constant Propagation"); +INITIALIZE_PASS(IPSCCP, "ipsccp", + "Interprocedural Sparse Conditional Constant Propagation", + false, false); // createIPSCCPPass - This is the public interface to this file. ModulePass *llvm::createIPSCCPPass() { @@ -1748,6 +1748,13 @@ static bool AddressIsTaken(const GlobalValue *GV) { bool IPSCCP::runOnModule(Module &M) { SCCPSolver Solver(getAnalysisIfAvailable<TargetData>()); + // AddressTakenFunctions - This set keeps track of the address-taken functions + // that are in the input. As IPSCCP runs through and simplifies code, + // functions that were address taken can end up losing their + // address-taken-ness. Because of this, we keep track of their addresses from + // the first pass so we can use them for the later simplification pass. + SmallPtrSet<Function*, 32> AddressTakenFunctions; + // Loop over all functions, marking arguments to those with their addresses // taken or that are external as overdefined. 
// @@ -1763,9 +1770,13 @@ bool IPSCCP::runOnModule(Module &M) { // If this function only has direct calls that we can see, we can track its // arguments and return value aggressively, and can assume it is not called // unless we see evidence to the contrary. - if (F->hasLocalLinkage() && !AddressIsTaken(F)) { - Solver.AddArgumentTrackedFunction(F); - continue; + if (F->hasLocalLinkage()) { + if (AddressIsTaken(F)) + AddressTakenFunctions.insert(F); + else { + Solver.AddArgumentTrackedFunction(F); + continue; + } } // Assume the function is called. @@ -1950,7 +1961,7 @@ bool IPSCCP::runOnModule(Module &M) { continue; // We can only do this if we know that nothing else can call the function. - if (!F->hasLocalLinkage() || AddressIsTaken(F)) + if (!F->hasLocalLinkage() || AddressTakenFunctions.count(F)) continue; for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index dd445f63320a4..fee317dbd9ab5 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -28,6 +28,7 @@ #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" +#include "llvm/Module.h" #include "llvm/Pass.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Target/TargetData.h" @@ -51,7 +52,7 @@ STATISTIC(NumGlobals, "Number of allocas copied from constant global"); namespace { struct SROA : public FunctionPass { static char ID; // Pass identification, replacement for typeid - explicit SROA(signed T = -1) : FunctionPass(&ID) { + explicit SROA(signed T = -1) : FunctionPass(ID) { if (T == -1) SRThreshold = 128; else @@ -114,8 +115,7 @@ namespace { void DoScalarReplacement(AllocaInst *AI, std::vector<AllocaInst*> &WorkList); void DeleteDeadInstructions(); - AllocaInst *AddNewAlloca(Function &F, const Type *Ty, AllocaInst *Base); - + void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, SmallVector<AllocaInst*, 32> &NewElts); void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, @@ -135,7 +135,8 @@ namespace { } char SROA::ID = 0; -static RegisterPass<SROA> X("scalarrepl", "Scalar Replacement of Aggregates"); +INITIALIZE_PASS(SROA, "scalarrepl", + "Scalar Replacement of Aggregates", false, false); // Public interface to the ScalarReplAggregates pass FunctionPass *llvm::createScalarReplAggregatesPass(signed int Threshold) { @@ -193,6 +194,27 @@ private: }; } // end anonymous namespace. + +/// IsVerbotenVectorType - Return true if this is a vector type ScalarRepl isn't +/// allowed to form. We do this to avoid MMX types, which is a complete hack, +/// but is required until the backend is fixed. +static bool IsVerbotenVectorType(const VectorType *VTy, const Instruction *I) { + StringRef Triple(I->getParent()->getParent()->getParent()->getTargetTriple()); + if (!Triple.startswith("i386") && + !Triple.startswith("x86_64")) + return false; + + // Reject all the MMX vector types. + switch (VTy->getNumElements()) { + default: return false; + case 1: return VTy->getElementType()->isIntegerTy(64); + case 2: return VTy->getElementType()->isIntegerTy(32); + case 4: return VTy->getElementType()->isIntegerTy(16); + case 8: return VTy->getElementType()->isIntegerTy(8); + } +} + + /// TryConvert - Analyze the specified alloca, and if it is safe to do so, /// rewrite it to be a new alloca which is mem2reg'able. This returns the new /// alloca if possible or null if not. 
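The IsVerbotenVectorType predicate added above keys off the element count and element width that together make a 64-bit MMX vector. Here is a dependency-free sketch of the same classification, with a (element count, element bit width) pair standing in for the VectorType query; isMMXShape is a hypothetical name:

    #include <cassert>

    // True for exactly the 64-bit MMX shapes scalarrepl must now avoid
    // forming on x86: <1 x i64>, <2 x i32>, <4 x i16>, <8 x i8>.
    static bool isMMXShape(unsigned NumElts, unsigned EltBits) {
      switch (NumElts) {
      default: return false;
      case 1: return EltBits == 64;
      case 2: return EltBits == 32;
      case 4: return EltBits == 16;
      case 8: return EltBits == 8;
      }
    }

    int main() {
      assert(isMMXShape(2, 32));   // <2 x i32>: MMX-sized, rejected.
      assert(!isMMXShape(4, 32));  // <4 x i32>: SSE-sized, still allowed.
      assert(!isMMXShape(1, 32));  // <1 x i32>: not an MMX shape.
      return 0;
    }

Note that the check only fires when the module's target triple starts with i386 or x86_64; other targets keep forming these vector types.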
@@ -209,7 +231,8 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // we just get a lot of insert/extracts. If at least one vector is // involved, then we probably really do have a union of vector/array. const Type *NewTy; - if (VectorTy && VectorTy->isVectorTy() && HadAVector) { + if (VectorTy && VectorTy->isVectorTy() && HadAVector && + !IsVerbotenVectorType(cast<VectorType>(VectorTy), AI)) { DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = " << *VectorTy << '\n'); NewTy = VectorTy; // Use the vector type. @@ -969,7 +992,7 @@ void SROA::isSafeForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); if (Length) isSafeMemAccess(AI, Offset, Length->getZExtValue(), 0, - UI.getOperandNo() == CallInst::ArgOffset, Info); + UI.getOperandNo() == 0, Info); else MarkUnsafe(Info); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { @@ -1662,6 +1685,12 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, /// HasPadding - Return true if the specified type has any structure or /// alignment padding, false otherwise. static bool HasPadding(const Type *Ty, const TargetData &TD) { + if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) + return HasPadding(ATy->getElementType(), TD); + + if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) + return HasPadding(VTy->getElementType(), TD); + if (const StructType *STy = dyn_cast<StructType>(Ty)) { const StructLayout *SL = TD.getStructLayout(STy); unsigned PrevFieldBitOffset = 0; @@ -1691,12 +1720,8 @@ static bool HasPadding(const Type *Ty, const TargetData &TD) { if (PrevFieldEnd < SL->getSizeInBits()) return true; } - - } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { - return HasPadding(ATy->getElementType(), TD); - } else if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) { - return HasPadding(VTy->getElementType(), TD); } + return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty); } @@ -1787,7 +1812,7 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, if (isOffset) return false; // If the memintrinsic isn't using the alloca as the dest, reject it. - if (UI.getOperandNo() != CallInst::ArgOffset) return false; + if (UI.getOperandNo() != 0) return false; // If the source of the memcpy/move is not a constant global, reject it. if (!PointsToConstantGlobal(MI->getSource())) diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 49d93a2fcc271..360749caf1116 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -42,14 +42,15 @@ STATISTIC(NumSimpl, "Number of blocks simplified"); namespace { struct CFGSimplifyPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid - CFGSimplifyPass() : FunctionPass(&ID) {} + CFGSimplifyPass() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F); }; } char CFGSimplifyPass::ID = 0; -static RegisterPass<CFGSimplifyPass> X("simplifycfg", "Simplify the CFG"); +INITIALIZE_PASS(CFGSimplifyPass, "simplifycfg", + "Simplify the CFG", false, false); // Public interface to the CFGSimplification pass FunctionPass *llvm::createCFGSimplificationPass() { @@ -284,10 +285,9 @@ static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) { while (LocalChange) { LocalChange = false; - // Loop over all of the basic blocks (except the first one) and remove them - // if they are unneeded... 
+ // Loop over all of the basic blocks and remove them if they are unneeded... // - for (Function::iterator BBIt = ++F.begin(); BBIt != F.end(); ) { + for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { if (SimplifyCFG(BBIt++, TD)) { LocalChange = true; ++NumSimpl; diff --git a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp index c3408e77807fb..3ec70ec2e024f 100644 --- a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp @@ -32,7 +32,7 @@ namespace { const TargetData *TD; public: static char ID; // Pass identification - SimplifyHalfPowrLibCalls() : FunctionPass(&ID) {} + SimplifyHalfPowrLibCalls() : FunctionPass(ID) {} bool runOnFunction(Function &F); @@ -46,8 +46,8 @@ namespace { char SimplifyHalfPowrLibCalls::ID = 0; } // end anonymous namespace. -static RegisterPass<SimplifyHalfPowrLibCalls> -X("simplify-libcalls-halfpowr", "Simplify half_powr library calls"); +INITIALIZE_PASS(SimplifyHalfPowrLibCalls, "simplify-libcalls-halfpowr", + "Simplify half_powr library calls", false, false); // Public interface to the Simplify HalfPowr LibCalls pass. FunctionPass *llvm::createSimplifyHalfPowrLibCallsPass() { diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index b1c619125c355..d7ce53f367153 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -532,7 +532,7 @@ struct StrStrOpt : public LibCallOptimization { StrLen, B, TD); for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end(); UI != UE; ) { - ICmpInst *Old = cast<ICmpInst>(UI++); + ICmpInst *Old = cast<ICmpInst>(*UI++); Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp, ConstantInt::getNullValue(StrNCmp->getType()), "cmp"); @@ -566,8 +566,8 @@ struct StrStrOpt : public LibCallOptimization { // fold strstr(x, "y") -> strchr(x, 'y'). if (HasStr2 && ToFindStr.size() == 1) - return B.CreateBitCast(EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD), - CI->getType()); + return B.CreateBitCast(EmitStrChr(CI->getArgOperand(0), + ToFindStr[0], B, TD), CI->getType()); return 0; } }; @@ -681,8 +681,8 @@ struct MemSetOpt : public LibCallOptimization { return 0; // memset(p, v, n) -> llvm.memset(p, v, n, 1) - Value *Val = B.CreateIntCast(CI->getArgOperand(1), Type::getInt8Ty(*Context), - false); + Value *Val = B.CreateIntCast(CI->getArgOperand(1), + Type::getInt8Ty(*Context), false); EmitMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), false, B, TD); return CI->getArgOperand(0); } @@ -1042,9 +1042,9 @@ struct SPrintFOpt : public LibCallOptimization { if (!TD) return 0; // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) - EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), // Copy the nul byte. - ConstantInt::get(TD->getIntPtrType(*Context), - FormatStr.size()+1), 1, false, B, TD); + EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), // Copy the + ConstantInt::get(TD->getIntPtrType(*Context), // nul byte. 
+ FormatStr.size() + 1), 1, false, B, TD); return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -1080,7 +1080,8 @@ struct SPrintFOpt : public LibCallOptimization { Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); - EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1, false, B, TD); + EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), + IncLen, 1, false, B, TD); // The sprintf result is the unincremented number of bytes in the string. return B.CreateIntCast(Len, CI->getType(), false); @@ -1236,7 +1237,7 @@ namespace { bool Modified; // This is only used by doInitialization. public: static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(&ID), StrCpy(false), StrCpyChk(true) {} + SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true) {} void InitOptimizations(); bool runOnFunction(Function &F); @@ -1253,8 +1254,8 @@ namespace { char SimplifyLibCalls::ID = 0; } // end anonymous namespace. -static RegisterPass<SimplifyLibCalls> -X("simplify-libcalls", "Simplify well-known library calls"); +INITIALIZE_PASS(SimplifyLibCalls, "simplify-libcalls", + "Simplify well-known library calls", false, false); // Public interface to the Simplify LibCalls pass. FunctionPass *llvm::createSimplifyLibCallsPass() { @@ -2155,7 +2156,7 @@ bool SimplifyLibCalls::doInitialization(Module &M) { // * pow(pow(x,y),z)-> pow(x,y*z) // // puts: -// * puts("") -> putchar("\n") +// * puts("") -> putchar('\n') // // round, roundf, roundl: // * round(cnst) -> cnst' diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index b88ba48505092..95d3dedfb62db 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -35,7 +35,7 @@ namespace { public: static char ID; // Pass identification - Sinking() : FunctionPass(&ID) {} + Sinking() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F); @@ -56,8 +56,7 @@ namespace { } // end anonymous namespace char Sinking::ID = 0; -static RegisterPass<Sinking> -X("sink", "Code sinking"); +INITIALIZE_PASS(Sinking, "sink", "Code sinking", false, false); FunctionPass *llvm::createSinkingPass() { return new Sinking(); } diff --git a/lib/Transforms/Scalar/TailDuplication.cpp b/lib/Transforms/Scalar/TailDuplication.cpp index 9208238f4ba5e..2e437ac778c8c 100644 --- a/lib/Transforms/Scalar/TailDuplication.cpp +++ b/lib/Transforms/Scalar/TailDuplication.cpp @@ -49,7 +49,7 @@ namespace { bool runOnFunction(Function &F); public: static char ID; // Pass identification, replacement for typeid - TailDup() : FunctionPass(&ID) {} + TailDup() : FunctionPass(ID) {} private: inline bool shouldEliminateUnconditionalBranch(TerminatorInst *, unsigned); @@ -59,7 +59,7 @@ namespace { } char TailDup::ID = 0; -static RegisterPass<TailDup> X("tailduplicate", "Tail Duplication"); +INITIALIZE_PASS(TailDup, "tailduplicate", "Tail Duplication", false, false); // Public interface to the Tail Duplication pass FunctionPass *llvm::createTailDuplicationPass() { return new TailDup(); } diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 01c8e5d6fcf48..371725467a24e 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -72,7 +72,7 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced"); namespace { struct TailCallElim : public FunctionPass { static char ID; // Pass identification, replacement for typeid - TailCallElim() : FunctionPass(&ID) 
{} + TailCallElim() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F); @@ -87,7 +87,8 @@ namespace { } char TailCallElim::ID = 0; -static RegisterPass<TailCallElim> X("tailcallelim", "Tail Call Elimination"); +INITIALIZE_PASS(TailCallElim, "tailcallelim", + "Tail Call Elimination", false, false); // Public interface to the TailCallElimination pass FunctionPass *llvm::createTailCallEliminationPass() { @@ -277,22 +278,22 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { Function *F = CI->getParent()->getParent(); Value *ReturnedValue = 0; - for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) - if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator())) - if (RI != IgnoreRI) { - Value *RetOp = RI->getOperand(0); - - // We can only perform this transformation if the value returned is - // evaluatable at the start of the initial invocation of the function, - // instead of at the end of the evaluation. - // - if (!isDynamicConstant(RetOp, CI, RI)) - return 0; - - if (ReturnedValue && RetOp != ReturnedValue) - return 0; // Cannot transform if differing values are returned. - ReturnedValue = RetOp; - } + for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) { + ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()); + if (RI == 0 || RI == IgnoreRI) continue; + + // We can only perform this transformation if the value returned is + // evaluatable at the start of the initial invocation of the function, + // instead of at the end of the evaluation. + // + Value *RetOp = RI->getOperand(0); + if (!isDynamicConstant(RetOp, CI, RI)) + return 0; + + if (ReturnedValue && RetOp != ReturnedValue) + return 0; // Cannot transform if differing values are returned. + ReturnedValue = RetOp; + } return ReturnedValue; } @@ -306,7 +307,7 @@ Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I, assert(I->getNumOperands() == 2 && "Associative/commutative operations should have 2 args!"); - // Exactly one operand should be the result of the call instruction... + // Exactly one operand should be the result of the call instruction. if ((I->getOperand(0) == CI && I->getOperand(1) == CI) || (I->getOperand(0) != CI && I->getOperand(1) != CI)) return 0; @@ -386,21 +387,22 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, // tail call if all of the instructions between the call and the return are // movable to above the call itself, leaving the call next to the return. // Check that this is the case now. - for (BBI = CI, ++BBI; &*BBI != Ret; ++BBI) - if (!CanMoveAboveCall(BBI, CI)) { - // If we can't move the instruction above the call, it might be because it - // is an associative and commutative operation that could be tranformed - // using accumulator recursion elimination. Check to see if this is the - // case, and if so, remember the initial accumulator value for later. - if ((AccumulatorRecursionEliminationInitVal = - CanTransformAccumulatorRecursion(BBI, CI))) { - // Yes, this is accumulator recursion. Remember which instruction - // accumulates. - AccumulatorRecursionInstr = BBI; - } else { - return false; // Otherwise, we cannot eliminate the tail recursion! - } + for (BBI = CI, ++BBI; &*BBI != Ret; ++BBI) { + if (CanMoveAboveCall(BBI, CI)) continue; + + // If we can't move the instruction above the call, it might be because it + // is an associative and commutative operation that could be tranformed + // using accumulator recursion elimination. 
Check to see if this is the + // case, and if so, remember the initial accumulator value for later. + if ((AccumulatorRecursionEliminationInitVal = + CanTransformAccumulatorRecursion(BBI, CI))) { + // Yes, this is accumulator recursion. Remember which instruction + // accumulates. + AccumulatorRecursionInstr = BBI; + } else { + return false; // Otherwise, we cannot eliminate the tail recursion! } + } // We can only transform call/return pairs that either ignore the return value // of the call and return void, ignore the value of the call and return a diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index ec625b4cbb28f..093083a630cf3 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -97,23 +97,13 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB) { /// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor, /// if possible. The return value indicates success or failure. bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { - pred_iterator PI(pred_begin(BB)), PE(pred_end(BB)); - // Can't merge the entry block. Don't merge away blocks who have their - // address taken: this is a bug if the predecessor block is the entry node - // (because we'd end up taking the address of the entry) and undesirable in - // any case. - if (pred_begin(BB) == pred_end(BB) || - BB->hasAddressTaken()) return false; + // Don't merge away blocks who have their address taken. + if (BB->hasAddressTaken()) return false; - BasicBlock *PredBB = *PI++; - for (; PI != PE; ++PI) // Search all predecessors, see if they are all same - if (*PI != PredBB) { - PredBB = 0; // There are multiple different predecessors... - break; - } - - // Can't merge if there are multiple predecessors. + // Can't merge if there are multiple predecessors, or no predecessors. + BasicBlock *PredBB = BB->getUniquePredecessor(); if (!PredBB) return false; + // Don't break self-loops. if (PredBB == BB) return false; // Don't break invokes. @@ -267,7 +257,7 @@ void llvm::RemoveSuccessor(TerminatorInst *TI, unsigned SuccNum) { case Instruction::Switch: // Should remove entry default: case Instruction::Ret: // Cannot happen, has no successors! - llvm_unreachable("Unhandled terminator instruction type in RemoveSuccessor!"); + llvm_unreachable("Unhandled terminator inst type in RemoveSuccessor!"); } if (NewTI) // If it's a different instruction, replace. @@ -421,7 +411,8 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, DominatorTree *DT = P ? P->getAnalysisIfAvailable<DominatorTree>() : 0; if (DT) DT->splitBlock(NewBB); - if (DominanceFrontier *DF = P ? P->getAnalysisIfAvailable<DominanceFrontier>():0) + if (DominanceFrontier *DF = + P ? 
P->getAnalysisIfAvailable<DominanceFrontier>() : 0) DF->splitBlock(NewBB); // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI diff --git a/lib/Transforms/Utils/BasicInliner.cpp b/lib/Transforms/Utils/BasicInliner.cpp index f0e31efa30c43..23a30cc585077 100644 --- a/lib/Transforms/Utils/BasicInliner.cpp +++ b/lib/Transforms/Utils/BasicInliner.cpp @@ -82,8 +82,8 @@ void BasicInlinerImpl::inlineFunctions() { Function *F = *FI; for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { - CallSite CS = CallSite::get(I); - if (CS.getInstruction() && CS.getCalledFunction() + CallSite CS(cast<Value>(I)); + if (CS && CS.getCalledFunction() && !CS.getCalledFunction()->isDeclaration()) CallSites.push_back(CS); } diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index 26f53c05a042f..f75ffe6105fa6 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -36,7 +36,7 @@ STATISTIC(NumBroken, "Number of blocks inserted"); namespace { struct BreakCriticalEdges : public FunctionPass { static char ID; // Pass identification, replacement for typeid - BreakCriticalEdges() : FunctionPass(&ID) {} + BreakCriticalEdges() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F); @@ -53,11 +53,11 @@ namespace { } char BreakCriticalEdges::ID = 0; -static RegisterPass<BreakCriticalEdges> -X("break-crit-edges", "Break critical edges in CFG"); +INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges", + "Break critical edges in CFG", false, false); // Publically exposed interface to pass... -const PassInfo *const llvm::BreakCriticalEdgesID = &X; +char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID; FunctionPass *llvm::createBreakCriticalEdgesPass() { return new BreakCriticalEdges(); } @@ -225,7 +225,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, for (Value::use_iterator UI = TIBB->use_begin(), E = TIBB->use_end(); UI != E; ) { Value::use_iterator Use = UI++; - if (PHINode *PN = dyn_cast<PHINode>(Use)) { + if (PHINode *PN = dyn_cast<PHINode>(*Use)) { // Remove one entry from each PHI. 
if (PN->getParent() == DestBB && UpdatedPHIs.insert(PN)) PN->setOperand(Use.getOperandNo(), NewBB); diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index 7a9d007ed5583..c3139498c2504 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -421,9 +421,9 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { FT->getParamType(3) != TD->getIntPtrType(Context)) return false; - if (isFoldable(3 + CallInst::ArgOffset, 2 + CallInst::ArgOffset, false)) { - EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), - 1, false, B, TD); + if (isFoldable(3, 2, false)) { + EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1, false, B, TD); replaceCall(CI->getArgOperand(0)); return true; } @@ -444,9 +444,9 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { FT->getParamType(3) != TD->getIntPtrType(Context)) return false; - if (isFoldable(3 + CallInst::ArgOffset, 2 + CallInst::ArgOffset, false)) { - EmitMemMove(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), - 1, false, B, TD); + if (isFoldable(3, 2, false)) { + EmitMemMove(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1, false, B, TD); replaceCall(CI->getArgOperand(0)); return true; } @@ -462,10 +462,11 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { FT->getParamType(3) != TD->getIntPtrType(Context)) return false; - if (isFoldable(3 + CallInst::ArgOffset, 2 + CallInst::ArgOffset, false)) { + if (isFoldable(3, 2, false)) { Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - EmitMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), false, B, TD); + EmitMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), + false, B, TD); replaceCall(CI->getArgOperand(0)); return true; } @@ -487,7 +488,7 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { // st[rp]cpy_chk call which may fail at runtime if the size is too long. // TODO: It might be nice to get a maximum length out of the possible // string lengths for varying. 
- if (isFoldable(2 + CallInst::ArgOffset, 1 + CallInst::ArgOffset, true)) { + if (isFoldable(2, 1, true)) { Value *Ret = EmitStrCpy(CI->getArgOperand(0), CI->getArgOperand(1), B, TD, Name.substr(2, 6)); replaceCall(Ret); @@ -505,7 +506,7 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { FT->getParamType(3) != TD->getIntPtrType(Context)) return false; - if (isFoldable(3 + CallInst::ArgOffset, 2 + CallInst::ArgOffset, false)) { + if (isFoldable(3, 2, false)) { Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), B, TD, Name.substr(2, 7)); replaceCall(Ret); diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index dec227acafd27..61cbeb2bd35b9 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -20,7 +20,6 @@ add_llvm_library(LLVMTransformUtils Mem2Reg.cpp PromoteMemoryToRegister.cpp SSAUpdater.cpp - SSI.cpp SimplifyCFG.cpp UnifyFunctionExitNodes.cpp ValueMapper.cpp diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 1dcfd57878466..f43186edae435 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -23,7 +23,7 @@ #include "llvm/LLVMContext.h" #include "llvm/Metadata.h" #include "llvm/Support/CFG.h" -#include "ValueMapper.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/DebugInfo.h" #include "llvm/ADT/SmallVector.h" @@ -69,10 +69,11 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, } // Clone OldFunc into NewFunc, transforming the old arguments into references to -// ArgMap values. +// VMap values. // void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, + bool ModuleLevelChanges, SmallVectorImpl<ReturnInst*> &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo) { assert(NameSuffix && "NameSuffix cannot be null!"); @@ -126,7 +127,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, BE = NewFunc->end(); BB != BE; ++BB) // Loop over all instructions, fixing each one as we find it... for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II) - RemapInstruction(II, VMap); + RemapInstruction(II, VMap, ModuleLevelChanges); } /// CloneFunction - Return a copy of the specified function, but without @@ -139,6 +140,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, /// Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, + bool ModuleLevelChanges, ClonedCodeInfo *CodeInfo) { std::vector<const Type*> ArgTypes; @@ -167,7 +169,7 @@ Function *llvm::CloneFunction(const Function *F, } SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. 
- CloneFunctionInto(NewF, F, VMap, Returns, "", CodeInfo); + CloneFunctionInto(NewF, F, VMap, ModuleLevelChanges, Returns, "", CodeInfo); return NewF; } @@ -180,6 +182,7 @@ namespace { Function *NewFunc; const Function *OldFunc; ValueToValueMapTy &VMap; + bool ModuleLevelChanges; SmallVectorImpl<ReturnInst*> &Returns; const char *NameSuffix; ClonedCodeInfo *CodeInfo; @@ -187,12 +190,14 @@ namespace { public: PruningFunctionCloner(Function *newFunc, const Function *oldFunc, ValueToValueMapTy &valueMap, + bool moduleLevelChanges, SmallVectorImpl<ReturnInst*> &returns, const char *nameSuffix, ClonedCodeInfo *codeInfo, const TargetData *td) - : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), Returns(returns), - NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) { + : NewFunc(newFunc), OldFunc(oldFunc), + VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), + Returns(returns), NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) { } /// CloneBlock - The specified block is found to be reachable, clone it and @@ -313,7 +318,7 @@ ConstantFoldMappedInstruction(const Instruction *I) { SmallVector<Constant*, 8> Ops; for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) if (Constant *Op = dyn_cast_or_null<Constant>(MapValue(I->getOperand(i), - VMap))) + VMap, ModuleLevelChanges))) Ops.push_back(Op); else return 0; // All operands not constant! @@ -334,25 +339,16 @@ ConstantFoldMappedInstruction(const Instruction *I) { Ops.size(), TD); } -static MDNode *UpdateInlinedAtInfo(MDNode *InsnMD, MDNode *TheCallMD) { - DILocation ILoc(InsnMD); - if (!ILoc.Verify()) return InsnMD; +static DebugLoc +UpdateInlinedAtInfo(const DebugLoc &InsnDL, const DebugLoc &TheCallDL, + LLVMContext &Ctx) { + DebugLoc NewLoc = TheCallDL; + if (MDNode *IA = InsnDL.getInlinedAt(Ctx)) + NewLoc = UpdateInlinedAtInfo(DebugLoc::getFromDILocation(IA), TheCallDL, + Ctx); - DILocation CallLoc(TheCallMD); - if (!CallLoc.Verify()) return InsnMD; - - DILocation OrigLocation = ILoc.getOrigLocation(); - MDNode *NewLoc = TheCallMD; - if (OrigLocation.Verify()) - NewLoc = UpdateInlinedAtInfo(OrigLocation, TheCallMD); - - Value *MDVs[] = { - InsnMD->getOperand(0), // Line - InsnMD->getOperand(1), // Col - InsnMD->getOperand(2), // Scope - NewLoc - }; - return MDNode::get(InsnMD->getContext(), MDVs, 4); + return DebugLoc::get(InsnDL.getLine(), InsnDL.getCol(), + InsnDL.getScope(Ctx), NewLoc.getAsMDNode(Ctx)); } /// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto, @@ -364,6 +360,7 @@ static MDNode *UpdateInlinedAtInfo(MDNode *InsnMD, MDNode *TheCallMD) { /// used for things like CloneFunction or CloneModule. void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, + bool ModuleLevelChanges, SmallVectorImpl<ReturnInst*> &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, @@ -377,8 +374,8 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, assert(VMap.count(II) && "No mapping from source argument specified!"); #endif - PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, Returns, - NameSuffix, CodeInfo, TD); + PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges, + Returns, NameSuffix, CodeInfo, TD); // Clone the entry block, and anything recursively reachable from it. 
std::vector<const BasicBlock*> CloneWorklist; @@ -408,10 +405,9 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // BasicBlock::iterator I = NewBB->begin(); - unsigned DbgKind = OldFunc->getContext().getMDKindID("dbg"); - MDNode *TheCallMD = NULL; - if (TheCall && TheCall->hasMetadata()) - TheCallMD = TheCall->getMetadata(DbgKind); + DebugLoc TheCallDL; + if (TheCall) + TheCallDL = TheCall->getDebugLoc(); // Handle PHI nodes specially, as we have to remove references to dead // blocks. @@ -420,15 +416,17 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, BasicBlock::const_iterator OldI = BI->begin(); for (; (PN = dyn_cast<PHINode>(I)); ++I, ++OldI) { if (I->hasMetadata()) { - if (TheCallMD) { - if (MDNode *IMD = I->getMetadata(DbgKind)) { - MDNode *NewMD = UpdateInlinedAtInfo(IMD, TheCallMD); - I->setMetadata(DbgKind, NewMD); + if (!TheCallDL.isUnknown()) { + DebugLoc IDL = I->getDebugLoc(); + if (!IDL.isUnknown()) { + DebugLoc NewDL = UpdateInlinedAtInfo(IDL, TheCallDL, + I->getContext()); + I->setDebugLoc(NewDL); } } else { // The cloned instruction has dbg info but the call instruction // does not have dbg info. Remove dbg info from cloned instruction. - I->setMetadata(DbgKind, 0); + I->setDebugLoc(DebugLoc()); } } PHIToResolve.push_back(cast<PHINode>(OldI)); @@ -444,18 +442,20 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // Otherwise, remap the rest of the instructions normally. for (; I != NewBB->end(); ++I) { if (I->hasMetadata()) { - if (TheCallMD) { - if (MDNode *IMD = I->getMetadata(DbgKind)) { - MDNode *NewMD = UpdateInlinedAtInfo(IMD, TheCallMD); - I->setMetadata(DbgKind, NewMD); + if (!TheCallDL.isUnknown()) { + DebugLoc IDL = I->getDebugLoc(); + if (!IDL.isUnknown()) { + DebugLoc NewDL = UpdateInlinedAtInfo(IDL, TheCallDL, + I->getContext()); + I->setDebugLoc(NewDL); } } else { // The cloned instruction has dbg info but the call instruction // does not have dbg info. Remove dbg info from cloned instruction. - I->setMetadata(DbgKind, 0); + I->setDebugLoc(DebugLoc()); } } - RemapInstruction(I, VMap); + RemapInstruction(I, VMap, ModuleLevelChanges); } } @@ -477,7 +477,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, if (BasicBlock *MappedBlock = cast_or_null<BasicBlock>(VMap[PN->getIncomingBlock(pred)])) { Value *InVal = MapValue(PN->getIncomingValue(pred), - VMap); + VMap, ModuleLevelChanges); assert(InVal && "Unknown input value?"); PN->setIncomingValue(pred, InVal); PN->setIncomingBlock(pred, MappedBlock); diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index fc603d23e9ace..b347bf597f8ef 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -17,7 +17,7 @@ #include "llvm/DerivedTypes.h" #include "llvm/TypeSymbolTable.h" #include "llvm/Constant.h" -#include "ValueMapper.h" +#include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; /// CloneModule - Return an exact copy of the specified module. 
This is not as @@ -89,7 +89,8 @@ Module *llvm::CloneModule(const Module *M, GlobalVariable *GV = cast<GlobalVariable>(VMap[I]); if (I->hasInitializer()) GV->setInitializer(cast<Constant>(MapValue(I->getInitializer(), - VMap))); + VMap, + true))); GV->setLinkage(I->getLinkage()); GV->setThreadLocal(I->isThreadLocal()); GV->setConstant(I->isConstant()); @@ -108,7 +109,7 @@ Module *llvm::CloneModule(const Module *M, } SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. - CloneFunctionInto(F, I, VMap, Returns); + CloneFunctionInto(F, I, VMap, /*ModuleLevelChanges=*/true, Returns); } F->setLinkage(I->getLinkage()); @@ -120,34 +121,17 @@ Module *llvm::CloneModule(const Module *M, GlobalAlias *GA = cast<GlobalAlias>(VMap[I]); GA->setLinkage(I->getLinkage()); if (const Constant* C = I->getAliasee()) - GA->setAliasee(cast<Constant>(MapValue(C, VMap))); + GA->setAliasee(cast<Constant>(MapValue(C, VMap, true))); } // And named metadata.... for (Module::const_named_metadata_iterator I = M->named_metadata_begin(), E = M->named_metadata_end(); I != E; ++I) { const NamedMDNode &NMD = *I; - SmallVector<MDNode*, 4> MDs; + NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName()); for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i) - MDs.push_back(cast<MDNode>(MapValue(NMD.getOperand(i), VMap))); - NamedMDNode::Create(New->getContext(), NMD.getName(), - MDs.data(), MDs.size(), New); + NewNMD->addOperand(cast<MDNode>(MapValue(NMD.getOperand(i), VMap, true))); } - // Update metadata attach with instructions. - for (Module::iterator MI = New->begin(), ME = New->end(); MI != ME; ++MI) - for (Function::iterator FI = MI->begin(), FE = MI->end(); - FI != FE; ++FI) - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); - BI != BE; ++BI) { - SmallVector<std::pair<unsigned, MDNode *>, 4 > MDs; - BI->getAllMetadata(MDs); - for (SmallVector<std::pair<unsigned, MDNode *>, 4>::iterator - MDI = MDs.begin(), MDE = MDs.end(); MDI != MDE; ++MDI) { - Value *MappedValue = MapValue(MDI->second, VMap); - if (MDI->second != MappedValue && MappedValue) - BI->setMetadata(MDI->first, cast<MDNode>(MappedValue)); - } - } return New; } diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 598e7d29e3781..88979e862df26 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -215,12 +215,12 @@ static void UpdateCallGraphAfterInlining(CallSite CS, if (I->second->getFunction() == 0) if (Function *F = CallSite(NewCall).getCalledFunction()) { // Indirect call site resolved to direct call. - CallerNode->addCalledFunction(CallSite::get(NewCall), CG[F]); - + CallerNode->addCalledFunction(CallSite(NewCall), CG[F]); + continue; } - - CallerNode->addCalledFunction(CallSite::get(NewCall), I->second); + + CallerNode->addCalledFunction(CallSite(NewCall), I->second); } // Update the call graph by deleting the edge from Callee to Caller. We must @@ -365,7 +365,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { // have no dead or constant instructions leftover after inlining occurs // (which can happen, e.g., because an argument was constant), but we'll be // happy with whatever the cloner can do. - CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, Returns, ".i", + CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, + /*ModuleLevelChanges=*/false, Returns, ".i", &InlinedFunctionInfo, IFI.TD, TheCall); // Remember the first block that is newly cloned over. 
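The UpdateInlinedAtInfo rewrite in the CloneFunction.cpp hunks above replaces MDNode surgery with a recursion over DebugLoc: the instruction keeps its own line and column, and the location of the call being inlined is spliced onto the far end of its inlined-at chain (becoming the ultimate call site), so locations from earlier rounds of inlining stay intact. A dependency-free sketch of that recursion, using a hypothetical Loc type as a stand-in for DebugLoc and modern C++ for brevity:

    #include <cassert>
    #include <memory>

    struct Loc {                        // Hypothetical stand-in for DebugLoc.
      unsigned Line, Col;
      std::shared_ptr<Loc> InlinedAt;   // Null when not itself inlined.
    };

    // Keep InsnLoc's own line/col, but attach CallLoc at the end of its
    // inlined-at chain, mirroring the recursion through getInlinedAt().
    static Loc updateInlinedAt(const Loc &InsnLoc, const Loc &CallLoc) {
      Loc New = InsnLoc;
      if (InsnLoc.InlinedAt)
        New.InlinedAt = std::make_shared<Loc>(
            updateInlinedAt(*InsnLoc.InlinedAt, CallLoc));
      else
        New.InlinedAt = std::make_shared<Loc>(CallLoc);
      return New;
    }

    int main() {
      Loc Call = { 40, 3, nullptr };    // The call being inlined (caller).
      Loc Insn = { 7, 10, nullptr };    // An instruction in the callee.
      Loc New = updateInlinedAt(Insn, Call);
      assert(New.Line == 7 && New.InlinedAt->Line == 40);
      return 0;
    }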
diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp index 090af95c4b87b..5ca82996b42f4 100644 --- a/lib/Transforms/Utils/InstructionNamer.cpp +++ b/lib/Transforms/Utils/InstructionNamer.cpp @@ -23,7 +23,7 @@ using namespace llvm; namespace { struct InstNamer : public FunctionPass { static char ID; // Pass identification, replacement for typeid - InstNamer() : FunctionPass(&ID) {} + InstNamer() : FunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &Info) const { Info.setPreservesAll(); @@ -48,12 +48,12 @@ namespace { }; char InstNamer::ID = 0; - static RegisterPass<InstNamer> X("instnamer", - "Assign names to anonymous instructions"); + INITIALIZE_PASS(InstNamer, "instnamer", + "Assign names to anonymous instructions", false, false); } -const PassInfo *const llvm::InstructionNamerID = &X; +char &llvm::InstructionNamerID = InstNamer::ID; //===----------------------------------------------------------------------===// // // InstructionNamer - Give any unnamed non-void instructions "tmp" names. diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp index e90c30bba78e2..275b26508f991 100644 --- a/lib/Transforms/Utils/LCSSA.cpp +++ b/lib/Transforms/Utils/LCSSA.cpp @@ -47,7 +47,7 @@ STATISTIC(NumLCSSA, "Number of live out of a loop variables"); namespace { struct LCSSA : public LoopPass { static char ID; // Pass identification, replacement for typeid - LCSSA() : LoopPass(&ID) {} + LCSSA() : LoopPass(ID) {} // Cached analysis information for the current function. DominatorTree *DT; @@ -64,22 +64,13 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - // LCSSA doesn't actually require LoopSimplify, but the PassManager - // doesn't know how to schedule LoopSimplify by itself. - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredTransitive<LoopInfo>(); - AU.addPreserved<LoopInfo>(); - AU.addRequiredTransitive<DominatorTree>(); - AU.addPreserved<ScalarEvolution>(); + AU.addRequired<DominatorTree>(); AU.addPreserved<DominatorTree>(); - - // Request DominanceFrontier now, even though LCSSA does - // not use it. This allows Pass Manager to schedule Dominance - // Frontier early enough such that one LPPassManager can handle - // multiple loop transformation passes. - AU.addRequired<DominanceFrontier>(); AU.addPreserved<DominanceFrontier>(); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addPreservedID(LoopSimplifyID); + AU.addPreserved<ScalarEvolution>(); } private: bool ProcessInstruction(Instruction *Inst, @@ -99,10 +90,10 @@ namespace { } char LCSSA::ID = 0; -static RegisterPass<LCSSA> X("lcssa", "Loop-Closed SSA Form Pass"); +INITIALIZE_PASS(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false); Pass *llvm::createLCSSAPass() { return new LCSSA(); } -const PassInfo *const llvm::LCSSAID = &X; +char &llvm::LCSSAID = LCSSA::ID; /// BlockDominatesAnExit - Return true if the specified block dominates at least @@ -215,7 +206,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst, DomTreeNode *DomNode = DT->getNode(DomBB); SSAUpdater SSAUpdate; - SSAUpdate.Initialize(Inst); + SSAUpdate.Initialize(Inst->getType(), Inst->getName()); // Insert the LCSSA phi's into all of the exit blocks dominated by the // value, and add them to the Phi's map. 
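The one-line change in LCSSA::ProcessInstruction above reflects an SSAUpdater interface switch spelled out in the SSAUpdater.cpp hunks later in this patch: Initialize no longer takes a prototype Value, only its type plus a base name for any PHIs it inserts. A hedged usage sketch against the post-patch API; materializeAt is a hypothetical helper, not code from this patch:

    #include "llvm/Transforms/Utils/SSAUpdater.h"
    #include "llvm/Instructions.h"
    using namespace llvm;

    // Produce a value equivalent to Inst that is usable in UseBB, letting
    // SSAUpdater insert PHIs wherever control flow merges on the way.
    static Value *materializeAt(Instruction *Inst, BasicBlock *UseBB) {
      SSAUpdater SSA;
      // Pre-patch, Initialize took Inst itself as a prototype value that
      // supplied both properties; now type and name are passed explicitly.
      SSA.Initialize(Inst->getType(), Inst->getName());
      SSA.AddAvailableValue(Inst->getParent(), Inst);
      return SSA.GetValueInMiddleOfBlock(UseBB);
    }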
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 8e9113871f47b..52f0499f39b0e 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -490,6 +490,9 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) { /// rewriting all the predecessors to branch to the successor block and return /// true. If we can't transform, return false. bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) { + assert(BB != &BB->getParent()->getEntryBlock() && + "TryToSimplifyUncondBranchFromEmptyBlock called on entry block!"); + // We can't eliminate infinite loops. BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0); if (BB == Succ) return false; diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index 4f4edf3a754c1..b3c4801a4f15b 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -46,9 +46,9 @@ #include "llvm/LLVMContext.h" #include "llvm/Type.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/CFG.h" @@ -65,27 +65,30 @@ STATISTIC(NumNested , "Number of nested loops split out"); namespace { struct LoopSimplify : public LoopPass { static char ID; // Pass identification, replacement for typeid - LoopSimplify() : LoopPass(&ID) {} + LoopSimplify() : LoopPass(ID) {} // AA - If we have an alias analysis object to update, this is it, otherwise // this is null. AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; + ScalarEvolution *SE; Loop *L; virtual bool runOnLoop(Loop *L, LPPassManager &LPM); virtual void getAnalysisUsage(AnalysisUsage &AU) const { // We need loop information to identify the loops... - AU.addRequiredTransitive<LoopInfo>(); - AU.addRequiredTransitive<DominatorTree>(); + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); - AU.addPreserved<DominatorTree>(); - AU.addPreserved<DominanceFrontier>(); + AU.addPreserved<AliasAnalysis>(); AU.addPreserved<ScalarEvolution>(); AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. + AU.addPreserved<DominanceFrontier>(); + AU.addPreservedID(LCSSAID); } /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. @@ -104,11 +107,11 @@ namespace { } char LoopSimplify::ID = 0; -static RegisterPass<LoopSimplify> -X("loopsimplify", "Canonicalize natural loops", true); +INITIALIZE_PASS(LoopSimplify, "loopsimplify", + "Canonicalize natural loops", true, false); // Publically exposed interface to pass... 
-const PassInfo *const llvm::LoopSimplifyID = &X; +char &llvm::LoopSimplifyID = LoopSimplify::ID; Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } /// runOnLoop - Run down all loops in the CFG (recursively, but we could do @@ -120,6 +123,7 @@ bool LoopSimplify::runOnLoop(Loop *l, LPPassManager &LPM) { LI = &getAnalysis<LoopInfo>(); AA = getAnalysisIfAvailable<AliasAnalysis>(); DT = &getAnalysis<DominatorTree>(); + SE = getAnalysisIfAvailable<ScalarEvolution>(); Changed |= ProcessLoop(L, LPM); @@ -141,15 +145,16 @@ ReprocessLoop: BB != E; ++BB) { if (*BB == L->getHeader()) continue; - SmallPtrSet<BasicBlock *, 4> BadPreds; - for (pred_iterator PI = pred_begin(*BB), PE = pred_end(*BB); PI != PE; ++PI){ + SmallPtrSet<BasicBlock*, 4> BadPreds; + for (pred_iterator PI = pred_begin(*BB), + PE = pred_end(*BB); PI != PE; ++PI) { BasicBlock *P = *PI; if (!L->contains(P)) BadPreds.insert(P); } // Delete each unique out-of-loop (and thus dead) predecessor. - for (SmallPtrSet<BasicBlock *, 4>::iterator I = BadPreds.begin(), + for (SmallPtrSet<BasicBlock*, 4>::iterator I = BadPreds.begin(), E = BadPreds.end(); I != E; ++I) { DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "; @@ -530,6 +535,12 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM) { DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n"); + // If ScalarEvolution is around and knows anything about values in + // this loop, tell it to forget them, because we're about to + // substantially change it. + if (SE) + SE->forgetLoop(L); + BasicBlock *Header = L->getHeader(); BasicBlock *NewBB = SplitBlockPredecessors(Header, &OuterLoopPreds[0], OuterLoopPreds.size(), @@ -619,6 +630,11 @@ LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) { std::vector<BasicBlock*> BackedgeBlocks; for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I){ BasicBlock *P = *I; + + // Indirectbr edges cannot be split, so we must fail if we find one. + if (isa<IndirectBrInst>(P->getTerminator())) + return 0; + if (P != Preheader) BackedgeBlocks.push_back(P); } diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index e0e07e7bbc821..236bbe9057bfc 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -127,6 +128,11 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) return false; } + // Notify ScalarEvolution that the loop will be substantially changed, + // if not outright eliminated. 
+ if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) + SE->forgetLoop(L); + // Find trip count unsigned TripCount = L->getSmallConstantTripCount(); // Find trip multiple if count is not available diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp index 2696e6913f3bf..a46dd8402aca7 100644 --- a/lib/Transforms/Utils/LowerInvoke.cpp +++ b/lib/Transforms/Utils/LowerInvoke.cpp @@ -78,14 +78,14 @@ namespace { static char ID; // Pass identification, replacement for typeid explicit LowerInvoke(const TargetLowering *tli = NULL, bool useExpensiveEHSupport = ExpensiveEHSupport) - : FunctionPass(&ID), useExpensiveEHSupport(useExpensiveEHSupport), + : FunctionPass(ID), useExpensiveEHSupport(useExpensiveEHSupport), TLI(tli) { } bool doInitialization(Module &M); bool runOnFunction(Function &F); virtual void getAnalysisUsage(AnalysisUsage &AU) const { // This is a cluster of orthogonal Transforms - AU.addPreservedID(PromoteMemoryToRegisterID); + AU.addPreserved("mem2reg"); AU.addPreservedID(LowerSwitchID); } @@ -100,10 +100,11 @@ namespace { } char LowerInvoke::ID = 0; -static RegisterPass<LowerInvoke> -X("lowerinvoke", "Lower invoke and unwind, for unwindless code generators"); +INITIALIZE_PASS(LowerInvoke, "lowerinvoke", + "Lower invoke and unwind, for unwindless code generators", + false, false); -const PassInfo *const llvm::LowerInvokePassID = &X; +char &llvm::LowerInvokePassID = LowerInvoke::ID; // Public Interface To the LowerInvoke pass. FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI) { diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index 468a5fe4c5e5d..5530b4700aac6 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -29,19 +29,18 @@ using namespace llvm; namespace { /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch - /// instructions. Note that this cannot be a BasicBlock pass because it - /// modifies the CFG! + /// instructions. class LowerSwitch : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid - LowerSwitch() : FunctionPass(&ID) {} + LowerSwitch() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F); virtual void getAnalysisUsage(AnalysisUsage &AU) const { // This is a cluster of orthogonal Transforms AU.addPreserved<UnifyFunctionExitNodes>(); - AU.addPreservedID(PromoteMemoryToRegisterID); + AU.addPreserved("mem2reg"); AU.addPreservedID(LowerInvokePassID); } @@ -50,8 +49,7 @@ namespace { Constant* High; BasicBlock* BB; - CaseRange() : Low(0), High(0), BB(0) { } - CaseRange(Constant* low, Constant* high, BasicBlock* bb) : + CaseRange(Constant *low = 0, Constant *high = 0, BasicBlock *bb = 0) : Low(low), High(high), BB(bb) { } }; @@ -81,11 +79,11 @@ namespace { } char LowerSwitch::ID = 0; -static RegisterPass<LowerSwitch> -X("lowerswitch", "Lower SwitchInst's to branches"); +INITIALIZE_PASS(LowerSwitch, "lowerswitch", + "Lower SwitchInst's to branches", false, false); // Publically exposed interface to pass... -const PassInfo *const llvm::LowerSwitchID = &X; +char &llvm::LowerSwitchID = LowerSwitch::ID; // createLowerSwitchPass - Interface to this file... 
FunctionPass *llvm::createLowerSwitchPass() { return new LowerSwitch(); diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp index 99203b662120e..101645bd92b77 100644 --- a/lib/Transforms/Utils/Mem2Reg.cpp +++ b/lib/Transforms/Utils/Mem2Reg.cpp @@ -27,7 +27,7 @@ STATISTIC(NumPromoted, "Number of alloca's promoted"); namespace { struct PromotePass : public FunctionPass { static char ID; // Pass identification, replacement for typeid - PromotePass() : FunctionPass(&ID) {} + PromotePass() : FunctionPass(ID) {} // runOnFunction - To run this pass, first we calculate the alloca // instructions that are safe for promotion, then we promote each one. @@ -49,7 +49,8 @@ namespace { } // end of anonymous namespace char PromotePass::ID = 0; -static RegisterPass<PromotePass> X("mem2reg", "Promote Memory to Register"); +INITIALIZE_PASS(PromotePass, "mem2reg", "Promote Memory to Register", + false, false); bool PromotePass::runOnFunction(Function &F) { std::vector<AllocaInst*> Allocas; @@ -81,8 +82,6 @@ bool PromotePass::runOnFunction(Function &F) { return Changed; } -// Publically exposed interface to pass... -const PassInfo *const llvm::PromoteMemoryToRegisterID = &X; // createPromoteMemoryToRegister - Provide an entry point to create this pass. // FunctionPass *llvm::createPromoteMemoryToRegisterPass() { diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index c0de1938b2db3..a4e3029e3a5a7 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -228,14 +228,6 @@ namespace { void run(); - /// properlyDominates - Return true if I1 properly dominates I2. - /// - bool properlyDominates(Instruction *I1, Instruction *I2) const { - if (InvokeInst *II = dyn_cast<InvokeInst>(I1)) - I1 = II->getNormalDest()->begin(); - return DT.properlyDominates(I1->getParent(), I2->getParent()); - } - /// dominates - Return true if BB1 dominates BB2 using the DominatorTree. /// bool dominates(BasicBlock *BB1, BasicBlock *BB2) const { @@ -896,11 +888,12 @@ void PromoteMem2Reg::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, DIVar, SI); // Propagate any debug metadata from the store onto the dbg.value. - if (MDNode *SIMD = SI->getMetadata("dbg")) - DbgVal->setMetadata("dbg", SIMD); + DebugLoc SIDL = SI->getDebugLoc(); + if (!SIDL.isUnknown()) + DbgVal->setDebugLoc(SIDL); // Otherwise propagate debug metadata from dbg.declare. - else if (MDNode *MD = DDI->getMetadata("dbg")) - DbgVal->setMetadata("dbg", MD); + else + DbgVal->setDebugLoc(DDI->getDebugLoc()); } // QueuePhiNode - queues a phi-node to be added to a basic-block for a specific diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index f4bdb527655ab..c855988307ea7 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -29,20 +29,21 @@ static AvailableValsTy &getAvailableVals(void *AV) { } SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode*> *NewPHI) - : AV(0), PrototypeValue(0), InsertedPHIs(NewPHI) {} + : AV(0), ProtoType(0), ProtoName(), InsertedPHIs(NewPHI) {} SSAUpdater::~SSAUpdater() { delete &getAvailableVals(AV); } /// Initialize - Reset this object to get ready for a new set of SSA -/// updates. ProtoValue is the value used to name PHI nodes. -void SSAUpdater::Initialize(Value *ProtoValue) { +/// updates with type 'Ty'. PHI nodes get a name based on 'Name'. 
+void SSAUpdater::Initialize(const Type *Ty, StringRef Name) { if (AV == 0) AV = new AvailableValsTy(); else getAvailableVals(AV).clear(); - PrototypeValue = ProtoValue; + ProtoType = Ty; + ProtoName = Name; } /// HasValueForBlock - Return true if the SSAUpdater already has a value for @@ -54,8 +55,8 @@ bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const { /// AddAvailableValue - Indicate that a rewritten value is available in the /// specified block with the specified value. void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) { - assert(PrototypeValue != 0 && "Need to initialize SSAUpdater"); - assert(PrototypeValue->getType() == V->getType() && + assert(ProtoType != 0 && "Need to initialize SSAUpdater"); + assert(ProtoType == V->getType() && "All rewritten values must have the same type"); getAvailableVals(AV)[BB] = V; } @@ -148,7 +149,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // If there are no predecessors, just return undef. if (PredValues.empty()) - return UndefValue::get(PrototypeValue->getType()); + return UndefValue::get(ProtoType); // Otherwise, if all the merged values are the same, just use it. if (SingularValue != 0) @@ -168,9 +169,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { } // Ok, we have no way out, insert a new one now. - PHINode *InsertedPHI = PHINode::Create(PrototypeValue->getType(), - PrototypeValue->getName(), - &BB->front()); + PHINode *InsertedPHI = PHINode::Create(ProtoType, ProtoName, &BB->front()); InsertedPHI->reserveOperandSpace(PredValues.size()); // Fill in all the predecessors of the PHI. @@ -205,6 +204,22 @@ void SSAUpdater::RewriteUse(Use &U) { U.set(V); } +/// RewriteUseAfterInsertions - Rewrite a use, just like RewriteUse. However, +/// this version of the method can rewrite uses in the same block as a +/// definition, because it assumes that all uses of a value are below any +/// inserted values. +void SSAUpdater::RewriteUseAfterInsertions(Use &U) { + Instruction *User = cast<Instruction>(U.getUser()); + + Value *V; + if (PHINode *UserPN = dyn_cast<PHINode>(User)) + V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U)); + else + V = GetValueAtEndOfBlock(User->getParent()); + + U.set(V); +} + /// PHIiter - Iterator for PHI operands. This is used for the PHI_iterator /// in the SSAUpdaterImpl template. namespace { @@ -266,15 +281,14 @@ public: /// GetUndefVal - Get an undefined value of the same type as the value /// being handled. static Value *GetUndefVal(BasicBlock *BB, SSAUpdater *Updater) { - return UndefValue::get(Updater->PrototypeValue->getType()); + return UndefValue::get(Updater->ProtoType); } /// CreateEmptyPHI - Create a new PHI instruction in the specified block. /// Reserve space for the operands but do not fill them in yet. static Value *CreateEmptyPHI(BasicBlock *BB, unsigned NumPreds, SSAUpdater *Updater) { - PHINode *PHI = PHINode::Create(Updater->PrototypeValue->getType(), - Updater->PrototypeValue->getName(), + PHINode *PHI = PHINode::Create(Updater->ProtoType, Updater->ProtoName, &BB->front()); PHI->reserveOperandSpace(NumPreds); return PHI; diff --git a/lib/Transforms/Utils/SSI.cpp b/lib/Transforms/Utils/SSI.cpp deleted file mode 100644 index 4e813ddf95c7d..0000000000000 --- a/lib/Transforms/Utils/SSI.cpp +++ /dev/null @@ -1,432 +0,0 @@ -//===------------------- SSI.cpp - Creates SSI Representation -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
diff --git a/lib/Transforms/Utils/SSI.cpp b/lib/Transforms/Utils/SSI.cpp
deleted file mode 100644
index 4e813ddf95c7d..0000000000000
--- a/lib/Transforms/Utils/SSI.cpp
+++ /dev/null
@@ -1,432 +0,0 @@
-//===------------------- SSI.cpp - Creates SSI Representation -------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass converts a list of variables to the Static Single Information
-// form. This is a program representation described by Scott Ananian in his
-// Master Thesis: "The Static Single Information Form (1999)".
-// We are building an on-demand representation, that is, we do not convert
-// every single variable in the target function to SSI form. Rather, we receive
-// a list of target variables that must be converted. We also do not
-// completely convert a target variable to the SSI format. Instead, we only
-// change the variable in the points where new information can be attached
-// to its live range, that is, at branch points.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "ssi"
-
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/SSI.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Dominators.h"
-
-using namespace llvm;
-
-static const std::string SSI_PHI = "SSI_phi";
-static const std::string SSI_SIG = "SSI_sigma";
-
-STATISTIC(NumSigmaInserted, "Number of sigma functions inserted");
-STATISTIC(NumPhiInserted, "Number of phi functions inserted");
-
-void SSI::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequiredTransitive<DominanceFrontier>();
-  AU.addRequiredTransitive<DominatorTree>();
-  AU.setPreservesAll();
-}
-
-bool SSI::runOnFunction(Function &F) {
-  DT_ = &getAnalysis<DominatorTree>();
-  return false;
-}
-
-/// This method creates the SSI representation for the list of values
-/// received. It will only create SSI representation if a value is used
-/// to decide a branch. Repeated values are created only once.
-///
-void SSI::createSSI(SmallVectorImpl<Instruction *> &value) {
-  init(value);
-
-  SmallPtrSet<Instruction*, 4> needConstruction;
-  for (SmallVectorImpl<Instruction*>::iterator I = value.begin(),
-       E = value.end(); I != E; ++I)
-    if (created.insert(*I))
-      needConstruction.insert(*I);
-
-  insertSigmaFunctions(needConstruction);
-
-  // Test if there is a need to transform to SSI
-  if (!needConstruction.empty()) {
-    insertPhiFunctions(needConstruction);
-    renameInit(needConstruction);
-    rename(DT_->getRoot());
-    fixPhis();
-  }
-
-  clean();
-}
-
-/// Insert sigma functions (a sigma function is a phi function with one
-/// operand)
-///
-void SSI::insertSigmaFunctions(SmallPtrSet<Instruction*, 4> &value) {
-  for (SmallPtrSet<Instruction*, 4>::iterator I = value.begin(),
-       E = value.end(); I != E; ++I) {
-    for (Value::use_iterator begin = (*I)->use_begin(),
-         end = (*I)->use_end(); begin != end; ++begin) {
-      // Test if the Use of the Value is in a comparator
-      if (CmpInst *CI = dyn_cast<CmpInst>(begin)) {
-        // Iterates through all uses of CmpInst
-        for (Value::use_iterator begin_ci = CI->use_begin(),
-             end_ci = CI->use_end(); begin_ci != end_ci; ++begin_ci) {
-          // Test if any use of CmpInst is in a Terminator
-          if (TerminatorInst *TI = dyn_cast<TerminatorInst>(begin_ci)) {
-            insertSigma(TI, *I);
-          }
-        }
-      }
-    }
-  }
-}
-
-/// Inserts Sigma Functions in every BasicBlock successor to Terminator
-/// Instruction TI. All inserted Sigma Functions are related to Instruction I.
-///
-void SSI::insertSigma(TerminatorInst *TI, Instruction *I) {
-  // Basic Block of the Terminator Instruction
-  BasicBlock *BB = TI->getParent();
-  for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) {
-    // Next Basic Block
-    BasicBlock *BB_next = TI->getSuccessor(i);
-    if (BB_next != BB &&
-        BB_next->getSinglePredecessor() != NULL &&
-        dominateAny(BB_next, I)) {
-      PHINode *PN = PHINode::Create(I->getType(), SSI_SIG, BB_next->begin());
-      PN->addIncoming(I, BB);
-      sigmas[PN] = I;
-      created.insert(PN);
-      defsites[I].push_back(BB_next);
-      ++NumSigmaInserted;
-    }
-  }
-}
-
-/// Insert phi functions when necessary
-///
-void SSI::insertPhiFunctions(SmallPtrSet<Instruction*, 4> &value) {
-  DominanceFrontier *DF = &getAnalysis<DominanceFrontier>();
-  for (SmallPtrSet<Instruction*, 4>::iterator I = value.begin(),
-       E = value.end(); I != E; ++I) {
-    // Test if there were any sigmas for this variable
-    SmallPtrSet<BasicBlock *, 16> BB_visited;
-
-    // Insert phi functions if there is any sigma function
-    while (!defsites[*I].empty()) {
-
-      BasicBlock *BB = defsites[*I].back();
-
-      defsites[*I].pop_back();
-      DominanceFrontier::iterator DF_BB = DF->find(BB);
-
-      // The BB is unreachable. Skip it.
-      if (DF_BB == DF->end())
-        continue;
-
-      // Iterates through all the dominance frontier of BB
-      for (std::set<BasicBlock *>::iterator DF_BB_begin =
-           DF_BB->second.begin(), DF_BB_end = DF_BB->second.end();
-           DF_BB_begin != DF_BB_end; ++DF_BB_begin) {
-        BasicBlock *BB_dominated = *DF_BB_begin;
-
-        // Test if we have not yet visited this node and if the
-        // original definition dominates this node
-        if (BB_visited.insert(BB_dominated) &&
-            DT_->properlyDominates(value_original[*I], BB_dominated) &&
-            dominateAny(BB_dominated, *I)) {
-          PHINode *PN = PHINode::Create(
-              (*I)->getType(), SSI_PHI, BB_dominated->begin());
-          phis.insert(std::make_pair(PN, *I));
-          created.insert(PN);
-
-          defsites[*I].push_back(BB_dominated);
-          ++NumPhiInserted;
-        }
-      }
-    }
-    BB_visited.clear();
-  }
-}
-
-/// Some initialization for the rename part
-///
-void SSI::renameInit(SmallPtrSet<Instruction*, 4> &value) {
-  for (SmallPtrSet<Instruction*, 4>::iterator I = value.begin(),
-       E = value.end(); I != E; ++I)
-    value_stack[*I].push_back(*I);
-}
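The rename() walk that follows is the classic stack-of-definitions scheme from SSA construction: push a new version at each definition, resolve uses against the stack top, recurse over dominated blocks, and pop on the way back up. A self-contained toy model of that walk (all types and names here are invented for illustration, not taken from the pass):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// One node per basic block: Defs lists variables this block redefines,
// Uses lists variables it reads, Kids are dominator-tree children.
struct Block {
  std::vector<std::string> Defs;
  std::vector<std::string> Uses;
  std::vector<Block*> Kids;
  int Id;
};

// For each variable, a stack of version numbers; the top is the definition
// that reaches the current point of the walk.
static std::map<std::string, std::vector<int> > Stacks;
static int Counter = 0;

static void renameWalk(Block *B) {
  for (size_t i = 0; i < B->Uses.size(); ++i)
    std::cout << "block " << B->Id << " reads " << B->Uses[i] << '#'
              << Stacks[B->Uses[i]].back() << '\n';

  std::vector<std::string> Pushed;           // remember what to pop later
  for (size_t i = 0; i < B->Defs.size(); ++i) {
    Stacks[B->Defs[i]].push_back(++Counter);  // new version becomes the top
    Pushed.push_back(B->Defs[i]);
  }

  for (size_t i = 0; i < B->Kids.size(); ++i)
    renameWalk(B->Kids[i]);                  // recurse into dominated blocks

  for (size_t i = 0; i < Pushed.size(); ++i)
    Stacks[Pushed[i]].pop_back();            // restore state on the way out
}

int main() {
  Block Entry = { {"x"}, {}, {}, 0 };
  Block Then  = { {"x"}, {"x"}, {}, 1 };
  Block Join  = { {}, {"x"}, {}, 2 };
  Entry.Kids.push_back(&Then);
  Entry.Kids.push_back(&Join);
  Stacks["x"].push_back(0);                  // version 0 = incoming value
  renameWalk(&Entry);
  return 0;
}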
-/// Renames all variables in the specified BasicBlock.
-/// Only variables that need to be renamed will be.
-///
-void SSI::rename(BasicBlock *BB) {
-  SmallPtrSet<Instruction*, 8> defined;
-
-  // Iterate through instructions and make appropriate renaming.
-  // For SSI_PHI (b = PHI()), store b at value_stack as a new
-  // definition of the variable it represents.
-  // For SSI_SIG (b = PHI(a)), substitute a with the current
-  // value of a, present in the value_stack.
-  // Then store b in the value_stack as the new definition of a.
-  // For all other instructions (b = OP(a, c, d, ...)), we need to substitute
-  // all operands with their current values, present in value_stack.
-  for (BasicBlock::iterator begin = BB->begin(), end = BB->end();
-       begin != end; ++begin) {
-    Instruction *I = begin;
-    if (PHINode *PN = dyn_cast<PHINode>(I)) { // Treat PHI functions
-      Instruction* position;
-
-      // Treat SSI_PHI
-      if ((position = getPositionPhi(PN))) {
-        value_stack[position].push_back(PN);
-        defined.insert(position);
-      // Treat SSI_SIG
-      } else if ((position = getPositionSigma(PN))) {
-        substituteUse(I);
-        value_stack[position].push_back(PN);
-        defined.insert(position);
-      }
-
-      // Treat all other PHI functions
-      else {
-        substituteUse(I);
-      }
-    }
-
-    // Treat all other functions
-    else {
-      substituteUse(I);
-    }
-  }
-
-  // This loop iterates in all BasicBlocks that are successors of the current
-  // BasicBlock. For each SSI_PHI instruction found, insert an operand.
-  // This operand is the current operand in value_stack for the variable
-  // in "position". And the BasicBlock this operand represents is the current
-  // BasicBlock.
-  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) {
-    BasicBlock *BB_succ = *SI;
-
-    for (BasicBlock::iterator begin = BB_succ->begin(),
-         notPhi = BB_succ->getFirstNonPHI(); begin != *notPhi; ++begin) {
-      Instruction *I = begin;
-      PHINode *PN = dyn_cast<PHINode>(I);
-      Instruction* position;
-      if (PN && ((position = getPositionPhi(PN)))) {
-        PN->addIncoming(value_stack[position].back(), BB);
-      }
-    }
-  }
-
-  // This loop calls rename on all children from this block. This time children
-  // refers to a successor block in the dominance tree.
-  DomTreeNode *DTN = DT_->getNode(BB);
-  for (DomTreeNode::iterator begin = DTN->begin(), end = DTN->end();
-       begin != end; ++begin) {
-    DomTreeNodeBase<BasicBlock> *DTN_children = *begin;
-    BasicBlock *BB_children = DTN_children->getBlock();
-    rename(BB_children);
-  }
-
-  // Now we remove all inserted definitions of a variable from the top of
-  // the stack leaving the previous one as the top.
-  for (SmallPtrSet<Instruction*, 8>::iterator DI = defined.begin(),
-       DE = defined.end(); DI != DE; ++DI)
-    value_stack[*DI].pop_back();
-}
-
-/// Substitute any use in this instruction for the last definition of
-/// the variable
-///
-void SSI::substituteUse(Instruction *I) {
-  for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) {
-    Value *operand = I->getOperand(i);
-    for (DenseMap<Instruction*, SmallVector<Instruction*, 1> >::iterator
-         VI = value_stack.begin(), VE = value_stack.end(); VI != VE; ++VI) {
-      if (operand == VI->second.front() &&
-          I != VI->second.back()) {
-        PHINode *PN_I = dyn_cast<PHINode>(I);
-        PHINode *PN_vs = dyn_cast<PHINode>(VI->second.back());
-
-        // If a phi created in a BasicBlock is used as an operand of another
-        // created in the same BasicBlock, this step marks this second phi,
-        // to fix this issue later. It cannot be fixed now, because the
-        // operands of the first phi are not final yet.
-        if (PN_I && PN_vs &&
-            VI->second.back()->getParent() == I->getParent()) {
-
-          phisToFix.insert(PN_I);
-        }
-
-        I->setOperand(i, VI->second.back());
-        break;
-      }
-    }
-  }
-}
-
-/// Test if the BasicBlock BB dominates any use or definition of value.
-/// If it dominates a phi instruction that is on the same BasicBlock,
-/// that does not count.
-///
-bool SSI::dominateAny(BasicBlock *BB, Instruction *value) {
-  for (Value::use_iterator begin = value->use_begin(),
-       end = value->use_end(); begin != end; ++begin) {
-    Instruction *I = cast<Instruction>(*begin);
-    BasicBlock *BB_father = I->getParent();
-    if (BB == BB_father && isa<PHINode>(I))
-      continue;
-    if (DT_->dominates(BB, BB_father)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-/// When a phi node created in a BasicBlock is used as an operand of another
-/// phi function in the same BasicBlock, LLVM sees this as an error. So for
-/// the second phi, call the first phi P and the BasicBlock it flows in from
-/// B. This P will be replaced by the value it has for BasicBlock B. This
-/// also adds undef values for predecessors that were not included in the phi.
-///
-void SSI::fixPhis() {
-  for (SmallPtrSet<PHINode *, 1>::iterator begin = phisToFix.begin(),
-       end = phisToFix.end(); begin != end; ++begin) {
-    PHINode *PN = *begin;
-    for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) {
-      PHINode *PN_father = dyn_cast<PHINode>(PN->getIncomingValue(i));
-      if (PN_father && PN->getParent() == PN_father->getParent() &&
-          !DT_->dominates(PN->getParent(), PN->getIncomingBlock(i))) {
-        BasicBlock *BB = PN->getIncomingBlock(i);
-        int pos = PN_father->getBasicBlockIndex(BB);
-        PN->setIncomingValue(i, PN_father->getIncomingValue(pos));
-      }
-    }
-  }
-
-  for (DenseMapIterator<PHINode *, Instruction*> begin = phis.begin(),
-       end = phis.end(); begin != end; ++begin) {
-    PHINode *PN = begin->first;
-    BasicBlock *BB = PN->getParent();
-    pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
-    SmallVector<BasicBlock*, 8> Preds(PI, PE);
-    for (unsigned size = Preds.size();
-         PI != PE && PN->getNumIncomingValues() != size; ++PI) {
-      bool found = false;
-      for (unsigned i = 0, pn_end = PN->getNumIncomingValues();
-           i < pn_end; ++i) {
-        if (PN->getIncomingBlock(i) == *PI) {
-          found = true;
-          break;
-        }
-      }
-      if (!found) {
-        PN->addIncoming(UndefValue::get(PN->getType()), *PI);
-      }
-    }
-  }
-}
-
-/// Return which variable (position on the vector of variables) this phi
-/// represents on the phis list.
-///
-Instruction* SSI::getPositionPhi(PHINode *PN) {
-  DenseMap<PHINode *, Instruction*>::iterator val = phis.find(PN);
-  if (val == phis.end())
-    return 0;
-  else
-    return val->second;
-}
-
-/// Return which variable (position on the vector of variables) this phi
-/// represents on the sigmas list.
-///
-Instruction* SSI::getPositionSigma(PHINode *PN) {
-  DenseMap<PHINode *, Instruction*>::iterator val = sigmas.find(PN);
-  if (val == sigmas.end())
-    return 0;
-  else
-    return val->second;
-}
-
-/// Initializes
-///
-void SSI::init(SmallVectorImpl<Instruction *> &value) {
-  for (SmallVectorImpl<Instruction *>::iterator I = value.begin(),
-       E = value.end(); I != E; ++I) {
-    value_original[*I] = (*I)->getParent();
-    defsites[*I].push_back((*I)->getParent());
-  }
-}
-
-/// Clean all used resources in this creation of SSI
-///
-void SSI::clean() {
-  phis.clear();
-  sigmas.clear();
-  phisToFix.clear();
-
-  defsites.clear();
-  value_stack.clear();
-  value_original.clear();
-}
-
-/// createSSIPass - The public interface to this file...
-///
-FunctionPass *llvm::createSSIPass() { return new SSI(); }
-
-char SSI::ID = 0;
-static RegisterPass<SSI> X("ssi", "Static Single Information Construction");
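A sigma function, as inserted by the removed pass above, is nothing more than a single-operand phi placed at the top of a branch successor, so branch-derived facts (for example, x != 0 on the taken edge) can attach to a fresh name. The core insertion step might look like this in isolation (the helper name is invented; header paths are the era's top-level ones):

#include "llvm/BasicBlock.h"
#include "llvm/Instructions.h"
using namespace llvm;

// Sketch: split the live range of V at each successor of a conditional
// branch by inserting a one-operand phi (a "sigma").
static void insertSigmasFor(BranchInst *Br, Instruction *V) {
  BasicBlock *From = Br->getParent();
  for (unsigned i = 0, e = Br->getNumSuccessors(); i != e; ++i) {
    BasicBlock *Succ = Br->getSuccessor(i);
    // Only safe when this edge is the block's unique entry; otherwise the
    // "sigma" would really be merging values from several predecessors.
    if (Succ == From || Succ->getSinglePredecessor() == 0)
      continue;
    PHINode *Sigma = PHINode::Create(V->getType(), "sigma", Succ->begin());
    Sigma->addIncoming(V, From);
    // Uses of V dominated by Succ would then be rewritten to use Sigma.
  }
}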
-/// SSIEverything - A pass that runs createSSI on every non-void variable,
-/// intended for debugging.
-namespace {
-  struct SSIEverything : public FunctionPass {
-    static char ID; // Pass identification, replacement for typeid
-    SSIEverything() : FunctionPass(&ID) {}
-
-    bool runOnFunction(Function &F);
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addRequired<SSI>();
-    }
-  };
-}
-
-bool SSIEverything::runOnFunction(Function &F) {
-  SmallVector<Instruction *, 16> Insts;
-  SSI &ssi = getAnalysis<SSI>();
-
-  if (F.isDeclaration() || F.isIntrinsic()) return false;
-
-  for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B)
-    for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I)
-      if (!I->getType()->isVoidTy())
-        Insts.push_back(I);
-
-  ssi.createSSI(Insts);
-  return true;
-}
-
-/// createSSIEverythingPass - The public interface to this file...
-///
-FunctionPass *llvm::createSSIEverythingPass() { return new SSIEverything(); }
-
-char SSIEverything::ID = 0;
-static RegisterPass<SSIEverything>
-Y("ssi-everything", "Static Single Information Construction");
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 27b07d9731a58..28d7afbf1c33e 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -949,7 +949,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) {
        UI != E; ++UI) {
     // Ignore any user that is not a PHI node in BB2.  These can only occur in
     // unreachable blocks, because they would not be dominated by the instr.
-    PHINode *PN = dyn_cast<PHINode>(UI);
+    PHINode *PN = dyn_cast<PHINode>(*UI);
     if (!PN || PN->getParent() != BB2)
       return false;
     PHIUses.push_back(PN);
@@ -1724,12 +1724,12 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
   assert(BB && BB->getParent() && "Block not embedded in function!");
   assert(BB->getTerminator() && "Degenerate basic block encountered!");
 
-  assert(&BB->getParent()->getEntryBlock() != BB &&
-         "Can't Simplify entry block!");
-
-  // Remove basic blocks that have no predecessors... or that just have themself
-  // as a predecessor.  These are unreachable.
-  if (pred_begin(BB) == pred_end(BB) || BB->getSinglePredecessor() == BB) {
+  // Remove basic blocks that have no predecessors (except the entry block)...
+  // or that just have themself as a predecessor.  These are unreachable.
+  if ((pred_begin(BB) == pred_end(BB) &&
+       &BB->getParent()->getEntryBlock() != BB) ||
+      BB->getSinglePredecessor() == BB) {
     DEBUG(dbgs() << "Removing BB: \n" << *BB);
     DeleteDeadBlock(BB);
     return true;
@@ -1880,8 +1880,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
       while (isa<DbgInfoIntrinsic>(BBI))
         ++BBI;
       if (BBI->isTerminator()) // Terminator is the only non-phi instruction!
-        if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
-          return true;
+        if (BB != &BB->getParent()->getEntryBlock())
+          if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
+            return true;
 
     } else {  // Conditional branch
       if (isValueEqualityComparison(BI)) {
@@ -2049,12 +2050,38 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
       }
 
       // If this block is now dead, remove it.
-      if (pred_begin(BB) == pred_end(BB)) {
+      if (pred_begin(BB) == pred_end(BB) &&
+          BB != &BB->getParent()->getEntryBlock()) {
        // We know there are no successors, so just nuke the block.
        M->getBasicBlockList().erase(BB);
        return true;
      }
    }
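The new indirectbr case that follows walks the destination list with a seen-set and compacts duplicates (and no-longer-address-taken blocks) in place, adjusting the index and bound after each removal. The same compaction idiom in a generic, self-contained form (container and names invented for the example):

#include <unordered_set>
#include <vector>

// Drop repeated entries from a successor list in place, keeping the first
// occurrence of each id; mirrors the seen-set walk over indirectbr
// destinations, including the "retry this index after removal" step.
static void dedupeSuccessors(std::vector<int> &Dests) {
  std::unordered_set<int> Seen;
  for (size_t i = 0; i != Dests.size(); ) {
    if (!Seen.insert(Dests[i]).second)
      Dests.erase(Dests.begin() + i);  // duplicate: remove, stay at i
    else
      ++i;                             // first sighting: keep and advance
  }
}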
+  } else if (IndirectBrInst *IBI =
+               dyn_cast<IndirectBrInst>(BB->getTerminator())) {
+    // Eliminate redundant destinations.
+    SmallPtrSet<Value *, 8> Succs;
+    for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
+      BasicBlock *Dest = IBI->getDestination(i);
+      if (!Dest->hasAddressTaken() || !Succs.insert(Dest)) {
+        Dest->removePredecessor(BB);
+        IBI->removeDestination(i);
+        --i; --e;
+        Changed = true;
+      }
+    }
+
+    if (IBI->getNumDestinations() == 0) {
+      // If the indirectbr has no successors, change it to unreachable.
+      new UnreachableInst(IBI->getContext(), IBI);
+      IBI->eraseFromParent();
+      Changed = true;
+    } else if (IBI->getNumDestinations() == 1) {
+      // If the indirectbr has one successor, change it to a direct branch.
+      BranchInst::Create(IBI->getDestination(0), IBI);
+      IBI->eraseFromParent();
+      Changed = true;
+    }
   }
 
   // Merge basic blocks into their predecessor if there is only one distinct
@@ -2068,12 +2095,15 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
   // is a conditional branch, see if we can hoist any code from this block up
   // into our predecessor.
   pred_iterator PI(pred_begin(BB)), PE(pred_end(BB));
-  BasicBlock *OnlyPred = *PI++;
-  for (; PI != PE; ++PI)  // Search all predecessors, see if they are all same
-    if (*PI != OnlyPred) {
+  BasicBlock *OnlyPred = 0;
+  for (; PI != PE; ++PI) { // Search all predecessors, see if they are all same
+    if (!OnlyPred)
+      OnlyPred = *PI;
+    else if (*PI != OnlyPred) {
       OnlyPred = 0;       // There are multiple different predecessors...
       break;
     }
+  }
 
   if (OnlyPred)
     if (BranchInst *BI = dyn_cast<BranchInst>(OnlyPred->getTerminator()))
@@ -2172,8 +2202,6 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
 /// eliminates unreachable basic blocks, and does other "peephole" optimization
 /// of the CFG.  It returns true if a modification was made.
 ///
-/// WARNING:  The entry node of a function may not be simplified.
-///
 bool llvm::SimplifyCFG(BasicBlock *BB, const TargetData *TD) {
   return SimplifyCFGOpt(TD).run(BB);
 }
diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
index 3fa8b70a8505a..a51f1e1a47f65 100644
--- a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
+++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -24,8 +24,8 @@ using namespace llvm;
 
 char UnifyFunctionExitNodes::ID = 0;
-static RegisterPass<UnifyFunctionExitNodes>
-X("mergereturn", "Unify function exit nodes");
+INITIALIZE_PASS(UnifyFunctionExitNodes, "mergereturn",
+                "Unify function exit nodes", false, false);
 
 Pass *llvm::createUnifyFunctionExitNodesPass() {
   return new UnifyFunctionExitNodes();
@@ -35,7 +35,7 @@ void UnifyFunctionExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
   // We preserve the non-critical-edgeness property
   AU.addPreservedID(BreakCriticalEdgesID);
   // This is a cluster of orthogonal Transforms
-  AU.addPreservedID(PromoteMemoryToRegisterID);
+  AU.addPreserved("mem2reg");
   AU.addPreservedID(LowerSwitchID);
 }
 
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 3f6a90c94ebb6..fc4bde77d4f95 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ValueMapper.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 #include "llvm/Type.h"
 #include "llvm/Constants.h"
 #include "llvm/Function.h"
@@ -20,28 +20,51 @@
 #include "llvm/ADT/SmallVector.h"
 using namespace llvm;
 
-Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM) {
+Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM,
+                      bool ModuleLevelChanges) {
   Value *&VMSlot = VM[V];
   if (VMSlot) return VMSlot;      // Does it exist in the map yet?
 
   // NOTE: VMSlot can be invalidated by any reference to VM, which can grow the
   // DenseMap.  This includes any recursive calls to MapValue.
 
-  // Global values and non-function-local metadata do not need to be seeded into
-  // the VM if they are using the identity mapping.
+  // Global values do not need to be seeded into the VM if they
+  // are using the identity mapping.
   if (isa<GlobalValue>(V) || isa<InlineAsm>(V) || isa<MDString>(V) ||
-      (isa<MDNode>(V) && !cast<MDNode>(V)->isFunctionLocal()))
+      (isa<MDNode>(V) && !cast<MDNode>(V)->isFunctionLocal() &&
+       !ModuleLevelChanges))
     return VMSlot = const_cast<Value*>(V);
 
   if (const MDNode *MD = dyn_cast<MDNode>(V)) {
-    SmallVector<Value*, 4> Elts;
-    for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i)
-      Elts.push_back(MD->getOperand(i) ? MapValue(MD->getOperand(i), VM) : 0);
-    return VM[V] = MDNode::get(V->getContext(), Elts.data(), Elts.size());
+    // Start by assuming that we'll use the identity mapping.
+    VMSlot = const_cast<Value*>(V);
+
+    // Check all operands to see if any need to be remapped.
+    for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) {
+      Value *OP = MD->getOperand(i);
+      if (!OP || MapValue(OP, VM, ModuleLevelChanges) == OP) continue;
+
+      // Ok, at least one operand needs remapping.
+      MDNode *Dummy = MDNode::getTemporary(V->getContext(), 0, 0);
+      VM[V] = Dummy;
+      SmallVector<Value*, 4> Elts;
+      Elts.reserve(MD->getNumOperands());
+      for (i = 0; i != e; ++i)
+        Elts.push_back(MD->getOperand(i) ?
+                       MapValue(MD->getOperand(i), VM, ModuleLevelChanges) : 0);
+      MDNode *NewMD = MDNode::get(V->getContext(), Elts.data(), Elts.size());
+      Dummy->replaceAllUsesWith(NewMD);
+      MDNode::deleteTemporary(Dummy);
+      return VM[V] = NewMD;
+    }
+
+    // No operands needed remapping; keep the identity map.
+    return const_cast<Value*>(V);
   }
 
   Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V));
-  if (C == 0) return 0;
+  if (C == 0)
+    return 0;
 
   if (isa<ConstantInt>(C) || isa<ConstantFP>(C) ||
       isa<ConstantPointerNull>(C) || isa<ConstantAggregateZero>(C) ||
@@ -51,7 +74,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM) {
   if (ConstantArray *CA = dyn_cast<ConstantArray>(C)) {
     for (User::op_iterator b = CA->op_begin(), i = b, e = CA->op_end();
          i != e; ++i) {
-      Value *MV = MapValue(*i, VM);
+      Value *MV = MapValue(*i, VM, ModuleLevelChanges);
       if (MV != *i) {
         // This array must contain a reference to a global, make a new array
         // and return it.
@@ -62,7 +85,8 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM) {
           Values.push_back(cast<Constant>(*j));
         Values.push_back(cast<Constant>(MV));
         for (++i; i != e; ++i)
-          Values.push_back(cast<Constant>(MapValue(*i, VM)));
+          Values.push_back(cast<Constant>(MapValue(*i, VM,
+                                                   ModuleLevelChanges)));
         return VM[V] = ConstantArray::get(CA->getType(), Values);
       }
     }
@@ -72,7 +96,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM) {
   if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) {
     for (User::op_iterator b = CS->op_begin(), i = b, e = CS->op_end();
          i != e; ++i) {
-      Value *MV = MapValue(*i, VM);
+      Value *MV = MapValue(*i, VM, ModuleLevelChanges);
       if (MV != *i) {
         // This struct must contain a reference to a global, make a new struct
         // and return it.
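The MDNode path above has to survive cyclic metadata graphs: it parks a temporary node in the map before recursing, so a cycle resolves to the placeholder, and then replaces the placeholder with the finished node. A toy model of that placeholder trick, with invented types and names (and the clones deliberately leaked to keep the sketch short):

#include <map>
#include <vector>

// Toy operand graph that may contain cycles. remap() clones it using the
// same trick as the MDNode path: park a placeholder in the map before
// recursing so a cycle resolves to the placeholder, then redirect every
// captured placeholder reference to the finished clone.
struct Node { std::vector<Node*> Ops; };

static Node *remap(Node *N, std::map<Node*, Node*> &Map) {
  std::map<Node*, Node*>::iterator It = Map.find(N);
  if (It != Map.end())
    return It->second;                  // already mapped or in progress

  Node *Dummy = new Node;               // placeholder breaks the cycle
  Map[N] = Dummy;

  Node *Clone = new Node;
  for (size_t i = 0; i != N->Ops.size(); ++i)
    Clone->Ops.push_back(remap(N->Ops[i], Map));

  // "Replace all uses with": any clone that picked up the placeholder
  // while the cycle was open now gets the real node.
  for (std::map<Node*, Node*>::iterator MI = Map.begin(), ME = Map.end();
       MI != ME; ++MI)
    for (size_t i = 0; i != MI->second->Ops.size(); ++i)
      if (MI->second->Ops[i] == Dummy)
        MI->second->Ops[i] = Clone;
  delete Dummy;

  return Map[N] = Clone;
}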
@@ -83,7 +107,8 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM) {
           Values.push_back(cast<Constant>(*j));
         Values.push_back(cast<Constant>(MV));
         for (++i; i != e; ++i)
-          Values.push_back(cast<Constant>(MapValue(*i, VM)));
+          Values.push_back(cast<Constant>(MapValue(*i, VM,
+                                                   ModuleLevelChanges)));
         return VM[V] = ConstantStruct::get(CS->getType(), Values);
       }
     }
@@ -93,14 +118,14 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM) {
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
     std::vector<Constant*> Ops;
     for (User::op_iterator i = CE->op_begin(), e = CE->op_end(); i != e; ++i)
-      Ops.push_back(cast<Constant>(MapValue(*i, VM)));
+      Ops.push_back(cast<Constant>(MapValue(*i, VM, ModuleLevelChanges)));
     return VM[V] = CE->getWithOperands(Ops);
   }
 
   if (ConstantVector *CV = dyn_cast<ConstantVector>(C)) {
     for (User::op_iterator b = CV->op_begin(), i = b, e = CV->op_end();
          i != e; ++i) {
-      Value *MV = MapValue(*i, VM);
+      Value *MV = MapValue(*i, VM, ModuleLevelChanges);
       if (MV != *i) {
         // This vector value must contain a reference to a global, make a new
         // vector constant and return it.
@@ -111,7 +136,8 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM) {
           Values.push_back(cast<Constant>(*j));
         Values.push_back(cast<Constant>(MV));
         for (++i; i != e; ++i)
-          Values.push_back(cast<Constant>(MapValue(*i, VM)));
+          Values.push_back(cast<Constant>(MapValue(*i, VM,
+                                                   ModuleLevelChanges)));
         return VM[V] = ConstantVector::get(Values);
       }
     }
@@ -119,19 +145,33 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM) {
   }
 
   BlockAddress *BA = cast<BlockAddress>(C);
-  Function *F = cast<Function>(MapValue(BA->getFunction(), VM));
-  BasicBlock *BB = cast_or_null<BasicBlock>(MapValue(BA->getBasicBlock(),VM));
+  Function *F = cast<Function>(MapValue(BA->getFunction(), VM,
+                                        ModuleLevelChanges));
+  BasicBlock *BB = cast_or_null<BasicBlock>(MapValue(BA->getBasicBlock(),VM,
+                                                     ModuleLevelChanges));
 
   return VM[V] = BlockAddress::get(F, BB ? BB : BA->getBasicBlock());
 }
 
 /// RemapInstruction - Convert the instruction operands from referencing the
 /// current values into those specified by VMap.
 ///
-void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap) {
+void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,
+                            bool ModuleLevelChanges) {
+  // Remap operands.
   for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) {
-    Value *V = MapValue(*op, VMap);
+    Value *V = MapValue(*op, VMap, ModuleLevelChanges);
     assert(V && "Referenced value not in value map!");
     *op = V;
   }
-}
+
+  // Remap attached metadata.
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+  I->getAllMetadata(MDs);
+  for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator
+       MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) {
+    Value *Old = MI->second;
+    Value *New = MapValue(Old, VMap, ModuleLevelChanges);
+    if (New != Old)
+      I->setMetadata(MI->first, cast<MDNode>(New));
+  }
+}
diff --git a/lib/Transforms/Utils/ValueMapper.h b/lib/Transforms/Utils/ValueMapper.h
deleted file mode 100644
index f4ff643ca03ed..0000000000000
--- a/lib/Transforms/Utils/ValueMapper.h
+++ /dev/null
@@ -1,29 +0,0 @@
-//===- ValueMapper.h - Interface shared by lib/Transforms/Utils -*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the MapValue interface which is used by various parts of
-// the Transforms/Utils library to implement cloning and linking facilities.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef VALUEMAPPER_H
-#define VALUEMAPPER_H
-
-#include "llvm/ADT/ValueMap.h"
-
-namespace llvm {
-  class Value;
-  class Instruction;
-  typedef ValueMap<const Value *, Value *> ValueToValueMapTy;
-
-  Value *MapValue(const Value *V, ValueToValueMapTy &VM);
-  void RemapInstruction(Instruction *I, ValueToValueMapTy &VM);
-} // End llvm namespace
-
-#endif
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index 09b8aa507d833..831a9960463d1 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -16,7 +16,7 @@
 
 #include "llvm/Assembly/Writer.h"
 #include "llvm/Assembly/PrintModulePass.h"
-#include "llvm/Assembly/AsmAnnotationWriter.h"
+#include "llvm/Assembly/AssemblyAnnotationWriter.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
@@ -63,8 +63,6 @@ static const Module *getModuleFromVal(const Value *V) {
 
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
     return GV->getParent();
-  if (const NamedMDNode *NMD = dyn_cast<NamedMDNode>(V))
-    return NMD->getParent();
   return 0;
 }
 
@@ -230,7 +228,7 @@ void TypePrinting::CalcTypeName(const Type *Ty,
          E = STy->element_end(); I != E; ++I) {
       OS << ' ';
       CalcTypeName(*I, TypeStack, OS);
-      if (next(I) == STy->element_end())
+      if (llvm::next(I) == STy->element_end())
         OS << ' ';
       else
         OS << ',';
@@ -240,21 +238,6 @@ void TypePrinting::CalcTypeName(const Type *Ty,
     OS << '>';
     break;
   }
-  case Type::UnionTyID: {
-    const UnionType *UTy = cast<UnionType>(Ty);
-    OS << "union {";
-    for (StructType::element_iterator I = UTy->element_begin(),
-         E = UTy->element_end(); I != E; ++I) {
-      OS << ' ';
-      CalcTypeName(*I, TypeStack, OS);
-      if (next(I) == UTy->element_end())
-        OS << ' ';
-      else
-        OS << ',';
-    }
-    OS << '}';
-    break;
-  }
   case Type::PointerTyID: {
     const PointerType *PTy = cast<PointerType>(Ty);
     CalcTypeName(PTy->getElementType(), TypeStack, OS);
@@ -581,8 +564,12 @@ static SlotTracker *createSlotTracker(const Value *V) {
   if (const Function *Func = dyn_cast<Function>(V))
     return new SlotTracker(Func);
 
-  if (isa<MDNode>(V))
+  if (const MDNode *MD = dyn_cast<MDNode>(V)) {
+    if (!MD->isFunctionLocal())
+      return new SlotTracker(MD->getFunction());
+
     return new SlotTracker((Function *)0);
+  }
 
   return 0;
 }
@@ -634,10 +621,8 @@ void SlotTracker::processModule() {
        I = TheModule->named_metadata_begin(),
        E = TheModule->named_metadata_end(); I != E; ++I) {
     const NamedMDNode *NMD = I;
-    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
-      if (MDNode *MD = NMD->getOperand(i))
-        CreateMetadataSlot(MD);
-    }
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
+      CreateMetadataSlot(NMD->getOperand(i));
   }
 
   // Add all the unnamed functions to the table.
@@ -778,15 +763,14 @@ void SlotTracker::CreateMetadataSlot(const MDNode *N) {
 
   // Don't insert if N is a function-local metadata, these are always printed
   // inline.
-  if (N->isFunctionLocal())
-    return;
-
-  mdn_iterator I = mdnMap.find(N);
-  if (I != mdnMap.end())
-    return;
+  if (!N->isFunctionLocal()) {
+    mdn_iterator I = mdnMap.find(N);
+    if (I != mdnMap.end())
+      return;
 
-  unsigned DestSlot = mdnNext++;
-  mdnMap[N] = DestSlot;
+    unsigned DestSlot = mdnNext++;
+    mdnMap[N] = DestSlot;
+  }
 
   // Recursively add any MDNodes referenced by operands.
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
@@ -800,7 +784,8 @@ void SlotTracker::CreateMetadataSlot(const MDNode *N) {
 
 static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
                                    TypePrinting *TypePrinter,
-                                   SlotTracker *Machine);
+                                   SlotTracker *Machine,
+                                   const Module *Context);
 
@@ -856,7 +841,8 @@ static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
 
 static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
                                   TypePrinting &TypePrinter,
-                                  SlotTracker *Machine) {
+                                  SlotTracker *Machine,
+                                  const Module *Context) {
   if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
     if (CI->getType()->isIntegerTy(1)) {
       Out << (CI->getZExtValue() ? "true" : "false");
@@ -972,9 +958,11 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
 
   if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV)) {
     Out << "blockaddress(";
-    WriteAsOperandInternal(Out, BA->getFunction(), &TypePrinter, Machine);
+    WriteAsOperandInternal(Out, BA->getFunction(), &TypePrinter, Machine,
+                           Context);
     Out << ", ";
-    WriteAsOperandInternal(Out, BA->getBasicBlock(), &TypePrinter, Machine);
+    WriteAsOperandInternal(Out, BA->getBasicBlock(), &TypePrinter, Machine,
+                           Context);
     Out << ")";
     return;
   }
@@ -994,12 +982,14 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
       TypePrinter.print(ETy, Out);
       Out << ' ';
       WriteAsOperandInternal(Out, CA->getOperand(0),
-                             &TypePrinter, Machine);
+                             &TypePrinter, Machine,
+                             Context);
       for (unsigned i = 1, e = CA->getNumOperands(); i != e; ++i) {
         Out << ", ";
         TypePrinter.print(ETy, Out);
         Out << ' ';
-        WriteAsOperandInternal(Out, CA->getOperand(i), &TypePrinter, Machine);
+        WriteAsOperandInternal(Out, CA->getOperand(i), &TypePrinter, Machine,
+                               Context);
       }
     }
     Out << ']';
@@ -1017,14 +1007,16 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
 
     TypePrinter.print(CS->getOperand(0)->getType(), Out);
     Out << ' ';
-    WriteAsOperandInternal(Out, CS->getOperand(0), &TypePrinter, Machine);
+    WriteAsOperandInternal(Out, CS->getOperand(0), &TypePrinter, Machine,
+                           Context);
 
     for (unsigned i = 1; i < N; i++) {
       Out << ", ";
       TypePrinter.print(CS->getOperand(i)->getType(), Out);
      Out << ' ';
-      WriteAsOperandInternal(Out, CS->getOperand(i), &TypePrinter, Machine);
+      WriteAsOperandInternal(Out, CS->getOperand(i), &TypePrinter, Machine,
+                             Context);
     }
     Out << ' ';
   }
@@ -1035,15 +1027,6 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     return;
   }
 
-  if (const ConstantUnion *CU = dyn_cast<ConstantUnion>(CV)) {
-    Out << "{ ";
-    TypePrinter.print(CU->getOperand(0)->getType(), Out);
-    Out << ' ';
-    WriteAsOperandInternal(Out, CU->getOperand(0), &TypePrinter, Machine);
-    Out << " }";
-    return;
-  }
-
   if (const ConstantVector *CP = dyn_cast<ConstantVector>(CV)) {
     const Type *ETy = CP->getType()->getElementType();
     assert(CP->getNumOperands() > 0 &&
     Out << '<';
     TypePrinter.print(ETy, Out);
     Out << ' ';
-    WriteAsOperandInternal(Out, CP->getOperand(0), &TypePrinter, Machine);
+    WriteAsOperandInternal(Out, CP->getOperand(0), &TypePrinter, Machine,
+                           Context);
     for (unsigned i = 1, e = CP->getNumOperands(); i != e; ++i) {
       Out << ", ";
       TypePrinter.print(ETy, Out);
       Out << ' ';
-      WriteAsOperandInternal(Out, CP->getOperand(i), &TypePrinter, Machine);
+      WriteAsOperandInternal(Out, CP->getOperand(i), &TypePrinter, Machine,
+                             Context);
     }
     Out << '>';
     return;
@@ -1087,7 +1072,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     for (User::const_op_iterator OI=CE->op_begin(); OI != CE->op_end(); ++OI) {
       TypePrinter.print((*OI)->getType(), Out);
       Out << ' ';
-      WriteAsOperandInternal(Out, *OI, &TypePrinter, Machine);
+      WriteAsOperandInternal(Out, *OI, &TypePrinter, Machine, Context);
       if (OI+1 != CE->op_end())
         Out << ", ";
     }
@@ -1112,7 +1097,8 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
 
 static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
                                     TypePrinting *TypePrinter,
-                                    SlotTracker *Machine) {
+                                    SlotTracker *Machine,
+                                    const Module *Context) {
   Out << "!{";
   for (unsigned mi = 0, me = Node->getNumOperands(); mi != me; ++mi) {
     const Value *V = Node->getOperand(mi);
@@ -1122,7 +1108,7 @@ static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
       TypePrinter->print(V->getType(), Out);
       Out << ' ';
       WriteAsOperandInternal(Out, Node->getOperand(mi),
-                             TypePrinter, Machine);
+                             TypePrinter, Machine, Context);
     }
     if (mi + 1 != me)
       Out << ", ";
@@ -1138,7 +1124,8 @@ static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
 ///
 static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
                                    TypePrinting *TypePrinter,
-                                   SlotTracker *Machine) {
+                                   SlotTracker *Machine,
+                                   const Module *Context) {
   if (V->hasName()) {
     PrintLLVMName(Out, V);
     return;
@@ -1147,7 +1134,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
   const Constant *CV = dyn_cast<Constant>(V);
   if (CV && !isa<GlobalValue>(CV)) {
     assert(TypePrinter && "Constants require TypePrinting!");
-    WriteConstantInternal(Out, CV, *TypePrinter, Machine);
+    WriteConstantInternal(Out, CV, *TypePrinter, Machine, Context);
     return;
   }
 
@@ -1168,12 +1155,16 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
   if (const MDNode *N = dyn_cast<MDNode>(V)) {
     if (N->isFunctionLocal()) {
       // Print metadata inline, not via slot reference number.
-      WriteMDNodeBodyInternal(Out, N, TypePrinter, Machine);
+      WriteMDNodeBodyInternal(Out, N, TypePrinter, Machine, Context);
       return;
     }
 
-    if (!Machine)
-      Machine = createSlotTracker(V);
+    if (!Machine) {
+      if (N->isFunctionLocal())
+        Machine = new SlotTracker(N->getFunction());
+      else
+        Machine = new SlotTracker(Context);
+    }
     Out << '!' << Machine->getMetadataSlot(N);
     return;
   }
@@ -1227,8 +1218,9 @@ void llvm::WriteAsOperand(raw_ostream &Out, const Value *V,
 
   // Fast path: Don't construct and populate a TypePrinting object if we
   // won't be needing any types printed.
   if (!PrintType &&
-      (!isa<Constant>(V) || V->hasName() || isa<GlobalValue>(V))) {
-    WriteAsOperandInternal(Out, V, 0, 0);
+      ((!isa<Constant>(V) && !isa<MDNode>(V)) ||
+       V->hasName() || isa<GlobalValue>(V))) {
+    WriteAsOperandInternal(Out, V, 0, 0, Context);
     return;
   }
 
@@ -1242,7 +1234,7 @@ void llvm::WriteAsOperand(raw_ostream &Out, const Value *V,
     Out << ' ';
   }
 
-  WriteAsOperandInternal(Out, V, &TypePrinter, 0);
+  WriteAsOperandInternal(Out, V, &TypePrinter, 0, Context);
 }
 
 namespace {
@@ -1297,7 +1289,7 @@ void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) {
     TypePrinter.print(Operand->getType(), Out);
     Out << ' ';
   }
-  WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine);
+  WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
 }
 
 void AssemblyWriter::writeParamOperand(const Value *Operand,
@@ -1314,7 +1306,7 @@ void AssemblyWriter::writeParamOperand(const Value *Operand,
     Out << ' ' << Attribute::getAsString(Attrs);
   Out << ' ';
   // Print the operand
-  WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine);
+  WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
 }
 
 void AssemblyWriter::printModule(const Module *M) {
@@ -1403,10 +1395,7 @@ void AssemblyWriter::printNamedMDNode(const NamedMDNode *NMD) {
   Out << "!" << NMD->getName() << " = !{";
   for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
     if (i) Out << ", ";
-    if (MDNode *MD = NMD->getOperand(i))
-      Out << '!' << Machine.getMetadataSlot(MD);
-    else
-      Out << "null";
+    Out << '!' << Machine.getMetadataSlot(NMD->getOperand(i));
   }
   Out << "}\n";
 }
@@ -1421,6 +1410,9 @@ static void PrintLinkage(GlobalValue::LinkageTypes LT,
   case GlobalValue::LinkerPrivateWeakLinkage:
     Out << "linker_private_weak ";
     break;
+  case GlobalValue::LinkerPrivateWeakDefAutoLinkage:
+    Out << "linker_private_weak_def_auto ";
+    break;
   case GlobalValue::InternalLinkage:  Out << "internal ";       break;
   case GlobalValue::LinkOnceAnyLinkage: Out << "linkonce ";     break;
   case GlobalValue::LinkOnceODRLinkage: Out << "linkonce_odr "; break;
@@ -1451,7 +1443,7 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
   if (GV->isMaterializable())
     Out << "; Materializable\n";
 
-  WriteAsOperandInternal(Out, GV, &TypePrinter, &Machine);
+  WriteAsOperandInternal(Out, GV, &TypePrinter, &Machine, GV->getParent());
   Out << " = ";
 
   if (!GV->hasInitializer() && GV->hasExternalLinkage())
@@ -1510,7 +1502,7 @@ void AssemblyWriter::printAlias(const GlobalAlias *GA) {
     TypePrinter.print(F->getFunctionType(), Out);
     Out << "* ";
 
-    WriteAsOperandInternal(Out, F, &TypePrinter, &Machine);
+    WriteAsOperandInternal(Out, F, &TypePrinter, &Machine, F->getParent());
   } else if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(Aliasee)) {
     TypePrinter.print(GA->getType(), Out);
     Out << ' ';
@@ -1593,7 +1585,7 @@ void AssemblyWriter::printFunction(const Function *F) {
     Out <<  Attribute::getAsString(Attrs.getRetAttributes()) << ' ';
   TypePrinter.print(F->getReturnType(), Out);
   Out << ' ';
-  WriteAsOperandInternal(Out, F, &TypePrinter, &Machine);
+  WriteAsOperandInternal(Out, F, &TypePrinter, &Machine, F->getParent());
   Out << '(';
   Machine.incorporateFunction(F);
 
@@ -1643,11 +1635,10 @@ void AssemblyWriter::printFunction(const Function *F) {
   if (F->hasGC())
     Out << " gc \"" << F->getGC() << '"';
   if (F->isDeclaration()) {
-    Out << "\n";
+    Out << '\n';
   } else {
     Out << " {";
-
-    // Output all of its basic blocks... for the function
+    // Output all of the function's basic blocks.
     for (Function::const_iterator I = F->begin(), E = F->end(); I != E; ++I)
       printBasicBlock(I);
 
@@ -1696,7 +1687,7 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) {
     Out.PadToColumn(50);
     Out << "; Error: Block without parent!";
   } else if (BB != &BB->getParent()->getEntryBlock()) {  // Not the entry block?
-    // Output predecessors for the block...
+    // Output predecessors for the block.
     Out.PadToColumn(50);
     Out << ";";
     const_pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
@@ -1734,13 +1725,6 @@ void AssemblyWriter::printInfoComment(const Value &V) {
     AnnotationWriter->printInfoComment(V, Out);
     return;
   }
-
-  if (V.getType()->isVoidTy()) return;
-
-  Out.PadToColumn(50);
-  Out << "; <";
-  TypePrinter.print(V.getType(), Out);
-  Out << "> [#uses=" << V.getNumUses() << ']';  // Output # uses
 }
 
 // This member is called for each Instruction in a function..
@@ -2029,7 +2013,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
       } else {
         Out << ", !<unknown kind #" << Kind << ">";
       }
-      Out << " !" << Machine.getMetadataSlot(InstMD[i].second);
+      Out << ' ';
+      WriteAsOperandInternal(Out, InstMD[i].second, &TypePrinter, &Machine,
+                             TheModule);
     }
   }
   printInfoComment(I);
@@ -2077,7 +2063,7 @@ void AssemblyWriter::writeAllMDNodes() {
 }
 
 void AssemblyWriter::printMDNodeBody(const MDNode *Node) {
-  WriteMDNodeBodyInternal(Out, Node, &TypePrinter, &Machine);
+  WriteMDNodeBodyInternal(Out, Node, &TypePrinter, &Machine, TheModule);
   WriteMDNodeComment(Node, Out);
   Out << "\n";
 }
@@ -2093,6 +2079,13 @@ void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW) const {
   W.printModule(this);
 }
 
+void NamedMDNode::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW) const {
+  SlotTracker SlotTable(getParent());
+  formatted_raw_ostream OS(ROS);
+  AssemblyWriter W(OS, SlotTable, getParent(), AAW);
+  W.printNamedMDNode(this);
+}
+
 void Type::print(raw_ostream &OS) const {
   if (this == 0) {
     OS << "<null Type>";
@@ -2130,15 +2123,11 @@ void Value::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW) const {
     SlotTracker SlotTable(F);
     AssemblyWriter W(OS, SlotTable, F ? F->getParent() : 0, AAW);
     W.printMDNodeBody(N);
-  } else if (const NamedMDNode *N = dyn_cast<NamedMDNode>(this)) {
-    SlotTracker SlotTable(N->getParent());
-    AssemblyWriter W(OS, SlotTable, N->getParent(), AAW);
-    W.printNamedMDNode(N);
   } else if (const Constant *C = dyn_cast<Constant>(this)) {
     TypePrinting TypePrinter;
     TypePrinter.print(C->getType(), OS);
     OS << ' ';
-    WriteConstantInternal(OS, C, TypePrinter, 0);
+    WriteConstantInternal(OS, C, TypePrinter, 0, 0);
   } else if (isa<InlineAsm>(this) || isa<MDString>(this) ||
              isa<Argument>(this)) {
     WriteAsOperand(OS, this, true, 0);
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index dc39024e39456..9330e141c3412 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -78,6 +78,63 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
       NewFn = F;
       return true;
     }
+  } else if (Name.compare(5, 9, "arm.neon.", 9) == 0) {
+    if (((Name.compare(14, 5, "vmovl", 5) == 0 ||
+          Name.compare(14, 5, "vaddl", 5) == 0 ||
+          Name.compare(14, 5, "vsubl", 5) == 0 ||
+          Name.compare(14, 5, "vaddw", 5) == 0 ||
+          Name.compare(14, 5, "vsubw", 5) == 0 ||
+          Name.compare(14, 5, "vmull", 5) == 0 ||
+          Name.compare(14, 5, "vmlal", 5) == 0 ||
+          Name.compare(14, 5, "vmlsl", 5) == 0 ||
+          Name.compare(14, 5, "vabdl", 5) == 0 ||
+          Name.compare(14, 5, "vabal", 5) == 0) &&
+         (Name.compare(19, 2, "s.", 2) == 0 ||
+          Name.compare(19, 2, "u.", 2) == 0)) ||
+
+        (Name.compare(14, 4, "vaba", 4) == 0 &&
+         (Name.compare(18, 2, "s.", 2) == 0 ||
+          Name.compare(18, 2, "u.", 2) == 0)) ||
+
+        (Name.compare(14, 6, "vmovn.", 6) == 0)) {
+
+      // Calls to these are transformed into IR without intrinsics.
+      NewFn = 0;
+      return true;
+    }
+    // Old versions of NEON ld/st intrinsics are missing alignment arguments.
+    bool isVLd = (Name.compare(14, 3, "vld", 3) == 0);
+    bool isVSt = (Name.compare(14, 3, "vst", 3) == 0);
+    if (isVLd || isVSt) {
+      unsigned NumVecs = Name.at(17) - '0';
+      if (NumVecs == 0 || NumVecs > 4)
+        return false;
+      bool isLaneOp = (Name.compare(18, 5, "lane.", 5) == 0);
+      if (!isLaneOp && Name.at(18) != '.')
+        return false;
+      unsigned ExpectedArgs = 2; // for the address and alignment
+      if (isVSt || isLaneOp)
+        ExpectedArgs += NumVecs;
+      if (isLaneOp)
+        ExpectedArgs += 1; // for the lane number
+      unsigned NumP = FTy->getNumParams();
+      if (NumP != ExpectedArgs - 1)
+        return false;
+
+      // Change the name of the old (bad) intrinsic, because
+      // its type is incorrect, but we cannot overload that name.
+      F->setName("");
+
+      // One argument is missing: add the alignment argument.
+      std::vector<const Type*> NewParams;
+      for (unsigned p = 0; p < NumP; ++p)
+        NewParams.push_back(FTy->getParamType(p));
+      NewParams.push_back(Type::getInt32Ty(F->getContext()));
+      FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(),
+                                               NewParams, false);
+      NewFn = cast<Function>(M->getOrInsertFunction(Name, NewFTy));
+      return true;
+    }
   }
   break;
 case 'b':
@@ -182,7 +239,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
       NewFnName = "llvm.memset.p0i8.i64";
     }
     if (NewFnName) {
-      const FunctionType *FTy = F->getFunctionType();
       NewFn = cast<Function>(M->getOrInsertFunction(NewFnName,
                                             FTy->getReturnType(),
                                             FTy->getParamType(0),
@@ -309,6 +365,73 @@ bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) {
   return Upgraded;
 }
 
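For vaddl and its relatives, UpgradeIntrinsicCall (further below) emits plain IR instead of calling a replacement intrinsic: widen the operands, then use an ordinary add. Reduced to the signed vaddl case, the rewrite has roughly this shape (the helper name is invented; assumes the era's top-level header paths):

#include "llvm/Instructions.h"
using namespace llvm;

// Sketch: rewrite CI = call @llvm.arm.neon.vaddl.s...(a, b) as
//   %a.wide = sext a to <result type>
//   %b.wide = sext b to <result type>
//   %sum    = add %a.wide, %b.wide
// then forward the call's uses to %sum and delete the call.
static void upgradeSignedVaddl(CallInst *CI) {
  Value *A = new SExtInst(CI->getArgOperand(0), CI->getType(), "", CI);
  Value *B = new SExtInst(CI->getArgOperand(1), CI->getType(), "", CI);
  Value *Sum = BinaryOperator::CreateAdd(A, B,
                                         "upgraded." + CI->getName(), CI);
  if (!CI->use_empty())
    CI->replaceAllUsesWith(Sum);
  CI->eraseFromParent();
}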
+bool llvm::UpgradeGlobalVariable(GlobalVariable *GV) {
+  StringRef Name(GV->getName());
+
+  // We are only upgrading one symbol here.
+  if (Name == ".llvm.eh.catch.all.value") {
+    GV->setName("llvm.eh.catch.all.value");
+    return true;
+  }
+
+  return false;
+}
+
+/// ExtendNEONArgs - For NEON "long" and "wide" operations, where the results
+/// have vector elements twice as big as one or both source operands, do the
+/// sign- or zero-extension that used to be handled by intrinsics.  The
+/// extended values are returned via V0 and V1.
+static void ExtendNEONArgs(CallInst *CI, Value *Arg0, Value *Arg1,
+                           Value *&V0, Value *&V1) {
+  Function *F = CI->getCalledFunction();
+  const std::string& Name = F->getName();
+  bool isLong = (Name.at(18) == 'l');
+  bool isSigned = (Name.at(19) == 's');
+
+  if (isSigned) {
+    if (isLong)
+      V0 = new SExtInst(Arg0, CI->getType(), "", CI);
+    else
+      V0 = Arg0;
+    V1 = new SExtInst(Arg1, CI->getType(), "", CI);
+  } else {
+    if (isLong)
+      V0 = new ZExtInst(Arg0, CI->getType(), "", CI);
+    else
+      V0 = Arg0;
+    V1 = new ZExtInst(Arg1, CI->getType(), "", CI);
+  }
+}
+
+/// CallVABD - As part of expanding a call to one of the old NEON vabdl, vaba,
+/// or vabal intrinsics, construct a call to a vabd intrinsic.  Examine the
+/// name of the old intrinsic to determine whether to use a signed or unsigned
+/// vabd intrinsic.  Get the type from the old call instruction, adjusted for
+/// half-size vector elements if the old intrinsic was vabdl or vabal.
+static Instruction *CallVABD(CallInst *CI, Value *Arg0, Value *Arg1) {
+  Function *F = CI->getCalledFunction();
+  const std::string& Name = F->getName();
+  bool isLong = (Name.at(18) == 'l');
+  bool isSigned = (Name.at(isLong ? 19 : 18) == 's');
+
+  Intrinsic::ID intID;
+  if (isSigned)
+    intID = Intrinsic::arm_neon_vabds;
+  else
+    intID = Intrinsic::arm_neon_vabdu;
+
+  const Type *Ty = CI->getType();
+  if (isLong)
+    Ty = VectorType::getTruncatedElementVectorType(cast<const VectorType>(Ty));
+
+  Function *VABD = Intrinsic::getDeclaration(F->getParent(), intID, &Ty, 1);
+  Value *Operands[2];
+  Operands[0] = Arg0;
+  Operands[1] = Arg1;
+  return CallInst::Create(VABD, Operands, Operands+2,
+                          "upgraded."+CI->getName(), CI);
+}
+
 // UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call to the
 // upgraded intrinsic. All argument and return casting must be provided in
 // order to seamlessly integrate with existing context.
@@ -320,6 +443,60 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
   assert(F && "CallInst has no function associated with it.");
 
   if (!NewFn) {
+    // Get the Function's name.
+    const std::string& Name = F->getName();
+
+    // Upgrade ARM NEON intrinsics.
+    if (Name.compare(5, 9, "arm.neon.", 9) == 0) {
+      Instruction *NewI;
+      Value *V0, *V1;
+      if (Name.compare(14, 7, "vmovls.", 7) == 0) {
+        NewI = new SExtInst(CI->getArgOperand(0), CI->getType(),
+                            "upgraded." + CI->getName(), CI);
+      } else if (Name.compare(14, 7, "vmovlu.", 7) == 0) {
+        NewI = new ZExtInst(CI->getArgOperand(0), CI->getType(),
+                            "upgraded." + CI->getName(), CI);
+      } else if (Name.compare(14, 4, "vadd", 4) == 0) {
+        ExtendNEONArgs(CI, CI->getArgOperand(0), CI->getArgOperand(1), V0, V1);
+        NewI = BinaryOperator::CreateAdd(V0, V1, "upgraded."+CI->getName(), CI);
+      } else if (Name.compare(14, 4, "vsub", 4) == 0) {
+        ExtendNEONArgs(CI, CI->getArgOperand(0), CI->getArgOperand(1), V0, V1);
+        NewI = BinaryOperator::CreateSub(V0, V1,"upgraded."+CI->getName(),CI);
+      } else if (Name.compare(14, 4, "vmul", 4) == 0) {
+        ExtendNEONArgs(CI, CI->getArgOperand(0), CI->getArgOperand(1), V0, V1);
+        NewI = BinaryOperator::CreateMul(V0, V1,"upgraded."+CI->getName(),CI);
+      } else if (Name.compare(14, 4, "vmla", 4) == 0) {
+        ExtendNEONArgs(CI, CI->getArgOperand(1), CI->getArgOperand(2), V0, V1);
+        Instruction *MulI = BinaryOperator::CreateMul(V0, V1, "", CI);
+        NewI = BinaryOperator::CreateAdd(CI->getArgOperand(0), MulI,
+                                         "upgraded."+CI->getName(), CI);
+      } else if (Name.compare(14, 4, "vmls", 4) == 0) {
+        ExtendNEONArgs(CI, CI->getArgOperand(1), CI->getArgOperand(2), V0, V1);
+        Instruction *MulI = BinaryOperator::CreateMul(V0, V1, "", CI);
+        NewI = BinaryOperator::CreateSub(CI->getArgOperand(0), MulI,
+                                         "upgraded."+CI->getName(), CI);
+      } else if (Name.compare(14, 4, "vabd", 4) == 0) {
+        NewI = CallVABD(CI, CI->getArgOperand(0), CI->getArgOperand(1));
+        NewI = new ZExtInst(NewI, CI->getType(), "upgraded."+CI->getName(), CI);
+      } else if (Name.compare(14, 4, "vaba", 4) == 0) {
+        NewI = CallVABD(CI, CI->getArgOperand(1), CI->getArgOperand(2));
+        if (Name.at(18) == 'l')
+          NewI = new ZExtInst(NewI, CI->getType(), "", CI);
+        NewI = BinaryOperator::CreateAdd(CI->getArgOperand(0), NewI,
+                                         "upgraded."+CI->getName(), CI);
+      } else if (Name.compare(14, 6, "vmovn.", 6) == 0) {
+        NewI = new TruncInst(CI->getArgOperand(0), CI->getType(),
+                             "upgraded." + CI->getName(), CI);
+      } else {
+        llvm_unreachable("Unknown arm.neon function for CallInst upgrade.");
+      }
+      // Replace any uses of the old CallInst.
+      if (!CI->use_empty())
+        CI->replaceAllUsesWith(NewI);
+      CI->eraseFromParent();
+      return;
+    }
+
     bool isLoadH = false, isLoadL = false, isMovL = false;
     bool isMovSD = false, isShufPD = false;
     bool isUnpckhPD = false, isUnpcklPD = false;
@@ -398,7 +575,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       SI = new ShuffleVectorInst(Op0, Op1, Mask, "upgraded.", CI);
     } else if (isShufPD) {
       Value *Op1 = CI->getArgOperand(1);
-      unsigned MaskVal = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+      unsigned MaskVal =
+        cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
       Idxs.push_back(ConstantInt::get(Type::getInt32Ty(C), MaskVal & 1));
       Idxs.push_back(ConstantInt::get(Type::getInt32Ty(C),
                                       ((MaskVal >> 1) & 1)+2));
@@ -547,7 +725,40 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
   }
 
   switch (NewFn->getIntrinsicID()) {
-  default:  llvm_unreachable("Unknown function for CallInst upgrade.");
+  default: llvm_unreachable("Unknown function for CallInst upgrade.");
+  case Intrinsic::arm_neon_vld1:
+  case Intrinsic::arm_neon_vld2:
+  case Intrinsic::arm_neon_vld3:
+  case Intrinsic::arm_neon_vld4:
+  case Intrinsic::arm_neon_vst1:
+  case Intrinsic::arm_neon_vst2:
+  case Intrinsic::arm_neon_vst3:
+  case Intrinsic::arm_neon_vst4:
+  case Intrinsic::arm_neon_vld2lane:
+  case Intrinsic::arm_neon_vld3lane:
+  case Intrinsic::arm_neon_vld4lane:
+  case Intrinsic::arm_neon_vst2lane:
+  case Intrinsic::arm_neon_vst3lane:
+  case Intrinsic::arm_neon_vst4lane: {
+    // Add a default alignment argument of 1.
+    SmallVector<Value*, 8> Operands(CS.arg_begin(), CS.arg_end());
+    Operands.push_back(ConstantInt::get(Type::getInt32Ty(C), 1));
+    CallInst *NewCI = CallInst::Create(NewFn, Operands.begin(), Operands.end(),
+                                       CI->getName(), CI);
+    NewCI->setTailCall(CI->isTailCall());
+    NewCI->setCallingConv(CI->getCallingConv());
+
+    // Handle any uses of the old CallInst.
+    if (!CI->use_empty())
+      // Replace all uses of the old call with the new cast which has the
+      // correct type.
+      CI->replaceAllUsesWith(NewCI);
+
+    // Clean up the old call now that it has been completely upgraded.
+    CI->eraseFromParent();
+    break;
+  }
+
   case Intrinsic::x86_mmx_psll_d:
   case Intrinsic::x86_mmx_psll_q:
   case Intrinsic::x86_mmx_psll_w:
diff --git a/lib/VMCore/CMakeLists.txt b/lib/VMCore/CMakeLists.txt
index c64564b8a1e78..1388c93cce39c 100644
--- a/lib/VMCore/CMakeLists.txt
+++ b/lib/VMCore/CMakeLists.txt
@@ -23,6 +23,7 @@ add_llvm_library(LLVMCore
   Module.cpp
   Pass.cpp
   PassManager.cpp
+  PassRegistry.cpp
   PrintModulePass.cpp
   Type.cpp
   TypeSymbolTable.cpp
diff --git a/lib/VMCore/ConstantFold.cpp b/lib/VMCore/ConstantFold.cpp
index 35672661e4459..9a91dafab2ff7 100644
--- a/lib/VMCore/ConstantFold.cpp
+++ b/lib/VMCore/ConstantFold.cpp
@@ -357,22 +357,6 @@ static Constant *getFoldedSizeOf(const Type *Ty, const Type *DestTy,
     }
   }
 
-  if (const UnionType *UTy = dyn_cast<UnionType>(Ty)) {
-    unsigned NumElems = UTy->getNumElements();
-    // Check for a union with all members having the same size.
-    Constant *MemberSize =
-      getFoldedSizeOf(UTy->getElementType(0), DestTy, true);
-    bool AllSame = true;
-    for (unsigned i = 1; i != NumElems; ++i)
-      if (MemberSize !=
-          getFoldedSizeOf(UTy->getElementType(i), DestTy, true)) {
-        AllSame = false;
-        break;
-      }
-    if (AllSame)
-      return MemberSize;
-  }
-
   // Pointer size doesn't depend on the pointee type, so canonicalize them
   // to an arbitrary pointee.
   if (const PointerType *PTy = dyn_cast<PointerType>(Ty))
@@ -438,24 +422,6 @@ static Constant *getFoldedAlignOf(const Type *Ty, const Type *DestTy,
     return MemberAlign;
   }
 
-  if (const UnionType *UTy = dyn_cast<UnionType>(Ty)) {
-    // Union alignment is the maximum alignment of any member.
-    // Without target data, we can't compare much, but we can check to see
-    // if all the members have the same alignment.
-    unsigned NumElems = UTy->getNumElements();
-    // Check for a union with all members having the same alignment.
-    Constant *MemberAlign =
-      getFoldedAlignOf(UTy->getElementType(0), DestTy, true);
-    bool AllSame = true;
-    for (unsigned i = 1; i != NumElems; ++i)
-      if (MemberAlign != getFoldedAlignOf(UTy->getElementType(i), DestTy, true)) {
-        AllSame = false;
-        break;
-      }
-    if (AllSame)
-      return MemberAlign;
-  }
-
   // Pointer alignment doesn't depend on the pointee type, so canonicalize them
   // to an arbitrary pointee.
   if (const PointerType *PTy = dyn_cast<PointerType>(Ty))
@@ -909,8 +875,6 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
     unsigned numOps;
     if (const ArrayType *AR = dyn_cast<ArrayType>(AggTy))
       numOps = AR->getNumElements();
-    else if (AggTy->isUnionTy())
-      numOps = 1;
     else
       numOps = cast<StructType>(AggTy)->getNumElements();
 
@@ -927,10 +891,6 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
 
     if (const StructType* ST = dyn_cast<StructType>(AggTy))
       return ConstantStruct::get(ST->getContext(), Ops, ST->isPacked());
-    if (const UnionType* UT = dyn_cast<UnionType>(AggTy)) {
-      assert(Ops.size() == 1 && "Union can only contain a single value!");
-      return ConstantUnion::get(UT, Ops[0]);
-    }
     return ConstantArray::get(cast<ArrayType>(AggTy), Ops);
   }
 
diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp
index 00b009401dccd..16eaca81048bb 100644
--- a/lib/VMCore/Constants.cpp
+++ b/lib/VMCore/Constants.cpp
@@ -59,7 +59,6 @@ Constant *Constant::getNullValue(const Type *Ty) {
   case Type::PointerTyID:
     return ConstantPointerNull::get(cast<PointerType>(Ty));
   case Type::StructTyID:
-  case Type::UnionTyID:
   case Type::ArrayTyID:
   case Type::VectorTyID:
     return ConstantAggregateZero::get(Ty);
@@ -526,6 +525,7 @@ Constant* ConstantArray::get(const ArrayType* T, Constant* const* Vals,
 Constant* ConstantArray::get(LLVMContext &Context, StringRef Str,
                              bool AddNull) {
   std::vector<Constant*> ElementVals;
+  ElementVals.reserve(Str.size() + size_t(AddNull));
   for (unsigned i = 0; i < Str.size(); ++i)
     ElementVals.push_back(ConstantInt::get(Type::getInt8Ty(Context), Str[i]));
 
@@ -586,27 +586,6 @@ Constant* ConstantStruct::get(LLVMContext &Context,
   return get(Context, std::vector<Constant*>(Vals, Vals+NumVals), Packed);
 }
 
-ConstantUnion::ConstantUnion(const UnionType *T, Constant* V)
-  : Constant(T, ConstantUnionVal,
-             OperandTraits<ConstantUnion>::op_end(this) - 1, 1) {
-  Use *OL = OperandList;
-  assert(T->getElementTypeIndex(V->getType()) >= 0 &&
-      "Initializer for union element isn't a member of union type!");
-  *OL = V;
-}
-
-// ConstantUnion accessors.
-Constant* ConstantUnion::get(const UnionType* T, Constant* V) {
-  LLVMContextImpl* pImpl = T->getContext().pImpl;
-
-  // Create a ConstantAggregateZero value if all elements are zeros...
-  if (!V->isNullValue())
-    return pImpl->UnionConstants.getOrCreate(T, V);
-
-  return ConstantAggregateZero::get(T);
-}
-
-
 ConstantVector::ConstantVector(const VectorType *T,
                                const std::vector<Constant*> &V)
   : Constant(T, ConstantVectorVal,
@@ -723,7 +702,7 @@ bool ConstantExpr::isGEPWithNoNotionalOverIndexing() const {
   if (getOpcode() != Instruction::GetElementPtr) return false;
 
   gep_type_iterator GEPI = gep_type_begin(this), E = gep_type_end(this);
-  User::const_op_iterator OI = next(this->op_begin());
+  User::const_op_iterator OI = llvm::next(this->op_begin());
 
   // Skip the first index, as it has no static limit.
   ++GEPI;
@@ -945,8 +924,7 @@ bool ConstantFP::isValueValidForType(const Type *Ty, const APFloat& Val) {
 // Factory Function Implementation
 
 ConstantAggregateZero* ConstantAggregateZero::get(const Type* Ty) {
-  assert((Ty->isStructTy() || Ty->isUnionTy()
-       || Ty->isArrayTy() || Ty->isVectorTy()) &&
+  assert((Ty->isStructTy() || Ty->isArrayTy() || Ty->isVectorTy()) &&
          "Cannot create an aggregate zero of non-aggregate type!");
 
   LLVMContextImpl *pImpl = Ty->getContext().pImpl;
@@ -956,14 +934,14 @@ ConstantAggregateZero* ConstantAggregateZero::get(const Type* Ty) {
 
 /// destroyConstant - Remove the constant from the constant table...
/// void ConstantAggregateZero::destroyConstant() { - getType()->getContext().pImpl->AggZeroConstants.remove(this); + getRawType()->getContext().pImpl->AggZeroConstants.remove(this); destroyConstantImpl(); } /// destroyConstant - Remove the constant from the constant table... /// void ConstantArray::destroyConstant() { - getType()->getContext().pImpl->ArrayConstants.remove(this); + getRawType()->getContext().pImpl->ArrayConstants.remove(this); destroyConstantImpl(); } @@ -1027,21 +1005,14 @@ namespace llvm { // destroyConstant - Remove the constant from the constant table... // void ConstantStruct::destroyConstant() { - getType()->getContext().pImpl->StructConstants.remove(this); - destroyConstantImpl(); -} - -// destroyConstant - Remove the constant from the constant table... -// -void ConstantUnion::destroyConstant() { - getType()->getContext().pImpl->UnionConstants.remove(this); + getRawType()->getContext().pImpl->StructConstants.remove(this); destroyConstantImpl(); } // destroyConstant - Remove the constant from the constant table... // void ConstantVector::destroyConstant() { - getType()->getContext().pImpl->VectorConstants.remove(this); + getRawType()->getContext().pImpl->VectorConstants.remove(this); destroyConstantImpl(); } @@ -1082,7 +1053,7 @@ ConstantPointerNull *ConstantPointerNull::get(const PointerType *Ty) { // destroyConstant - Remove the constant from the constant table... // void ConstantPointerNull::destroyConstant() { - getType()->getContext().pImpl->NullPtrConstants.remove(this); + getRawType()->getContext().pImpl->NullPtrConstants.remove(this); destroyConstantImpl(); } @@ -1097,7 +1068,7 @@ UndefValue *UndefValue::get(const Type *Ty) { // destroyConstant - Remove the constant from the constant table. // void UndefValue::destroyConstant() { - getType()->getContext().pImpl->UndefValueConstants.remove(this); + getRawType()->getContext().pImpl->UndefValueConstants.remove(this); destroyConstantImpl(); } @@ -1131,7 +1102,7 @@ BlockAddress::BlockAddress(Function *F, BasicBlock *BB) // destroyConstant - Remove the constant from the constant table. // void BlockAddress::destroyConstant() { - getFunction()->getType()->getContext().pImpl + getFunction()->getRawType()->getContext().pImpl ->BlockAddresses.erase(std::make_pair(getFunction(), getBasicBlock())); getBasicBlock()->AdjustBlockAddressRefCount(-1); destroyConstantImpl(); @@ -1930,7 +1901,7 @@ Constant* ConstantExpr::getAShr(Constant* C1, Constant* C2) { // destroyConstant - Remove the constant from the constant table... 
// void ConstantExpr::destroyConstant() { - getType()->getContext().pImpl->ExprConstants.remove(this); + getRawType()->getContext().pImpl->ExprConstants.remove(this); destroyConstantImpl(); } @@ -1971,11 +1942,10 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!"); Constant *ToC = cast<Constant>(To); - LLVMContext &Context = getType()->getContext(); - LLVMContextImpl *pImpl = Context.pImpl; + LLVMContextImpl *pImpl = getRawType()->getContext().pImpl; std::pair<LLVMContextImpl::ArrayConstantsTy::MapKey, ConstantArray*> Lookup; - Lookup.first.first = getType(); + Lookup.first.first = cast<ArrayType>(getRawType()); Lookup.second = this; std::vector<Constant*> &Values = Lookup.first.second; @@ -2009,7 +1979,7 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, Constant *Replacement = 0; if (isAllZeros) { - Replacement = ConstantAggregateZero::get(getType()); + Replacement = ConstantAggregateZero::get(getRawType()); } else { // Check to see if we have this array type already. bool Exists; @@ -2060,7 +2030,7 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, assert(getOperand(OperandToUpdate) == From && "ReplaceAllUsesWith broken!"); std::pair<LLVMContextImpl::StructConstantsTy::MapKey, ConstantStruct*> Lookup; - Lookup.first.first = getType(); + Lookup.first.first = cast<StructType>(getRawType()); Lookup.second = this; std::vector<Constant*> &Values = Lookup.first.second; Values.reserve(getNumOperands()); // Build replacement struct. @@ -2082,14 +2052,13 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, } Values[OperandToUpdate] = ToC; - LLVMContext &Context = getType()->getContext(); - LLVMContextImpl *pImpl = Context.pImpl; + LLVMContextImpl *pImpl = getRawType()->getContext().pImpl; Constant *Replacement = 0; if (isAllZeros) { - Replacement = ConstantAggregateZero::get(getType()); + Replacement = ConstantAggregateZero::get(getRawType()); } else { - // Check to see if we have this array type already. + // Check to see if we have this struct type already. bool Exists; LLVMContextImpl::StructConstantsTy::MapTy::iterator I = pImpl->StructConstants.InsertOrGetItem(Lookup, Exists); @@ -2118,56 +2087,6 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, destroyConstant(); } -void ConstantUnion::replaceUsesOfWithOnConstant(Value *From, Value *To, - Use *U) { - assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!"); - Constant *ToC = cast<Constant>(To); - - assert(U == OperandList && "Union constants can only have one use!"); - assert(getNumOperands() == 1 && "Union constants can only have one use!"); - assert(getOperand(0) == From && "ReplaceAllUsesWith broken!"); - - std::pair<LLVMContextImpl::UnionConstantsTy::MapKey, ConstantUnion*> Lookup; - Lookup.first.first = getType(); - Lookup.second = this; - Lookup.first.second = ToC; - - LLVMContext &Context = getType()->getContext(); - LLVMContextImpl *pImpl = Context.pImpl; - - Constant *Replacement = 0; - if (ToC->isNullValue()) { - Replacement = ConstantAggregateZero::get(getType()); - } else { - // Check to see if we have this union type already. - bool Exists; - LLVMContextImpl::UnionConstantsTy::MapTy::iterator I = - pImpl->UnionConstants.InsertOrGetItem(Lookup, Exists); - - if (Exists) { - Replacement = I->second; - } else { - // Okay, the new shape doesn't exist in the system yet. 
Instead of - // creating a new constant union, inserting it, replaceallusesof'ing the - // old with the new, then deleting the old... just update the current one - // in place! - pImpl->UnionConstants.MoveConstantToNewSlot(this, I); - - // Update to the new value. - setOperand(0, ToC); - return; - } - } - - assert(Replacement != this && "I didn't contain From!"); - - // Everyone using this now uses the replacement. - uncheckedReplaceAllUsesWith(Replacement); - - // Delete the old constant! - destroyConstant(); -} - void ConstantVector::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) { assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!"); @@ -2180,7 +2099,7 @@ void ConstantVector::replaceUsesOfWithOnConstant(Value *From, Value *To, Values.push_back(Val); } - Constant *Replacement = get(getType(), Values); + Constant *Replacement = get(cast<VectorType>(getRawType()), Values); assert(Replacement != this && "I didn't contain From!"); // Everyone using this now uses the replacement. @@ -2227,7 +2146,7 @@ void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV, &Indices[0], Indices.size()); } else if (isCast()) { assert(getOperand(0) == From && "Cast only has one use!"); - Replacement = ConstantExpr::getCast(getOpcode(), To, getType()); + Replacement = ConstantExpr::getCast(getOpcode(), To, getRawType()); } else if (getOpcode() == Instruction::Select) { Constant *C1 = getOperand(0); Constant *C2 = getOperand(1); diff --git a/lib/VMCore/ConstantsContext.h b/lib/VMCore/ConstantsContext.h index 2f2fac53f062e..1c04c3e1987e2 100644 --- a/lib/VMCore/ConstantsContext.h +++ b/lib/VMCore/ConstantsContext.h @@ -511,14 +511,6 @@ struct ConstantKeyData<ConstantStruct> { } }; -template<> -struct ConstantKeyData<ConstantUnion> { - typedef Constant* ValType; - static ValType getValType(ConstantUnion *CU) { - return cast<Constant>(CU->getOperand(0)); - } -}; - // ConstantPointerNull does not take extra "value" argument... template<class ValType> struct ConstantCreator<ConstantPointerNull, PointerType, ValType> { @@ -757,9 +749,13 @@ public: // If this constant is the representative element for its abstract type, // update the AbstractTypeMap so that the representative element is I. - if (C->getType()->isAbstract()) { + // + // This must use getRawType() because if the type is under refinement, we + // will get the refineAbstractType callback below, and we don't want to + // kick union find in on the constant. 
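// A minimal standalone sketch of the pitfall (plain C++; the names here are
// hypothetical, not part of this patch): a table keyed on the stored pointer
// still finds its entry mid-refinement, while a lookup through the
// forwarding pointer would miss it.
#include <cassert>
#include <map>

struct Ty {
  Ty *Forward;                                // set once the type is refined
  Ty() : Forward(0) {}
};
static Ty *resolve(Ty *T) { return T->Forward ? T->Forward : T; }

int main() {
  std::map<Ty*, int> Uniques;
  Ty Old, New;
  Uniques[&Old] = 1;                          // entry made under the raw key
  Old.Forward = &New;                         // refinement begins
  assert(Uniques.count(&Old) == 1);           // raw key: hit
  assert(Uniques.count(resolve(&Old)) == 0);  // resolved key: miss
  return 0;
}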
+ if (C->getRawType()->isAbstract()) { typename AbstractTypeMapTy::iterator ATI = - AbstractTypeMap.find(C->getType()); + AbstractTypeMap.find(cast<DerivedType>(C->getRawType())); assert(ATI != AbstractTypeMap.end() && "Abstract type not in AbstractTypeMap?"); if (ATI->second == OldI) diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp index ca1a399fe8aae..5aad19dd2a4ad 100644 --- a/lib/VMCore/Core.cpp +++ b/lib/VMCore/Core.cpp @@ -22,6 +22,7 @@ #include "llvm/TypeSymbolTable.h" #include "llvm/InlineAsm.h" #include "llvm/IntrinsicInst.h" +#include "llvm/PassManager.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -155,8 +156,6 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) { return LLVMFunctionTypeKind; case Type::StructTyID: return LLVMStructTypeKind; - case Type::UnionTyID: - return LLVMUnionTypeKind; case Type::ArrayTyID: return LLVMArrayTypeKind; case Type::PointerTyID: @@ -315,34 +314,6 @@ LLVMBool LLVMIsPackedStruct(LLVMTypeRef StructTy) { return unwrap<StructType>(StructTy)->isPacked(); } -/*--.. Operations on union types ..........................................--*/ - -LLVMTypeRef LLVMUnionTypeInContext(LLVMContextRef C, LLVMTypeRef *ElementTypes, - unsigned ElementCount) { - SmallVector<const Type*, 8> Tys; - for (LLVMTypeRef *I = ElementTypes, - *E = ElementTypes + ElementCount; I != E; ++I) - Tys.push_back(unwrap(*I)); - - return wrap(UnionType::get(&Tys[0], Tys.size())); -} - -LLVMTypeRef LLVMUnionType(LLVMTypeRef *ElementTypes, unsigned ElementCount) { - return LLVMUnionTypeInContext(LLVMGetGlobalContext(), ElementTypes, - ElementCount); -} - -unsigned LLVMCountUnionElementTypes(LLVMTypeRef UnionTy) { - return unwrap<UnionType>(UnionTy)->getNumElements(); -} - -void LLVMGetUnionElementTypes(LLVMTypeRef UnionTy, LLVMTypeRef *Dest) { - UnionType *Ty = unwrap<UnionType>(UnionTy); - for (FunctionType::param_iterator I = Ty->element_begin(), - E = Ty->element_end(); I != E; ++I) - *Dest++ = wrap(*I); -} - /*--.. Operations on array, pointer, and vector types (sequence types) .....--*/ LLVMTypeRef LLVMArrayType(LLVMTypeRef ElementType, unsigned ElementCount) { @@ -488,6 +459,14 @@ LLVMValueRef LLVMGetOperand(LLVMValueRef Val, unsigned Index) { return wrap(unwrap<User>(Val)->getOperand(Index)); } +void LLVMSetOperand(LLVMValueRef Val, unsigned Index, LLVMValueRef Op) { + unwrap<User>(Val)->setOperand(Index, unwrap(Op)); +} + +int LLVMGetNumOperands(LLVMValueRef Val) { + return unwrap<User>(Val)->getNumOperands(); +} + /*--.. Operations on constants of any type .................................--*/ LLVMValueRef LLVMConstNull(LLVMTypeRef Ty) { @@ -619,10 +598,6 @@ LLVMValueRef LLVMConstVector(LLVMValueRef *ScalarConstantVals, unsigned Size) { return wrap(ConstantVector::get( unwrap<Constant>(ScalarConstantVals, Size), Size)); } -LLVMValueRef LLVMConstUnion(LLVMTypeRef Ty, LLVMValueRef Val) { - return wrap(ConstantUnion::get(unwrap<UnionType>(Ty), unwrap<Constant>(Val))); -} - /*--.. 
Constant expressions ................................................--*/ LLVMOpcode LLVMGetConstOpcode(LLVMValueRef ConstantVal) { @@ -1060,6 +1035,8 @@ LLVMLinkage LLVMGetLinkage(LLVMValueRef Global) { return LLVMLinkerPrivateLinkage; case GlobalValue::LinkerPrivateWeakLinkage: return LLVMLinkerPrivateWeakLinkage; + case GlobalValue::LinkerPrivateWeakDefAutoLinkage: + return LLVMLinkerPrivateWeakDefAutoLinkage; case GlobalValue::DLLImportLinkage: return LLVMDLLImportLinkage; case GlobalValue::DLLExportLinkage: @@ -1113,6 +1090,9 @@ void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) { case LLVMLinkerPrivateWeakLinkage: GV->setLinkage(GlobalValue::LinkerPrivateWeakLinkage); break; + case LLVMLinkerPrivateWeakDefAutoLinkage: + GV->setLinkage(GlobalValue::LinkerPrivateWeakDefAutoLinkage); + break; case LLVMDLLImportLinkage: GV->setLinkage(GlobalValue::DLLImportLinkage); break; @@ -1515,6 +1495,14 @@ void LLVMDeleteBasicBlock(LLVMBasicBlockRef BBRef) { unwrap(BBRef)->eraseFromParent(); } +void LLVMMoveBasicBlockBefore(LLVMBasicBlockRef BB, LLVMBasicBlockRef MovePos) { + unwrap(BB)->moveBefore(unwrap(MovePos)); +} + +void LLVMMoveBasicBlockAfter(LLVMBasicBlockRef BB, LLVMBasicBlockRef MovePos) { + unwrap(BB)->moveAfter(unwrap(MovePos)); +} + /*--.. Operations on instructions ..........................................--*/ LLVMBasicBlockRef LLVMGetInstructionParent(LLVMValueRef Inst) { @@ -2223,3 +2211,39 @@ LLVMBool LLVMCreateMemoryBufferWithSTDIN(LLVMMemoryBufferRef *OutMemBuf, void LLVMDisposeMemoryBuffer(LLVMMemoryBufferRef MemBuf) { delete unwrap(MemBuf); } + + +/*===-- Pass Manager ------------------------------------------------------===*/ + +LLVMPassManagerRef LLVMCreatePassManager() { + return wrap(new PassManager()); +} + +LLVMPassManagerRef LLVMCreateFunctionPassManagerForModule(LLVMModuleRef M) { + return wrap(new FunctionPassManager(unwrap(M))); +} + +LLVMPassManagerRef LLVMCreateFunctionPassManager(LLVMModuleProviderRef P) { + return LLVMCreateFunctionPassManagerForModule( + reinterpret_cast<LLVMModuleRef>(P)); +} + +LLVMBool LLVMRunPassManager(LLVMPassManagerRef PM, LLVMModuleRef M) { + return unwrap<PassManager>(PM)->run(*unwrap(M)); +} + +LLVMBool LLVMInitializeFunctionPassManager(LLVMPassManagerRef FPM) { + return unwrap<FunctionPassManager>(FPM)->doInitialization(); +} + +LLVMBool LLVMRunFunctionPassManager(LLVMPassManagerRef FPM, LLVMValueRef F) { + return unwrap<FunctionPassManager>(FPM)->run(*unwrap<Function>(F)); +} + +LLVMBool LLVMFinalizeFunctionPassManager(LLVMPassManagerRef FPM) { + return unwrap<FunctionPassManager>(FPM)->doFinalization(); +} + +void LLVMDisposePassManager(LLVMPassManagerRef PM) { + delete unwrap(PM); +} diff --git a/lib/VMCore/Dominators.cpp b/lib/VMCore/Dominators.cpp index 10a866fab6226..f3dad824461dd 100644 --- a/lib/VMCore/Dominators.cpp +++ b/lib/VMCore/Dominators.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/Dominators.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallPtrSet.h" @@ -51,8 +52,8 @@ TEMPLATE_INSTANTIATION(class llvm::DomTreeNodeBase<BasicBlock>); TEMPLATE_INSTANTIATION(class llvm::DominatorTreeBase<BasicBlock>); char DominatorTree::ID = 0; -static RegisterPass<DominatorTree> -E("domtree", "Dominator Tree Construction", true, true); +INITIALIZE_PASS(DominatorTree, "domtree", + "Dominator Tree Construction", true, true); bool DominatorTree::runOnFunction(Function &F) { 
DT->recalculate(F); @@ -105,8 +106,8 @@ bool DominatorTree::dominates(const Instruction *A, const Instruction *B) const{ //===----------------------------------------------------------------------===// char DominanceFrontier::ID = 0; -static RegisterPass<DominanceFrontier> -G("domfrontier", "Dominance Frontier Construction", true, true); +INITIALIZE_PASS(DominanceFrontier, "domfrontier", + "Dominance Frontier Construction", true, true); void DominanceFrontier::verifyAnalysis() const { if (!VerifyDomInfo) return; @@ -122,36 +123,23 @@ void DominanceFrontier::verifyAnalysis() const { // NewBB is split and now it has one successor. Update dominance frontier to // reflect this change. void DominanceFrontier::splitBlock(BasicBlock *NewBB) { - assert(NewBB->getTerminator()->getNumSuccessors() == 1 - && "NewBB should have a single successor!"); + assert(NewBB->getTerminator()->getNumSuccessors() == 1 && + "NewBB should have a single successor!"); BasicBlock *NewBBSucc = NewBB->getTerminator()->getSuccessor(0); - SmallVector<BasicBlock*, 8> PredBlocks; - for (pred_iterator PI = pred_begin(NewBB), PE = pred_end(NewBB); - PI != PE; ++PI) - PredBlocks.push_back(*PI); - - if (PredBlocks.empty()) - // If NewBB does not have any predecessors then it is a entry block. - // In this case, NewBB and its successor NewBBSucc dominates all - // other blocks. - return; - // NewBBSucc inherits original NewBB frontier. DominanceFrontier::iterator NewBBI = find(NewBB); - if (NewBBI != end()) { - DominanceFrontier::DomSetType NewBBSet = NewBBI->second; - DominanceFrontier::DomSetType NewBBSuccSet; - NewBBSuccSet.insert(NewBBSet.begin(), NewBBSet.end()); - addBasicBlock(NewBBSucc, NewBBSuccSet); - } + if (NewBBI != end()) + addBasicBlock(NewBBSucc, NewBBI->second); // If NewBB dominates NewBBSucc, then DF(NewBB) is now going to be the - // DF(PredBlocks[0]) without the stuff that the new block does not dominate + // DF(NewBBSucc) without the stuff that the new block does not dominate // a predecessor of. DominatorTree &DT = getAnalysis<DominatorTree>(); - if (DT.dominates(NewBB, NewBBSucc)) { - DominanceFrontier::iterator DFI = find(PredBlocks[0]); + DomTreeNode *NewBBNode = DT.getNode(NewBB); + DomTreeNode *NewBBSuccNode = DT.getNode(NewBBSucc); + if (DT.dominates(NewBBNode, NewBBSuccNode)) { + DominanceFrontier::iterator DFI = find(NewBBSucc); if (DFI != end()) { DominanceFrontier::DomSetType Set = DFI->second; // Filter out stuff in Set that we do not dominate a predecessor of. @@ -160,8 +148,10 @@ void DominanceFrontier::splitBlock(BasicBlock *NewBB) { bool DominatesPred = false; for (pred_iterator PI = pred_begin(*SetI), E = pred_end(*SetI); PI != E; ++PI) - if (DT.dominates(NewBB, *PI)) + if (DT.dominates(NewBBNode, DT.getNode(*PI))) { DominatesPred = true; + break; + } if (!DominatesPred) Set.erase(SetI++); else @@ -186,50 +176,71 @@ void DominanceFrontier::splitBlock(BasicBlock *NewBB) { NewDFSet.insert(NewBBSucc); addBasicBlock(NewBB, NewDFSet); } - - // Now we must loop over all of the dominance frontiers in the function, - // replacing occurrences of NewBBSucc with NewBB in some cases. All - // blocks that dominate a block in PredBlocks and contained NewBBSucc in - // their dominance frontier must be updated to contain NewBB instead. - // - for (Function::iterator FI = NewBB->getParent()->begin(), - FE = NewBB->getParent()->end(); FI != FE; ++FI) { - DominanceFrontier::iterator DFI = find(FI); - if (DFI == end()) continue; // unreachable block. 
- - // Only consider nodes that have NewBBSucc in their dominator frontier. - if (!DFI->second.count(NewBBSucc)) continue; - - // Verify whether this block dominates a block in predblocks. If not, do - // not update it. - bool BlockDominatesAny = false; - for (SmallVectorImpl<BasicBlock*>::const_iterator BI = PredBlocks.begin(), - BE = PredBlocks.end(); BI != BE; ++BI) { - if (DT.dominates(FI, *BI)) { - BlockDominatesAny = true; + + // Now update dominance frontiers which either used to contain NewBBSucc + // or which now need to include NewBB. + + // Collect the set of blocks which dominate a predecessor of NewBB or + // NewBBSucc and which don't dominate both. This is an initial + // approximation of the blocks whose dominance frontiers will need updates. + SmallVector<DomTreeNode *, 16> AllPredDoms; + + // Compute the block which dominates both NewBBSucc and NewBB. This is + // the immediate dominator of NewBBSucc unless NewBB dominates NewBBSucc. + // The code below which climbs dominator trees will stop at this point, + // because from this point up, dominance frontiers are unaffected. + DomTreeNode *DominatesBoth = 0; + if (NewBBSuccNode) { + DominatesBoth = NewBBSuccNode->getIDom(); + if (DominatesBoth == NewBBNode) + DominatesBoth = NewBBNode->getIDom(); + } + + // Collect the set of all blocks which dominate a predecessor of NewBB. + SmallPtrSet<DomTreeNode *, 8> NewBBPredDoms; + for (pred_iterator PI = pred_begin(NewBB), E = pred_end(NewBB); PI != E; ++PI) + for (DomTreeNode *DTN = DT.getNode(*PI); DTN; DTN = DTN->getIDom()) { + if (DTN == DominatesBoth) break; + if (!NewBBPredDoms.insert(DTN)) + break; + AllPredDoms.push_back(DTN); } - // If NewBBSucc should not stay in our dominator frontier, remove it. - // We remove it unless there is a predecessor of NewBBSucc that we - // dominate, but we don't strictly dominate NewBBSucc. - bool ShouldRemove = true; - if ((BasicBlock*)FI == NewBBSucc || !DT.dominates(FI, NewBBSucc)) { - // Okay, we know that PredDom does not strictly dominate NewBBSucc. - // Check to see if it dominates any predecessors of NewBBSucc. - for (pred_iterator PI = pred_begin(NewBBSucc), - E = pred_end(NewBBSucc); PI != E; ++PI) - if (DT.dominates(FI, *PI)) { - ShouldRemove = false; - break; - } + // Collect the set of all blocks which dominate a predecessor of NewBBSucc. + SmallPtrSet<DomTreeNode *, 8> NewBBSuccPredDoms; + for (pred_iterator PI = pred_begin(NewBBSucc), + E = pred_end(NewBBSucc); PI != E; ++PI) + for (DomTreeNode *DTN = DT.getNode(*PI); DTN; DTN = DTN->getIDom()) { + if (DTN == DominatesBoth) + break; + if (!NewBBSuccPredDoms.insert(DTN)) + break; + if (!NewBBPredDoms.count(DTN)) + AllPredDoms.push_back(DTN); } - - if (ShouldRemove) - removeFromFrontier(DFI, NewBBSucc); - if (BlockDominatesAny && (&*FI == NewBB || !DT.dominates(FI, NewBB))) + + // Visit all relevant dominance frontiers and make any needed updates. + for (SmallVectorImpl<DomTreeNode *>::const_iterator I = AllPredDoms.begin(), + E = AllPredDoms.end(); I != E; ++I) { + DomTreeNode *DTN = *I; + iterator DFI = find((*I)->getBlock()); + + // Only consider nodes that have NewBBSucc in their dominator frontier. + if (DFI == end() || !DFI->second.count(NewBBSucc)) continue; + + // If the block dominates a predecessor of NewBB but does not properly + // dominate NewBB itself, add NewBB to its dominance frontier.
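// For reference, the membership rule this code implements, written out
// directly as a minimal sketch (Block and the two predicates below are
// stand-ins for BasicBlock and the DominatorTree queries, not part of this
// patch): Y is in DF(X) exactly when X dominates some predecessor of Y but
// does not properly dominate Y itself.
#include <vector>

struct Block { std::vector<Block*> Preds; };
bool dominates(Block *A, Block *B);          // assumed: DT.dominates
bool properlyDominates(Block *A, Block *B);  // assumed: DT.properlyDominates

bool inDominanceFrontier(Block *X, Block *Y) {
  if (properlyDominates(X, Y))
    return false;
  for (unsigned i = 0, e = Y->Preds.size(); i != e; ++i)
    if (dominates(X, Y->Preds[i]))
      return true;
  return false;
}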
+ if (NewBBPredDoms.count(DTN) && + !DT.properlyDominates(DTN, NewBBNode)) addToFrontier(DFI, NewBB); + + // If the block does not dominate a predecessor of NewBBSucc or + // properly dominates NewBBSucc itself, remove NewBBSucc from its + // dominance frontier. + if (!NewBBSuccPredDoms.count(DTN) || + DT.properlyDominates(DTN, NewBBSuccNode)) + removeFromFrontier(DFI, NewBBSucc); } } @@ -343,3 +354,7 @@ void DominanceFrontierBase::print(raw_ostream &OS, const Module* ) const { } } +void DominanceFrontierBase::dump() const { + print(dbgs()); +} + diff --git a/lib/VMCore/Globals.cpp b/lib/VMCore/Globals.cpp index b758eb8702aea..96716eeb349b9 100644 --- a/lib/VMCore/Globals.cpp +++ b/lib/VMCore/Globals.cpp @@ -102,7 +102,14 @@ void GlobalValue::copyAttributesFrom(const GlobalValue *Src) { setVisibility(Src->getVisibility()); } - +void GlobalValue::setAlignment(unsigned Align) { + assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!"); + assert(Align <= MaximumAlignment && + "Alignment is greater than MaximumAlignment!"); + Alignment = Log2_32(Align) + 1; + assert(getAlignment() == Align && "Alignment representation error!"); +} + //===----------------------------------------------------------------------===// // GlobalVariable Implementation //===----------------------------------------------------------------------===// diff --git a/lib/VMCore/InlineAsm.cpp b/lib/VMCore/InlineAsm.cpp index 0d2eca9c3dea6..69f713b2c42c2 100644 --- a/lib/VMCore/InlineAsm.cpp +++ b/lib/VMCore/InlineAsm.cpp @@ -164,7 +164,7 @@ InlineAsm::ParseConstraints(StringRef Constraints) { StringRef::iterator ConstraintEnd = std::find(I, E, ','); if (ConstraintEnd == I || // Empty constraint like ",," - Info.Parse(std::string(I, ConstraintEnd), Result)) { + Info.Parse(StringRef(I, ConstraintEnd-I), Result)) { Result.clear(); // Erroneous constraint? break; } diff --git a/lib/VMCore/Instruction.cpp b/lib/VMCore/Instruction.cpp index 9792adaaa122d..05bed4c64316f 100644 --- a/lib/VMCore/Instruction.cpp +++ b/lib/VMCore/Instruction.cpp @@ -49,8 +49,8 @@ Instruction::Instruction(const Type *ty, unsigned it, Use *Ops, unsigned NumOps, // Out of line virtual method, so the vtable, etc has a home. Instruction::~Instruction() { assert(Parent == 0 && "Instruction still linked in the program!"); - if (hasMetadata()) - removeAllMetadata(); + if (hasMetadataHashEntry()) + clearMetadataHashEntries(); } diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp index c13696f229022..401802ed13d5e 100644 --- a/lib/VMCore/Instructions.cpp +++ b/lib/VMCore/Instructions.cpp @@ -33,10 +33,8 @@ using namespace llvm; User::op_iterator CallSite::getCallee() const { Instruction *II(getInstruction()); return isCall() - ? (CallInst::ArgOffset - ? cast</*FIXME: CallInst*/User>(II)->op_begin() - : cast</*FIXME: CallInst*/User>(II)->op_end() - 1) - : cast<InvokeInst>(II)->op_end() - 3; // Skip BB, BB, Function + ? 
cast<CallInst>(II)->op_end() - 1 // Skip Callee + : cast<InvokeInst>(II)->op_end() - 3; // Skip BB, BB, Callee } //===----------------------------------------------------------------------===// @@ -233,7 +231,7 @@ CallInst::~CallInst() { void CallInst::init(Value *Func, Value* const *Params, unsigned NumParams) { assert(NumOperands == NumParams+1 && "NumOperands not set up?"); - Op<ArgOffset -1>() = Func; + Op<-1>() = Func; const FunctionType *FTy = cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType()); @@ -246,15 +244,15 @@ void CallInst::init(Value *Func, Value* const *Params, unsigned NumParams) { assert((i >= FTy->getNumParams() || FTy->getParamType(i) == Params[i]->getType()) && "Calling a function with a bad signature!"); - OperandList[i + ArgOffset] = Params[i]; + OperandList[i] = Params[i]; } } void CallInst::init(Value *Func, Value *Actual1, Value *Actual2) { assert(NumOperands == 3 && "NumOperands not set up?"); - Op<ArgOffset -1>() = Func; - Op<ArgOffset + 0>() = Actual1; - Op<ArgOffset + 1>() = Actual2; + Op<-1>() = Func; + Op<0>() = Actual1; + Op<1>() = Actual2; const FunctionType *FTy = cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType()); @@ -273,8 +271,8 @@ void CallInst::init(Value *Func, Value *Actual1, Value *Actual2) { void CallInst::init(Value *Func, Value *Actual) { assert(NumOperands == 2 && "NumOperands not set up?"); - Op<ArgOffset -1>() = Func; - Op<ArgOffset + 0>() = Actual; + Op<-1>() = Func; + Op<0>() = Actual; const FunctionType *FTy = cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType()); @@ -290,7 +288,7 @@ void CallInst::init(Value *Func, Value *Actual) { void CallInst::init(Value *Func) { assert(NumOperands == 1 && "NumOperands not set up?"); - Op<ArgOffset -1>() = Func; + Op<-1>() = Func; const FunctionType *FTy = cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType()); @@ -893,6 +891,8 @@ AllocaInst::~AllocaInst() { void AllocaInst::setAlignment(unsigned Align) { assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!"); + assert(Align <= MaximumAlignment && + "Alignment is greater than MaximumAlignment!"); setInstructionSubclassData(Log2_32(Align) + 1); assert(getAlignment() == Align && "Alignment representation error!"); } @@ -1028,8 +1028,11 @@ LoadInst::LoadInst(Value *Ptr, const char *Name, bool isVolatile, void LoadInst::setAlignment(unsigned Align) { assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!"); + assert(Align <= MaximumAlignment && + "Alignment is greater than MaximumAlignment!"); setInstructionSubclassData((getSubclassDataFromInstruction() & 1) | ((Log2_32(Align)+1)<<1)); + assert(getAlignment() == Align && "Alignment representation error!"); } //===----------------------------------------------------------------------===// @@ -1124,8 +1127,11 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, void StoreInst::setAlignment(unsigned Align) { assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!"); + assert(Align <= MaximumAlignment && + "Alignment is greater than MaximumAlignment!"); setInstructionSubclassData((getSubclassDataFromInstruction() & 1) | ((Log2_32(Align)+1) << 1)); + assert(getAlignment() == Align && "Alignment representation error!"); } //===----------------------------------------------------------------------===// @@ -1424,9 +1430,24 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2, return false; const VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType()); - if 
(!isa<Constant>(Mask) || MaskTy == 0 || - !MaskTy->getElementType()->isIntegerTy(32)) + if (MaskTy == 0 || !MaskTy->getElementType()->isIntegerTy(32)) + return false; + + // Check to see if Mask is valid. + if (const ConstantVector *MV = dyn_cast<ConstantVector>(Mask)) { + const VectorType *VTy = cast<VectorType>(V1->getType()); + for (unsigned i = 0, e = MV->getNumOperands(); i != e; ++i) { + if (ConstantInt* CI = dyn_cast<ConstantInt>(MV->getOperand(i))) { + if (CI->uge(VTy->getNumElements()*2)) + return false; + } else if (!isa<UndefValue>(MV->getOperand(i))) { + return false; + } + } + } + else if (!isa<UndefValue>(Mask) && !isa<ConstantAggregateZero>(Mask)) return false; + return true; } diff --git a/lib/VMCore/LLVMContext.cpp b/lib/VMCore/LLVMContext.cpp index 4d61363b9394d..563c651315a33 100644 --- a/lib/VMCore/LLVMContext.cpp +++ b/lib/VMCore/LLVMContext.cpp @@ -110,21 +110,18 @@ static bool isValidName(StringRef MDName) { /// getMDKindID - Return a unique non-zero ID for the specified metadata kind. unsigned LLVMContext::getMDKindID(StringRef Name) const { assert(isValidName(Name) && "Invalid MDNode name"); - - unsigned &Entry = pImpl->CustomMDKindNames[Name]; - + // If this is new, assign it its ID. - if (Entry == 0) Entry = pImpl->CustomMDKindNames.size(); - return Entry; + return + pImpl->CustomMDKindNames.GetOrCreateValue( + Name, pImpl->CustomMDKindNames.size()).second; } /// getMDKindNames - Populate the client-supplied SmallVector with the names /// of the custom metadata kinds, indexed by kind ID. void LLVMContext::getMDKindNames(SmallVectorImpl<StringRef> &Names) const { - Names.resize(pImpl->CustomMDKindNames.size()+1); - Names[0] = ""; + Names.resize(pImpl->CustomMDKindNames.size()); for (StringMap<unsigned>::const_iterator I = pImpl->CustomMDKindNames.begin(), E = pImpl->CustomMDKindNames.end(); I != E; ++I) - // MD Handlers are numbered from 1.
Names[I->second] = I->first(); } diff --git a/lib/VMCore/LLVMContextImpl.cpp b/lib/VMCore/LLVMContextImpl.cpp index 9e41a08156084..93a075f0fccbb 100644 --- a/lib/VMCore/LLVMContextImpl.cpp +++ b/lib/VMCore/LLVMContextImpl.cpp @@ -57,14 +57,11 @@ LLVMContextImpl::~LLVMContextImpl() { DropReferences()); std::for_each(StructConstants.map_begin(), StructConstants.map_end(), DropReferences()); - std::for_each(UnionConstants.map_begin(), UnionConstants.map_end(), - DropReferences()); std::for_each(VectorConstants.map_begin(), VectorConstants.map_end(), DropReferences()); ExprConstants.freeConstants(); ArrayConstants.freeConstants(); StructConstants.freeConstants(); - UnionConstants.freeConstants(); VectorConstants.freeConstants(); AggZeroConstants.freeConstants(); NullPtrConstants.freeConstants(); diff --git a/lib/VMCore/LLVMContextImpl.h b/lib/VMCore/LLVMContextImpl.h index 4876f5d5075a8..51b2992898c02 100644 --- a/lib/VMCore/LLVMContextImpl.h +++ b/lib/VMCore/LLVMContextImpl.h @@ -144,10 +144,6 @@ public: ConstantStruct, true /*largekey*/> StructConstantsTy; StructConstantsTy StructConstants; - typedef ConstantUniqueMap<Constant*, UnionType, ConstantUnion> - UnionConstantsTy; - UnionConstantsTy UnionConstants; - typedef ConstantUniqueMap<std::vector<Constant*>, VectorType, ConstantVector> VectorConstantsTy; VectorConstantsTy VectorConstants; @@ -192,7 +188,6 @@ public: TypeMap<PointerValType, PointerType> PointerTypes; TypeMap<FunctionValType, FunctionType> FunctionTypes; TypeMap<StructValType, StructType> StructTypes; - TypeMap<UnionValType, UnionType> UnionTypes; TypeMap<IntegerValType, IntegerType> IntegerTypes; // Opaque types are not structurally uniqued, so don't use TypeMap. diff --git a/lib/VMCore/Metadata.cpp b/lib/VMCore/Metadata.cpp index 3100d4ac7c9c3..da69c43ff7359 100644 --- a/lib/VMCore/Metadata.cpp +++ b/lib/VMCore/Metadata.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/SmallString.h" #include "SymbolTableListTraitsImpl.h" +#include "llvm/Support/LeakDetector.h" #include "llvm/Support/ValueHandle.h" using namespace llvm; @@ -186,6 +187,21 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals, unsigned NumVals, FunctionLocalness FL, bool Insert) { LLVMContextImpl *pImpl = Context.pImpl; + + // Add all the operand pointers. Note that we don't have to add the + // isFunctionLocal bit because that's implied by the operands. + // Note that if the operands are later nulled out, the node will be + // removed from the uniquing map. + FoldingSetNodeID ID; + for (unsigned i = 0; i != NumVals; ++i) + ID.AddPointer(Vals[i]); + + void *InsertPoint; + MDNode *N = NULL; + + if ((N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint))) + return N; + bool isFunctionLocal = false; switch (FL) { case FL_Unknown: @@ -206,20 +222,6 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals, break; } - FoldingSetNodeID ID; - for (unsigned i = 0; i != NumVals; ++i) - ID.AddPointer(Vals[i]); - ID.AddBoolean(isFunctionLocal); - - void *InsertPoint; - MDNode *N = NULL; - - if ((N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint))) - return N; - - if (!Insert) - return NULL; - // Coallocate space for the node and Operands together, then placement new. 
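// The co-allocation idiom in isolation (a sketch with hypothetical names,
// not part of this patch): one allocation holds the node header plus its
// trailing operand array, and placement new constructs the header at the
// front of it.
#include <cstdlib>
#include <new>

struct Operand { void *Val; };
struct Node {
  unsigned NumOperands;
  explicit Node(unsigned N) : NumOperands(N) {}
  Operand *op_begin() { return reinterpret_cast<Operand*>(this + 1); }
};

static Node *allocateNode(unsigned NumOps) {
  void *Ptr = std::malloc(sizeof(Node) + NumOps * sizeof(Operand));
  return new (Ptr) Node(NumOps);   // operands live right after the header
}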
void *Ptr = malloc(sizeof(MDNode)+NumVals*sizeof(MDNodeOperand)); N = new (Ptr) MDNode(Context, Vals, NumVals, isFunctionLocal); @@ -244,15 +246,42 @@ MDNode *MDNode::getIfExists(LLVMContext &Context, Value *const *Vals, return getMDNode(Context, Vals, NumVals, FL_Unknown, false); } +MDNode *MDNode::getTemporary(LLVMContext &Context, Value *const *Vals, + unsigned NumVals) { + MDNode *N = (MDNode *)malloc(sizeof(MDNode)+NumVals*sizeof(MDNodeOperand)); + N = new (N) MDNode(Context, Vals, NumVals, FL_No); + N->setValueSubclassData(N->getSubclassDataFromValue() | + NotUniquedBit); + LeakDetector::addGarbageObject(N); + return N; +} + +void MDNode::deleteTemporary(MDNode *N) { + assert(N->use_empty() && "Temporary MDNode has uses!"); + assert(!N->getContext().pImpl->MDNodeSet.RemoveNode(N) && + "Deleting a non-temporary uniqued node!"); + assert(!N->getContext().pImpl->NonUniquedMDNodes.erase(N) && + "Deleting a non-temporary non-uniqued node!"); + assert((N->getSubclassDataFromValue() & NotUniquedBit) && + "Temporary MDNode does not have NotUniquedBit set!"); + assert((N->getSubclassDataFromValue() & DestroyFlag) == 0 && + "Temporary MDNode has DestroyFlag set!"); + LeakDetector::removeGarbageObject(N); + N->destroy(); +} + /// getOperand - Return specified operand. Value *MDNode::getOperand(unsigned i) const { return *getOperandPtr(const_cast<MDNode*>(this), i); } void MDNode::Profile(FoldingSetNodeID &ID) const { + // Add all the operand pointers. Note that we don't have to add the + // isFunctionLocal bit because that's implied by the operands. + // Note that if the operands are later nulled out, the node will be + // removed from the uniquing map. for (unsigned i = 0, e = getNumOperands(); i != e; ++i) ID.AddPointer(getOperand(i)); - ID.AddBoolean(isFunctionLocal()); } void MDNode::setIsNotUniqued() { @@ -301,7 +330,8 @@ void MDNode::replaceOperand(MDNodeOperand *Op, Value *To) { // If we are dropping an argument to null, we choose to not unique the MDNode // anymore. This commonly occurs during destruction, and uniquing these - // brings little reuse. + // brings little reuse. Also, this means we don't need to include + // isFunctionLocal bits in FoldingSetNodeIDs for MDNodes. if (To == 0) { setIsNotUniqued(); return; @@ -324,59 +354,35 @@ void MDNode::replaceOperand(MDNodeOperand *Op, Value *To) { // InsertPoint will have been set by the FindNodeOrInsertPos call. pImpl->MDNodeSet.InsertNode(this, InsertPoint); + + // If this MDValue was previously function-local but no longer is, clear + // its function-local flag. + if (isFunctionLocal() && !isFunctionLocalValue(To)) { + bool isStillFunctionLocal = false; + for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { + Value *V = getOperand(i); + if (!V) continue; + if (isFunctionLocalValue(V)) { + isStillFunctionLocal = true; + break; + } + } + if (!isStillFunctionLocal) + setValueSubclassData(getSubclassDataFromValue() & ~FunctionLocalBit); + } } //===----------------------------------------------------------------------===// // NamedMDNode implementation. // -namespace llvm { -// SymbolTableListTraits specialization for MDSymbolTable. 
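// The intended lifecycle for the temporary nodes introduced above, as a
// short usage sketch (2.8-era headers assumed; `example` and its arguments
// are hypothetical): build a placeholder for a forward reference, point
// other metadata at it, then RAUW the real node in and delete the temporary.
#include "llvm/LLVMContext.h"
#include "llvm/Metadata.h"
using namespace llvm;

void example(LLVMContext &Context, Value *Payload) {
  MDNode *Temp = MDNode::getTemporary(Context, &Payload, 1); // never uniqued
  Value *Wrap[] = { Temp };
  MDNode *User = MDNode::get(Context, Wrap, 1);  // refers to the placeholder
  MDNode *Real = MDNode::get(Context, &Payload, 1);
  Temp->replaceAllUsesWith(Real);                // forward reference resolved
  MDNode::deleteTemporary(Temp);                 // requires use_empty()
  (void)User;
}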
-void ilist_traits<NamedMDNode> -::addNodeToList(NamedMDNode *N) { - assert(N->getParent() == 0 && "Value already in a container!!"); - Module *Owner = getListOwner(); - N->setParent(Owner); - MDSymbolTable &ST = Owner->getMDSymbolTable(); - ST.insert(N->getName(), N); -} - -void ilist_traits<NamedMDNode>::removeNodeFromList(NamedMDNode *N) { - N->setParent(0); - Module *Owner = getListOwner(); - MDSymbolTable &ST = Owner->getMDSymbolTable(); - ST.remove(N->getName()); -} -} - -static SmallVector<WeakVH, 4> &getNMDOps(void *Operands) { - return *(SmallVector<WeakVH, 4>*)Operands; -} - -NamedMDNode::NamedMDNode(LLVMContext &C, const Twine &N, - MDNode *const *MDs, - unsigned NumMDs, Module *ParentModule) - : Value(Type::getMetadataTy(C), Value::NamedMDNodeVal), Parent(0) { - setName(N); - Operands = new SmallVector<WeakVH, 4>(); - - SmallVector<WeakVH, 4> &Node = getNMDOps(Operands); - for (unsigned i = 0; i != NumMDs; ++i) - Node.push_back(WeakVH(MDs[i])); - - if (ParentModule) - ParentModule->getNamedMDList().push_back(this); +static SmallVector<TrackingVH<MDNode>, 4> &getNMDOps(void *Operands) { + return *(SmallVector<TrackingVH<MDNode>, 4>*)Operands; } -NamedMDNode *NamedMDNode::Create(const NamedMDNode *NMD, Module *M) { - assert(NMD && "Invalid source NamedMDNode!"); - SmallVector<MDNode *, 4> Elems; - Elems.reserve(NMD->getNumOperands()); - - for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) - Elems.push_back(NMD->getOperand(i)); - return new NamedMDNode(NMD->getContext(), NMD->getName().data(), - Elems.data(), Elems.size(), M); +NamedMDNode::NamedMDNode(const Twine &N) + : Name(N.str()), Parent(0), + Operands(new SmallVector<TrackingVH<MDNode>, 4>()) { } NamedMDNode::~NamedMDNode() { @@ -392,18 +398,20 @@ unsigned NamedMDNode::getNumOperands() const { /// getOperand - Return specified operand. MDNode *NamedMDNode::getOperand(unsigned i) const { assert(i < getNumOperands() && "Invalid Operand number!"); - return dyn_cast_or_null<MDNode>(getNMDOps(Operands)[i]); + return dyn_cast<MDNode>(&*getNMDOps(Operands)[i]); } /// addOperand - Add metadata Operand. void NamedMDNode::addOperand(MDNode *M) { - getNMDOps(Operands).push_back(WeakVH(M)); + assert(!M->isFunctionLocal() && + "NamedMDNode operands must not be function-local!"); + getNMDOps(Operands).push_back(TrackingVH<MDNode>(M)); } /// eraseFromParent - Drop all references and remove the node from parent /// module. void NamedMDNode::eraseFromParent() { - getParent()->getNamedMDList().erase(this); + getParent()->eraseNamedMetadata(this); } /// dropAllReferences - Remove all uses and clear node vector. @@ -411,22 +419,6 @@ void NamedMDNode::dropAllReferences() { getNMDOps(Operands).clear(); } -/// setName - Set the name of this named metadata. -void NamedMDNode::setName(const Twine &NewName) { - assert (!NewName.isTriviallyEmpty() && "Invalid named metadata name!"); - - SmallString<256> NameData; - StringRef NameRef = NewName.toStringRef(NameData); - - // Name isn't changing? - if (getName() == NameRef) - return; - - Name = NameRef.str(); - if (Parent) - Parent->getMDSymbolTable().insert(NameRef, this); -} - /// getName - Return a constant reference to this named metadata's name. 
StringRef NamedMDNode::getName() const { return StringRef(Name); } @@ -445,10 +437,6 @@ MDNode *Instruction::getMetadataImpl(const char *Kind) const { return getMetadataImpl(getContext().getMDKindID(Kind)); } -void Instruction::setDbgMetadata(MDNode *Node) { - DbgLoc = DebugLoc::getFromDILocation(Node); -} - /// setMetadata - Set the metadata of the specified kind to the specified /// node. This updates/replaces metadata if already present, or removes it if /// Node is null. @@ -567,13 +555,11 @@ getAllMetadataOtherThanDebugLocImpl(SmallVectorImpl<std::pair<unsigned, } -/// removeAllMetadata - Remove all metadata from this instruction. -void Instruction::removeAllMetadata() { - assert(hasMetadata() && "Caller should check"); - DbgLoc = DebugLoc(); - if (hasMetadataHashEntry()) { - getContext().pImpl->MetadataStore.erase(this); - setHasMetadataHashEntry(false); - } +/// clearMetadataHashEntries - Clear all hashtable-based metadata from +/// this instruction. +void Instruction::clearMetadataHashEntries() { + assert(hasMetadataHashEntry() && "Caller should check"); + getContext().pImpl->MetadataStore.erase(this); + setHasMetadataHashEntry(false); } diff --git a/lib/VMCore/Module.cpp b/lib/VMCore/Module.cpp index 38a51dfd5d388..d7ddf96cb0700 100644 --- a/lib/VMCore/Module.cpp +++ b/lib/VMCore/Module.cpp @@ -58,10 +58,10 @@ template class llvm::SymbolTableListTraits<GlobalAlias, Module>; // Module::Module(StringRef MID, LLVMContext& C) - : Context(C), Materializer(NULL), ModuleID(MID), DataLayout("") { + : Context(C), Materializer(NULL), ModuleID(MID) { ValSymTab = new ValueSymbolTable(); TypeSymTab = new TypeSymbolTable(); - NamedMDSymTab = new MDSymbolTable(); + NamedMDSymTab = new StringMap<NamedMDNode *>(); } Module::~Module() { @@ -73,7 +73,7 @@ Module::~Module() { NamedMDList.clear(); delete ValSymTab; delete TypeSymTab; - delete NamedMDSymTab; + delete static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab); } /// Target endian information... @@ -316,19 +316,28 @@ GlobalAlias *Module::getNamedAlias(StringRef Name) const { NamedMDNode *Module::getNamedMetadata(const Twine &Name) const { SmallString<256> NameData; StringRef NameRef = Name.toStringRef(NameData); - return NamedMDSymTab->lookup(NameRef); + return static_cast<StringMap<NamedMDNode*> *>(NamedMDSymTab)->lookup(NameRef); } /// getOrInsertNamedMetadata - Return the first named MDNode in the module /// with the specified name. This method returns a new NamedMDNode if a /// NamedMDNode with the specified name is not found. NamedMDNode *Module::getOrInsertNamedMetadata(StringRef Name) { - NamedMDNode *NMD = NamedMDSymTab->lookup(Name); - if (!NMD) - NMD = NamedMDNode::Create(getContext(), Name, NULL, 0, this); + NamedMDNode *&NMD = + (*static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab))[Name]; + if (!NMD) { + NMD = new NamedMDNode(Name); + NMD->setParent(this); + NamedMDList.push_back(NMD); + } return NMD; } +void Module::eraseNamedMetadata(NamedMDNode *NMD) { + static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab)->erase(NMD->getName()); + NamedMDList.erase(NMD); +} + //===----------------------------------------------------------------------===// // Methods for easy access to the types in the module.
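// The get-or-insert shape used by getOrInsertNamedMetadata above, reduced to
// a standalone sketch (std::map standing in for the StringMap; the types are
// hypothetical): operator[] yields a reference to the slot, so one lookup
// serves both the hit and the miss path.
#include <map>
#include <string>

struct Named {
  std::string Name;
  Named(const std::string &N) : Name(N) {}
};

Named *getOrInsert(std::map<std::string, Named*> &Table,
                   const std::string &Name) {
  Named *&Slot = Table[Name];  // default-constructs a null slot on a miss
  if (!Slot)
    Slot = new Named(Name);    // created exactly once per name
  return Slot;
}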
// diff --git a/lib/VMCore/Pass.cpp b/lib/VMCore/Pass.cpp index efd98af0f443c..a7d7f61dd7622 100644 --- a/lib/VMCore/Pass.cpp +++ b/lib/VMCore/Pass.cpp @@ -14,35 +14,18 @@ //===----------------------------------------------------------------------===// #include "llvm/Pass.h" -#include "llvm/PassManager.h" -#include "llvm/Module.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringMap.h" +#include "llvm/PassRegistry.h" #include "llvm/Assembly/PrintModulePass.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/PassNameParser.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/System/Atomic.h" -#include "llvm/System/Mutex.h" -#include "llvm/System/Threading.h" -#include <algorithm> -#include <map> -#include <set> using namespace llvm; //===----------------------------------------------------------------------===// // Pass Implementation // -Pass::Pass(PassKind K, intptr_t pid) : Resolver(0), PassID(pid), Kind(K) { - assert(pid && "pid cannot be 0"); -} - -Pass::Pass(PassKind K, const void *pid) - : Resolver(0), PassID((intptr_t)pid), Kind(K) { - assert(pid && "pid cannot be 0"); -} +Pass::Pass(PassKind K, char &pid) : Resolver(0), PassID(&pid), Kind(K) { } // Force out-of-line virtual method. Pass::~Pass() { @@ -61,8 +44,8 @@ PassManagerType ModulePass::getPotentialPassManagerType() const { return PMT_ModulePassManager; } -bool Pass::mustPreserveAnalysisID(const PassInfo *AnalysisID) const { - return Resolver->getAnalysisIfAvailable(AnalysisID, true) != 0; +bool Pass::mustPreserveAnalysisID(char &AID) const { + return Resolver->getAnalysisIfAvailable(&AID, true) != 0; } // dumpPassStructure - Implement the -debug-passes=Structure option @@ -75,7 +58,9 @@ void Pass::dumpPassStructure(unsigned Offset) { /// Registration templates, but can be overloaded directly. /// const char *Pass::getPassName() const { - if (const PassInfo *PI = getPassInfo()) + AnalysisID AID = getPassID(); + const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(AID); + if (PI) return PI->getPassName(); return "Unnamed pass: implement Pass::getPassName()"; } @@ -101,7 +86,7 @@ void Pass::verifyAnalysis() const { // By default, don't do anything. } -void *Pass::getAdjustedAnalysisPointer(const PassInfo *) { +void *Pass::getAdjustedAnalysisPointer(AnalysisID AID) { return this; } @@ -150,30 +135,6 @@ Pass *FunctionPass::createPrinterPass(raw_ostream &O, return createPrintFunctionPass(Banner, &O); } -// run - On a module, we run this pass by initializing, runOnFunction'ing once -// for every function in the module, then by finalizing. -// -bool FunctionPass::runOnModule(Module &M) { - bool Changed = doInitialization(M); - - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) - if (!I->isDeclaration()) // Passes are not run on external functions! - Changed |= runOnFunction(*I); - - return Changed | doFinalization(M); -} - -// run - On a function, we simply initialize, run the function, then finalize. -// -bool FunctionPass::run(Function &F) { - // Passes are not run on external functions! - if (F.isDeclaration()) return false; - - bool Changed = doInitialization(*F.getParent()); - Changed |= runOnFunction(F); - return Changed | doFinalization(*F.getParent()); -} - bool FunctionPass::doInitialization(Module &) { // By default, don't do anything. return false; @@ -199,16 +160,6 @@ Pass *BasicBlockPass::createPrinterPass(raw_ostream &O, return 0; } -// To run this pass on a function, we simply call runOnBasicBlock once for each -// function. 
-// -bool BasicBlockPass::runOnFunction(Function &F) { - bool Changed = doInitialization(F); - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) - Changed |= runOnBasicBlock(*I); - return Changed | doFinalization(F); -} - bool BasicBlockPass::doInitialization(Module &) { // By default, don't do anything. return false; @@ -233,161 +184,12 @@ PassManagerType BasicBlockPass::getPotentialPassManagerType() const { return PMT_BasicBlockPassManager; } -//===----------------------------------------------------------------------===// -// Pass Registration mechanism -// -namespace { -class PassRegistrar { - /// Guards the contents of this class. - mutable sys::SmartMutex<true> Lock; - - /// PassInfoMap - Keep track of the passinfo object for each registered llvm - /// pass. - typedef std::map<intptr_t, const PassInfo*> MapType; - MapType PassInfoMap; - - typedef StringMap<const PassInfo*> StringMapType; - StringMapType PassInfoStringMap; - - /// AnalysisGroupInfo - Keep track of information for each analysis group. - struct AnalysisGroupInfo { - std::set<const PassInfo *> Implementations; - }; - - /// AnalysisGroupInfoMap - Information for each analysis group. - std::map<const PassInfo *, AnalysisGroupInfo> AnalysisGroupInfoMap; - -public: - - const PassInfo *GetPassInfo(intptr_t TI) const { - sys::SmartScopedLock<true> Guard(Lock); - MapType::const_iterator I = PassInfoMap.find(TI); - return I != PassInfoMap.end() ? I->second : 0; - } - - const PassInfo *GetPassInfo(StringRef Arg) const { - sys::SmartScopedLock<true> Guard(Lock); - StringMapType::const_iterator I = PassInfoStringMap.find(Arg); - return I != PassInfoStringMap.end() ? I->second : 0; - } - - void RegisterPass(const PassInfo &PI) { - sys::SmartScopedLock<true> Guard(Lock); - bool Inserted = - PassInfoMap.insert(std::make_pair(PI.getTypeInfo(),&PI)).second; - assert(Inserted && "Pass registered multiple times!"); Inserted=Inserted; - PassInfoStringMap[PI.getPassArgument()] = &PI; - } - - void UnregisterPass(const PassInfo &PI) { - sys::SmartScopedLock<true> Guard(Lock); - MapType::iterator I = PassInfoMap.find(PI.getTypeInfo()); - assert(I != PassInfoMap.end() && "Pass registered but not in map!"); - - // Remove pass from the map. - PassInfoMap.erase(I); - PassInfoStringMap.erase(PI.getPassArgument()); - } - - void EnumerateWith(PassRegistrationListener *L) { - sys::SmartScopedLock<true> Guard(Lock); - for (MapType::const_iterator I = PassInfoMap.begin(), - E = PassInfoMap.end(); I != E; ++I) - L->passEnumerate(I->second); - } - - - /// Analysis Group Mechanisms. 
- void RegisterAnalysisGroup(PassInfo *InterfaceInfo, - const PassInfo *ImplementationInfo, - bool isDefault) { - sys::SmartScopedLock<true> Guard(Lock); - AnalysisGroupInfo &AGI = AnalysisGroupInfoMap[InterfaceInfo]; - assert(AGI.Implementations.count(ImplementationInfo) == 0 && - "Cannot add a pass to the same analysis group more than once!"); - AGI.Implementations.insert(ImplementationInfo); - if (isDefault) { - assert(InterfaceInfo->getNormalCtor() == 0 && - "Default implementation for analysis group already specified!"); - assert(ImplementationInfo->getNormalCtor() && - "Cannot specify pass as default if it does not have a default ctor"); - InterfaceInfo->setNormalCtor(ImplementationInfo->getNormalCtor()); - } - } -}; -} - -static std::vector<PassRegistrationListener*> *Listeners = 0; -static sys::SmartMutex<true> ListenersLock; - -static PassRegistrar *PassRegistrarObj = 0; -static PassRegistrar *getPassRegistrar() { - // Use double-checked locking to safely initialize the registrar when - // we're running in multithreaded mode. - PassRegistrar* tmp = PassRegistrarObj; - if (llvm_is_multithreaded()) { - sys::MemoryFence(); - if (!tmp) { - llvm_acquire_global_lock(); - tmp = PassRegistrarObj; - if (!tmp) { - tmp = new PassRegistrar(); - sys::MemoryFence(); - PassRegistrarObj = tmp; - } - llvm_release_global_lock(); - } - } else if (!tmp) { - PassRegistrarObj = new PassRegistrar(); - } - - return PassRegistrarObj; -} - -namespace { - -// FIXME: We use ManagedCleanup to erase the pass registrar on shutdown. -// Unfortunately, passes are registered with static ctors, and having -// llvm_shutdown clear this map prevents successful ressurection after -// llvm_shutdown is run. Ideally we should find a solution so that we don't -// leak the map, AND can still resurrect after shutdown. -void cleanupPassRegistrar(void*) { - if (PassRegistrarObj) { - delete PassRegistrarObj; - PassRegistrarObj = 0; - } -} -ManagedCleanup<&cleanupPassRegistrar> registrarCleanup ATTRIBUTE_USED; - -} - -// getPassInfo - Return the PassInfo data structure that corresponds to this -// pass... -const PassInfo *Pass::getPassInfo() const { - return lookupPassInfo(PassID); -} - -const PassInfo *Pass::lookupPassInfo(intptr_t TI) { - return getPassRegistrar()->GetPassInfo(TI); +const PassInfo *Pass::lookupPassInfo(const void *TI) { + return PassRegistry::getPassRegistry()->getPassInfo(TI); } const PassInfo *Pass::lookupPassInfo(StringRef Arg) { - return getPassRegistrar()->GetPassInfo(Arg); -} - -void PassInfo::registerPass() { - getPassRegistrar()->RegisterPass(*this); - - // Notify any listeners. - sys::SmartScopedLock<true> Lock(ListenersLock); - if (Listeners) - for (std::vector<PassRegistrationListener*>::iterator - I = Listeners->begin(), E = Listeners->end(); I != E; ++I) - (*I)->passRegistered(this); -} - -void PassInfo::unregisterPass() { - getPassRegistrar()->UnregisterPass(*this); + return PassRegistry::getPassRegistry()->getPassInfo(Arg); } Pass *PassInfo::createPass() const { @@ -404,32 +206,11 @@ Pass *PassInfo::createPass() const { // RegisterAGBase implementation // -RegisterAGBase::RegisterAGBase(const char *Name, intptr_t InterfaceID, - intptr_t PassID, bool isDefault) - : PassInfo(Name, InterfaceID) { - - PassInfo *InterfaceInfo = - const_cast<PassInfo*>(Pass::lookupPassInfo(InterfaceID)); - if (InterfaceInfo == 0) { - // First reference to Interface, register it now. 
- registerPass(); - InterfaceInfo = this; - } - assert(isAnalysisGroup() && - "Trying to join an analysis group that is a normal pass!"); - - if (PassID) { - const PassInfo *ImplementationInfo = Pass::lookupPassInfo(PassID); - assert(ImplementationInfo && - "Must register pass before adding to AnalysisGroup!"); - - // Make sure we keep track of the fact that the implementation implements - // the interface. - PassInfo *IIPI = const_cast<PassInfo*>(ImplementationInfo); - IIPI->addInterfaceImplemented(InterfaceInfo); - - getPassRegistrar()->RegisterAnalysisGroup(InterfaceInfo, IIPI, isDefault); - } +RegisterAGBase::RegisterAGBase(const char *Name, const void *InterfaceID, + const void *PassID, bool isDefault) + : PassInfo(Name, InterfaceID) { + PassRegistry::getPassRegistry()->registerAnalysisGroup(InterfaceID, PassID, + *this, isDefault); } @@ -440,31 +221,19 @@ RegisterAGBase::RegisterAGBase(const char *Name, intptr_t InterfaceID, // PassRegistrationListener ctor - Add the current object to the list of // PassRegistrationListeners... PassRegistrationListener::PassRegistrationListener() { - sys::SmartScopedLock<true> Lock(ListenersLock); - if (!Listeners) Listeners = new std::vector<PassRegistrationListener*>(); - Listeners->push_back(this); + PassRegistry::getPassRegistry()->addRegistrationListener(this); } // dtor - Remove object from list of listeners... PassRegistrationListener::~PassRegistrationListener() { - sys::SmartScopedLock<true> Lock(ListenersLock); - std::vector<PassRegistrationListener*>::iterator I = - std::find(Listeners->begin(), Listeners->end(), this); - assert(Listeners && I != Listeners->end() && - "PassRegistrationListener not registered!"); - Listeners->erase(I); - - if (Listeners->empty()) { - delete Listeners; - Listeners = 0; - } + PassRegistry::getPassRegistry()->removeRegistrationListener(this); } // enumeratePasses - Iterate over the registered passes, calling the // passEnumerate callback on each PassInfo object. // void PassRegistrationListener::enumeratePasses() { - getPassRegistrar()->EnumerateWith(this); + PassRegistry::getPassRegistry()->enumerateWith(this); } PassNameParser::~PassNameParser() {} @@ -481,7 +250,7 @@ namespace { void passEnumerate(const PassInfo *P) { if (P->isCFGOnlyPass()) - CFGOnlyList.push_back(P); + CFGOnlyList.push_back(P->getTypeInfo()); } }; } @@ -501,15 +270,25 @@ void AnalysisUsage::setPreservesCFG() { GetCFGOnlyPasses(Preserved).enumeratePasses(); } -AnalysisUsage &AnalysisUsage::addRequiredID(AnalysisID ID) { - assert(ID && "Pass class not registered!"); - Required.push_back(ID); +AnalysisUsage &AnalysisUsage::addPreserved(StringRef Arg) { + const PassInfo *PI = Pass::lookupPassInfo(Arg); + // If the pass exists, preserve it. Otherwise silently do nothing. 
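// A note on the raw IDs being stored here: getTypeInfo() returns the pass's
// AnalysisID, which under the new scheme is simply the address of a static
// char declared by each pass. The idiom in miniature (hypothetical passes):
#include <cassert>

struct PassA { static char ID; };
struct PassB { static char ID; };
char PassA::ID = 0;
char PassB::ID = 0;

int main() {
  const void *A = &PassA::ID;
  const void *B = &PassB::ID;
  assert(A != B);  // distinct statics give collision-free identities
  return 0;
}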
+ if (PI) Preserved.push_back(PI->getTypeInfo()); return *this; } -AnalysisUsage &AnalysisUsage::addRequiredTransitiveID(AnalysisID ID) { - assert(ID && "Pass class not registered!"); +AnalysisUsage &AnalysisUsage::addRequiredID(const void *ID) { Required.push_back(ID); - RequiredTransitive.push_back(ID); + return *this; +} + +AnalysisUsage &AnalysisUsage::addRequiredID(char &ID) { + Required.push_back(&ID); + return *this; +} + +AnalysisUsage &AnalysisUsage::addRequiredTransitiveID(char &ID) { + Required.push_back(&ID); + RequiredTransitive.push_back(&ID); return *this; } diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp index 296b0d13a710b..ab4d4e55c7504 100644 --- a/lib/VMCore/PassManager.cpp +++ b/lib/VMCore/PassManager.cpp @@ -7,12 +7,13 @@ // //===----------------------------------------------------------------------===// // -// This file implements the LLVM Pass Manager infrastructure. +// This file implements the LLVM Pass Manager infrastructure. // //===----------------------------------------------------------------------===// #include "llvm/PassManagers.h" +#include "llvm/PassManager.h" #include "llvm/Assembly/PrintModulePass.h" #include "llvm/Assembly/Writer.h" #include "llvm/Support/CommandLine.h" @@ -24,8 +25,6 @@ #include "llvm/Support/PassNameParser.h" #include "llvm/Support/raw_ostream.h" #include "llvm/System/Mutex.h" -#include "llvm/System/Threading.h" -#include "llvm-c/Core.h" #include <algorithm> #include <cstdio> #include <map> @@ -82,30 +81,32 @@ PrintAfterAll("print-after-all", /// This is a helper to determine whether to print IR before or /// after a pass. -static bool ShouldPrintBeforeOrAfterPass(Pass *P, +static bool ShouldPrintBeforeOrAfterPass(const void *PassID, PassOptionList &PassesToPrint) { - for (unsigned i = 0, ie = PassesToPrint.size(); i < ie; ++i) { - const llvm::PassInfo *PassInf = PassesToPrint[i]; - if (PassInf && P->getPassInfo()) - if (PassInf->getPassArgument() == - P->getPassInfo()->getPassArgument()) { - return true; - } + if (const llvm::PassInfo *PI = + PassRegistry::getPassRegistry()->getPassInfo(PassID)) { + for (unsigned i = 0, ie = PassesToPrint.size(); i < ie; ++i) { + const llvm::PassInfo *PassInf = PassesToPrint[i]; + if (PassInf) + if (PassInf->getPassArgument() == PI->getPassArgument()) { + return true; + } + } } return false; } - + /// This is a utility to check whether a pass should have IR dumped /// before it. -static bool ShouldPrintBeforePass(Pass *P) { - return PrintBeforeAll || ShouldPrintBeforeOrAfterPass(P, PrintBefore); +static bool ShouldPrintBeforePass(const void *PassID) { + return PrintBeforeAll || ShouldPrintBeforeOrAfterPass(PassID, PrintBefore); } /// This is a utility to check whether a pass should have IR dumped /// after it. 
-static bool ShouldPrintAfterPass(Pass *P) {
-  return PrintAfterAll || ShouldPrintBeforeOrAfterPass(P, PrintAfter);
+static bool ShouldPrintAfterPass(const void *PassID) {
+  return PrintAfterAll || ShouldPrintBeforeOrAfterPass(PassID, PrintAfter);
 }
 
 } // End of llvm namespace
@@ -124,9 +125,9 @@ void PassManagerPrettyStackEntry::print(raw_ostream &OS) const {
     OS << "Releasing pass '";
   else
     OS << "Running pass '";
-  
+
   OS << P->getPassName() << "'";
-  
+
   if (M) {
     OS << " on module '" << M->getModuleIdentifier() << "'.\n";
     return;
@@ -162,8 +163,8 @@ class BBPassManager : public PMDataManager, public FunctionPass {
 
 public:
   static char ID;
-  explicit BBPassManager(int Depth) 
-    : PMDataManager(Depth), FunctionPass(&ID) {}
+  explicit BBPassManager(int Depth)
+    : PMDataManager(Depth), FunctionPass(ID) {}
 
   /// Execute all of the passes scheduled for execution. Keep track of
   /// whether any of the passes modifies the function, and if so, return true.
@@ -202,8 +203,8 @@ public:
     return BP;
   }
 
-  virtual PassManagerType getPassManagerType() const { 
-    return PMT_BasicBlockPassManager; 
+  virtual PassManagerType getPassManagerType() const {
+    return PMT_BasicBlockPassManager;
   }
 };
 
@@ -223,9 +224,9 @@ private:
   bool wasRun;
 public:
   static char ID;
-  explicit FunctionPassManagerImpl(int Depth) : 
-    Pass(PT_PassManager, &ID), PMDataManager(Depth), 
-    PMTopLevelManager(TLM_Function), wasRun(false) { }
+  explicit FunctionPassManagerImpl(int Depth) :
+    Pass(PT_PassManager, ID), PMDataManager(Depth),
+    PMTopLevelManager(new FPPassManager(1)), wasRun(false) {}
 
   /// add - Add a pass to the queue of passes to run. This passes ownership of
   /// the Pass to the PassManager. When the PassManager is destroyed, the pass
@@ -234,8 +235,8 @@ public:
   void add(Pass *P) {
     schedulePass(P);
   }
- 
-  /// createPrinterPass - Get a function printer pass. 
+
+  /// createPrinterPass - Get a function printer pass.
   Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
     return createPrintFunctionPass(Banner, &O);
   }
@@ -251,12 +252,12 @@ public:
   /// doInitialization - Run all of the initializers for the function passes.
   ///
   bool doInitialization(Module &M);
-  
+
   /// doFinalization - Run all of the finalizers for the function passes.
   ///
   bool doFinalization(Module &M);
-  
+
   virtual PMDataManager *getAsPMDataManager() { return this; }
   virtual Pass *getAsPass() { return this; }
 
@@ -265,7 +266,7 @@ public:
     Info.setPreservesAll();
   }
 
-  inline void addTopLevelPass(Pass *P) {
+  void addTopLevelPass(Pass *P) {
     if (ImmutablePass *IP = P->getAsImmutablePass()) {
       // P is an immutable pass and it will be managed by this
       // top level manager. Set up analysis resolver to connect them.
@@ -288,6 +289,7 @@ public:
 };
 
 char FunctionPassManagerImpl::ID = 0;
+
 //===----------------------------------------------------------------------===//
 // MPPassManager
 //
@@ -298,11 +300,11 @@ class MPPassManager : public Pass, public PMDataManager {
 public:
   static char ID;
   explicit MPPassManager(int Depth) :
-    Pass(PT_PassManager, &ID), PMDataManager(Depth) { }
+    Pass(PT_PassManager, ID), PMDataManager(Depth) { }
 
   // Delete on the fly managers.
   virtual ~MPPassManager() {
-    for (std::map<Pass *, FunctionPassManagerImpl *>::iterator 
+    for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
            I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
          I != E; ++I) {
       FunctionPassManagerImpl *FPP = I->second;
@@ -310,7 +312,7 @@ public:
     }
   }
 
-  /// createPrinterPass - Get a module printer pass. 
+  /// createPrinterPass - Get a module printer pass.
  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
    return createPrintModulePass(&O, false, Banner);
  }
@@ -329,10 +331,10 @@ public:
   /// through getAnalysis interface.
   virtual void addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass);
 
-  /// Return function pass corresponding to PassInfo PI, that is 
+  /// Return function pass corresponding to PassInfo PI, that is
   /// required by module pass MP. Instantiate analysis pass, by using
   /// its runOnFunction() for function F.
-  virtual Pass* getOnTheFlyPass(Pass *MP, const PassInfo *PI, Function &F);
+  virtual Pass* getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F);
 
   virtual const char *getPassName() const {
     return "Module Pass Manager";
@@ -360,8 +362,8 @@ public:
     return static_cast<ModulePass *>(PassVector[N]);
   }
 
-  virtual PassManagerType getPassManagerType() const { 
-    return PMT_ModulePassManager; 
+  virtual PassManagerType getPassManagerType() const {
+    return PMT_ModulePassManager;
   }
 
 private:
@@ -383,8 +385,8 @@ class PassManagerImpl : public Pass,
 public:
   static char ID;
   explicit PassManagerImpl(int Depth) :
-    Pass(PT_PassManager, &ID), PMDataManager(Depth),
-    PMTopLevelManager(TLM_Pass) { }
+    Pass(PT_PassManager, ID), PMDataManager(Depth),
+    PMTopLevelManager(new MPPassManager(1)) {}
 
   /// add - Add a pass to the queue of passes to run. This passes ownership of
   /// the Pass to the PassManager. When the PassManager is destroyed, the pass
@@ -393,8 +395,8 @@ public:
   void add(Pass *P) {
     schedulePass(P);
   }
- 
-  /// createPrinterPass - Get a module printer pass. 
+
+  /// createPrinterPass - Get a module printer pass.
   Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
     return createPrintModulePass(&O, false, Banner);
   }
@@ -408,7 +410,7 @@ public:
     Info.setPreservesAll();
   }
 
-  inline void addTopLevelPass(Pass *P) {
+  void addTopLevelPass(Pass *P) {
     if (ImmutablePass *IP = P->getAsImmutablePass()) {
       // P is an immutable pass and it will be managed by this
       // top level manager. Set up analysis resolver to connect them.
@@ -451,7 +453,7 @@ class TimingInfo {
 public:
   // Use 'create' member to get this.
   TimingInfo() : TG("... Pass execution timing report ...") {}
-  
+
   // TimingDtor - Print out information about timing information
   ~TimingInfo() {
     // Delete all of the timers, which accumulate their info into the
@@ -469,7 +471,7 @@ public:
   /// getPassTimer - Return the timer for the specified pass if it exists.
   Timer *getPassTimer(Pass *P) {
-    if (P->getAsPMDataManager()) 
+    if (P->getAsPMDataManager())
       return 0;
 
     sys::SmartScopedLock<true> Lock(*TimingInfoMutex);
@@ -488,28 +490,20 @@ static TimingInfo *TheTimeInfo;
 // PMTopLevelManager implementation
 
 /// Initialize top level manager. Create first pass manager.
-PMTopLevelManager::PMTopLevelManager(enum TopLevelManagerType t) {
-  if (t == TLM_Pass) {
-    MPPassManager *MPP = new MPPassManager(1);
-    MPP->setTopLevelManager(this);
-    addPassManager(MPP);
-    activeStack.push(MPP);
-  } else if (t == TLM_Function) {
-    FPPassManager *FPP = new FPPassManager(1);
-    FPP->setTopLevelManager(this);
-    addPassManager(FPP);
-    activeStack.push(FPP);
-  }
+PMTopLevelManager::PMTopLevelManager(PMDataManager *PMDM) {
+  PMDM->setTopLevelManager(this);
+  addPassManager(PMDM);
+  activeStack.push(PMDM);
 }
 
 /// Set pass P as the last user of the given analysis passes.
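+/// For example, if P is the last pass that requires a DominatorTree, the tree
+/// can be freed as soon as P has run (the analysis is chosen for illustration).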
-void PMTopLevelManager::setLastUser(SmallVector<Pass *, 12> &AnalysisPasses, 
+void PMTopLevelManager::setLastUser(SmallVector<Pass *, 12> &AnalysisPasses,
                                     Pass *P) {
   for (SmallVector<Pass *, 12>::iterator I = AnalysisPasses.begin(),
          E = AnalysisPasses.end(); I != E; ++I) {
     Pass *AP = *I;
     LastUser[AP] = P;
-    
+
     if (P == AP)
       continue;
@@ -528,7 +522,7 @@ void PMTopLevelManager::setLastUser(SmallVector<Pass *, 12> &AnalysisPasses,
 /// Collect passes whose last user is P
 void PMTopLevelManager::collectLastUses(SmallVector<Pass *, 12> &LastUses,
                                         Pass *P) {
-  DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator DMI = 
+  DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator DMI =
     InversedLastUser.find(P);
   if (DMI == InversedLastUser.end())
     return;
@@ -544,7 +538,7 @@ void PMTopLevelManager::collectLastUses(SmallVector<Pass *, 12> &LastUses,
 AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) {
   AnalysisUsage *AnUsage = NULL;
   DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.find(P);
-  if (DMI != AnUsageMap.end()) 
+  if (DMI != AnUsageMap.end())
     AnUsage = DMI->second;
   else {
     AnUsage = new AnalysisUsage();
@@ -568,8 +562,9 @@ void PMTopLevelManager::schedulePass(Pass *P) {
   // If P is an analysis pass and it is available then do not
   // generate the analysis again. Stale analysis info should not be
   // available at this point.
-  if (P->getPassInfo() &&
-      P->getPassInfo()->isAnalysis() && findAnalysisPass(P->getPassInfo())) {
+  const PassInfo *PI =
+    PassRegistry::getPassRegistry()->getPassInfo(P->getPassID());
+  if (PI && PI->isAnalysis() && findAnalysisPass(P->getPassID())) {
     delete P;
     return;
   }
@@ -579,14 +574,15 @@ void PMTopLevelManager::schedulePass(Pass *P) {
   bool checkAnalysis = true;
   while (checkAnalysis) {
     checkAnalysis = false;
-  
+
     const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
     for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(),
            E = RequiredSet.end(); I != E; ++I) {
-      
+
       Pass *AnalysisPass = findAnalysisPass(*I);
       if (!AnalysisPass) {
-        AnalysisPass = (*I)->createPass();
+        const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+        AnalysisPass = PI->createPass();
         if (P->getPotentialPassManagerType () ==
             AnalysisPass->getPotentialPassManagerType())
           // Schedule analysis pass that is managed by the same pass manager.
@@ -595,12 +591,12 @@ void PMTopLevelManager::schedulePass(Pass *P) {
                  AnalysisPass->getPotentialPassManagerType()) {
           // Schedule analysis pass that is managed by a new manager.
           schedulePass(AnalysisPass);
-          // Recheck analysis passes to ensure that required analysises that
+          // Recheck analysis passes to ensure that required analyses that
           // are already checked are still available.
           checkAnalysis = true;
         } else
-          // Do not schedule this analysis. Lower level analsyis 
+          // Do not schedule this analysis. Lower level analysis
          // passes are run on the fly.
          delete AnalysisPass;
      }
    }
@@ -632,16 +628,21 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
 
   for (SmallVector<ImmutablePass *, 8>::iterator I = ImmutablePasses.begin(),
          E = ImmutablePasses.end(); P == NULL && I != E; ++I) {
-    const PassInfo *PI = (*I)->getPassInfo();
+    AnalysisID PI = (*I)->getPassID();
     if (PI == AID)
       P = *I;
 
     // If Pass not found then check the interfaces implemented by Immutable Pass
     if (!P) {
+      const PassInfo *PassInf =
+        PassRegistry::getPassRegistry()->getPassInfo(PI);
       const std::vector<const PassInfo*> &ImmPI =
-        PI->getInterfacesImplemented();
-      if (std::find(ImmPI.begin(), ImmPI.end(), AID) != ImmPI.end())
-        P = *I;
+        PassInf->getInterfacesImplemented();
+      for (std::vector<const PassInfo*>::const_iterator II = ImmPI.begin(),
+           EE = ImmPI.end(); II != EE; ++II) {
+        if ((*II)->getTypeInfo() == AID)
+          P = *I;
+      }
     }
   }
 
@@ -658,7 +659,7 @@ void PMTopLevelManager::dumpPasses() const {
   for (unsigned i = 0, e = ImmutablePasses.size(); i != e; ++i) {
     ImmutablePasses[i]->dumpPassStructure(0);
   }
-  
+
   // Every class that derives from PMDataManager also derives from Pass
   // (sometimes indirectly), but there's no inheritance relationship
   // between PMDataManager and Pass, so we have to getAsPass to get
@@ -684,15 +685,16 @@ void PMTopLevelManager::initializeAllAnalysisInfo() {
   for (SmallVector<PMDataManager *, 8>::iterator I = PassManagers.begin(),
          E = PassManagers.end(); I != E; ++I)
     (*I)->initializeAnalysisInfo();
-  
+
   // Initialize other pass managers
-  for (SmallVector<PMDataManager *, 8>::iterator I = IndirectPassManagers.begin(),
-         E = IndirectPassManagers.end(); I != E; ++I)
+  for (SmallVector<PMDataManager *, 8>::iterator
+       I = IndirectPassManagers.begin(), E = IndirectPassManagers.end();
+       I != E; ++I)
     (*I)->initializeAnalysisInfo();
 
   for (DenseMap<Pass *, Pass *>::iterator DMI = LastUser.begin(),
          DME = LastUser.end(); DMI != DME; ++DMI) {
-    DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator InvDMI = 
+    DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator InvDMI =
       InversedLastUser.find(DMI->second);
     if (InvDMI != InversedLastUser.end()) {
       SmallPtrSet<Pass *, 8> &L = InvDMI->second;
@@ -709,7 +711,7 @@ PMTopLevelManager::~PMTopLevelManager() {
   for (SmallVector<PMDataManager *, 8>::iterator I = PassManagers.begin(),
          E = PassManagers.end(); I != E; ++I)
     delete *I;
-  
+
   for (SmallVector<ImmutablePass *, 8>::iterator
          I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
     delete *I;
@@ -724,16 +726,19 @@ PMTopLevelManager::~PMTopLevelManager() {
 
 /// Augment AvailableAnalysis by adding analysis made available by pass P.
 void PMDataManager::recordAvailableAnalysis(Pass *P) {
-  const PassInfo *PI = P->getPassInfo();
-  if (PI == 0) return;
-  
+  AnalysisID PI = P->getPassID();
+
   AvailableAnalysis[PI] = P;
 
-  //This pass is the current implementation of all of the interfaces it
-  //implements as well.
-  const std::vector<const PassInfo*> &II = PI->getInterfacesImplemented();
+  assert(!AvailableAnalysis.empty());
+
+  // This pass is the current implementation of all of the interfaces it
+  // implements as well.
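+  // E.g. when an AliasAnalysis implementation runs, it also becomes the
+  // recorded provider for the AliasAnalysis group's interface ID (the group
+  // named here is illustrative).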
+ const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI); + if (PInf == 0) return; + const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented(); for (unsigned i = 0, e = II.size(); i != e; ++i) - AvailableAnalysis[II[i]] = P; + AvailableAnalysis[II[i]->getTypeInfo()] = P; } // Return true if P preserves high level analysis used by other @@ -742,18 +747,18 @@ bool PMDataManager::preserveHigherLevelAnalysis(Pass *P) { AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P); if (AnUsage->getPreservesAll()) return true; - + const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet(); for (SmallVector<Pass *, 8>::iterator I = HigherLevelAnalysis.begin(), E = HigherLevelAnalysis.end(); I != E; ++I) { Pass *P1 = *I; if (P1->getAsImmutablePass() == 0 && std::find(PreservedSet.begin(), PreservedSet.end(), - P1->getPassInfo()) == + P1->getPassID()) == PreservedSet.end()) return false; } - + return true; } @@ -788,7 +793,7 @@ void PMDataManager::removeNotPreservedAnalysis(Pass *P) { E = AvailableAnalysis.end(); I != E; ) { std::map<AnalysisID, Pass*>::iterator Info = I++; if (Info->second->getAsImmutablePass() == 0 && - std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) == + std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) == PreservedSet.end()) { // Remove this analysis if (PassDebugging >= Details) { @@ -807,12 +812,12 @@ void PMDataManager::removeNotPreservedAnalysis(Pass *P) { if (!InheritedAnalysis[Index]) continue; - for (std::map<AnalysisID, Pass*>::iterator + for (std::map<AnalysisID, Pass*>::iterator I = InheritedAnalysis[Index]->begin(), E = InheritedAnalysis[Index]->end(); I != E; ) { std::map<AnalysisID, Pass *>::iterator Info = I++; if (Info->second->getAsImmutablePass() == 0 && - std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) == + std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) == PreservedSet.end()) { // Remove this analysis if (PassDebugging >= Details) { @@ -861,23 +866,24 @@ void PMDataManager::freePass(Pass *P, StringRef Msg, P->releaseMemory(); } - if (const PassInfo *PI = P->getPassInfo()) { + AnalysisID PI = P->getPassID(); + if (const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI)) { // Remove the pass itself (if it is not already removed). AvailableAnalysis.erase(PI); // Remove all interfaces this pass implements, for which it is also // listed as the available implementation. - const std::vector<const PassInfo*> &II = PI->getInterfacesImplemented(); + const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented(); for (unsigned i = 0, e = II.size(); i != e; ++i) { std::map<AnalysisID, Pass*>::iterator Pos = - AvailableAnalysis.find(II[i]); + AvailableAnalysis.find(II[i]->getTypeInfo()); if (Pos != AvailableAnalysis.end() && Pos->second == P) AvailableAnalysis.erase(Pos); } } } -/// Add pass P into the PassVector. Update +/// Add pass P into the PassVector. Update /// AvailableAnalysis appropriately if ProcessAnalysis is true. void PMDataManager::add(Pass *P, bool ProcessAnalysis) { // This manager is going to manage pass P. 
Set up analysis resolver
@@ -902,7 +908,7 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
 
   unsigned PDepth = this->getDepth();
 
-  collectRequiredAnalysis(RequiredPasses, 
+  collectRequiredAnalysis(RequiredPasses,
                           ReqAnalysisNotAvailable, P);
   for (SmallVector<Pass *, 8>::iterator I = RequiredPasses.begin(),
          E = RequiredPasses.end(); I != E; ++I) {
@@ -920,7 +926,7 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
         TransferLastUses.push_back(PRequired);
         // Keep track of higher level analysis used by this manager.
         HigherLevelAnalysis.push_back(PRequired);
-      } else 
+      } else
         llvm_unreachable("Unable to accommodate Required Pass");
     }
 
@@ -937,11 +943,12 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
     TransferLastUses.clear();
   }
 
-  // Now, take care of required analysises that are not available.
-  for (SmallVector<AnalysisID, 8>::iterator 
-         I = ReqAnalysisNotAvailable.begin(),
+  // Now, take care of required analyses that are not available.
+  for (SmallVector<AnalysisID, 8>::iterator
+         I = ReqAnalysisNotAvailable.begin(),
          E = ReqAnalysisNotAvailable.end() ;I != E; ++I) {
-    Pass *AnalysisPass = (*I)->createPass();
+    const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+    Pass *AnalysisPass = PI->createPass();
     this->addLowerLevelRequiredPass(P, AnalysisPass);
   }
 
@@ -963,10 +970,10 @@ void PMDataManager::collectRequiredAnalysis(SmallVector<Pass *, 8>&RP,
                                             Pass *P) {
   AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
   const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
-  for (AnalysisUsage::VectorType::const_iterator 
+  for (AnalysisUsage::VectorType::const_iterator
          I = RequiredSet.begin(), E = RequiredSet.end(); I != E; ++I) {
     if (Pass *AnalysisPass = findAnalysisPass(*I, true))
-      RP.push_back(AnalysisPass); 
+      RP.push_back(AnalysisPass);
     else
       RP_NotAvail.push_back(*I);
   }
@@ -975,7 +982,7 @@ void PMDataManager::collectRequiredAnalysis(SmallVector<Pass *, 8>&RP,
   for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
          E = IDs.end(); I != E; ++I) {
     if (Pass *AnalysisPass = findAnalysisPass(*I, true))
-      RP.push_back(AnalysisPass); 
+      RP.push_back(AnalysisPass);
     else
       RP_NotAvail.push_back(*I);
   }
@@ -1016,7 +1023,7 @@ Pass *PMDataManager::findAnalysisPass(AnalysisID AID, bool SearchParent) {
   // Search Parents through TopLevelManager
   if (SearchParent)
     return TPM->findAnalysisPass(AID);
-  
+
   return NULL;
 }
 
@@ -1030,7 +1037,7 @@ void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const{
     return;
 
   TPM->collectLastUses(LUses, P);
-  
+
   for (SmallVector<Pass *, 12>::iterator I = LUses.begin(),
          E = LUses.end(); I != E; ++I) {
     llvm::dbgs() << "--" << std::string(Offset*2, ' ');
@@ -1044,7 +1051,8 @@ void PMDataManager::dumpPassArguments() const {
     if (PMDataManager *PMD = (*I)->getAsPMDataManager())
       PMD->dumpPassArguments();
    else
-      if (const PassInfo *PI = (*I)->getPassInfo())
+      if (const PassInfo *PI =
+            PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID()))
        if (!PI->isAnalysisGroup())
          dbgs() << " -" << PI->getPassArgument();
  }
@@ -1093,7 +1101,7 @@ void PMDataManager::dumpPassInfo(Pass *P, enum PassDebuggingString S1,
 
 void PMDataManager::dumpRequiredSet(const Pass *P) const {
   if (PassDebugging < Details)
     return;
-    
+
   AnalysisUsage analysisUsage;
   P->getAnalysisUsage(analysisUsage);
   dumpAnalysisUsage("Required", P, analysisUsage.getRequiredSet());
@@ -1102,7 +1110,7 @@ void PMDataManager::dumpPreservedSet(const Pass *P) const {
   if (PassDebugging < Details)
     return;
-    
+
  AnalysisUsage analysisUsage;
   P->getAnalysisUsage(analysisUsage);
   dumpAnalysisUsage("Preserved", P, analysisUsage.getPreservedSet());
@@ -1116,7 +1124,8 @@ void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
   dbgs() << (void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
   for (unsigned i = 0; i != Set.size(); ++i) {
     if (i) dbgs() << ',';
-    dbgs() << ' ' << Set[i]->getPassName();
+    const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
+    dbgs() << ' ' << PInf->getPassName();
   }
   dbgs() << '\n';
 }
@@ -1131,14 +1140,14 @@ void PMDataManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
     TPM->dumpPasses();
   }
 
-  // Module Level pass may required Function Level analysis info 
-  // (e.g. dominator info). Pass manager uses on the fly function pass manager 
-  // to provide this on demand. In that case, in Pass manager terminology, 
+  // Module Level pass may require Function Level analysis info
+  // (e.g. dominator info). Pass manager uses on the fly function pass manager
+  // to provide this on demand. In that case, in Pass manager terminology,
   // module level pass is requiring lower level analysis info managed by
   // lower level pass manager.
 
   // When Pass manager is not able to order required analysis info, Pass manager
-  // checks whether any lower level manager will be able to provide this 
+  // checks whether any lower level manager will be able to provide this
   // analysis info on demand or not.
 #ifndef NDEBUG
   dbgs() << "Unable to schedule '" << RequiredPass->getPassName();
@@ -1147,7 +1156,7 @@ void PMDataManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
   llvm_unreachable("Unable to schedule pass");
 }
 
-Pass *PMDataManager::getOnTheFlyPass(Pass *P, const PassInfo *PI, Function &F) {
+Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) {
   assert(0 && "Unable to find on the fly pass");
   return NULL;
 }
 
@@ -1166,7 +1175,7 @@ Pass *AnalysisResolver::getAnalysisIfAvailable(AnalysisID ID, bool dir) const {
   return PM.findAnalysisPass(ID, dir);
 }
 
-Pass *AnalysisResolver::findImplPass(Pass *P, const PassInfo *AnalysisPI,
+Pass *AnalysisResolver::findImplPass(Pass *P, AnalysisID AnalysisPI,
                                      Function &F) {
   return PM.getOnTheFlyPass(P, AnalysisPI, F);
 }
 
@@ -1174,8 +1183,8 @@ Pass *AnalysisResolver::findImplPass(Pass *P, const PassInfo *AnalysisPI,
 //===----------------------------------------------------------------------===//
 // BBPassManager implementation
 
-/// Execute all of the passes scheduled for execution by invoking 
-/// runOnBasicBlock method. Keep track of whether any of the passes modifies 
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnBasicBlock method. Keep track of whether any of the passes modifies
 /// the function, and if so, return true.
 bool BBPassManager::runOnFunction(Function &F) {
   if (F.isDeclaration())
@@ -1202,7 +1211,7 @@ bool BBPassManager::runOnFunction(Function &F) {
       }
 
       Changed |= LocalChanged;
-      if (LocalChanged) 
+      if (LocalChanged)
        dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG,
                     I->getName());
      dumpPreservedSet(BP);
@@ -1286,17 +1295,18 @@ void FunctionPassManager::addImpl(Pass *P) {
 /// PassManager_X is destroyed, the pass will be destroyed as well, so
 /// there is no need to delete the pass. (TODO delete passes.)
 /// This implies that all passes MUST be allocated with 'new'.
-void FunctionPassManager::add(Pass *P) { 
+void FunctionPassManager::add(Pass *P) {
   // If this is not a function pass, don't add a printer for it.
+ const void *PassID = P->getPassID(); if (P->getPassKind() == PT_Function) - if (ShouldPrintBeforePass(P)) + if (ShouldPrintBeforePass(PassID)) addImpl(P->createPrinterPass(dbgs(), std::string("*** IR Dump Before ") + P->getPassName() + " ***")); addImpl(P); if (P->getPassKind() == PT_Function) - if (ShouldPrintAfterPass(P)) + if (ShouldPrintAfterPass(PassID)) addImpl(P->createPrinterPass(dbgs(), std::string("*** IR Dump After ") + P->getPassName() + " ***")); } @@ -1405,8 +1415,8 @@ void FPPassManager::dumpPassStructure(unsigned Offset) { } -/// Execute all of the passes scheduled for execution by invoking -/// runOnFunction method. Keep track of whether any of the passes modifies +/// Execute all of the passes scheduled for execution by invoking +/// runOnFunction method. Keep track of whether any of the passes modifies /// the function, and if so, return true. bool FPPassManager::runOnFunction(Function &F) { if (F.isDeclaration()) @@ -1476,8 +1486,8 @@ bool FPPassManager::doFinalization(Module &M) { //===----------------------------------------------------------------------===// // MPPassManager implementation -/// Execute all of the passes scheduled for execution by invoking -/// runOnModule method. Keep track of whether any of the passes modifies +/// Execute all of the passes scheduled for execution by invoking +/// runOnModule method. Keep track of whether any of the passes modifies /// the module, and if so, return true. bool MPPassManager::runOnModule(Module &M) { @@ -1512,7 +1522,7 @@ MPPassManager::runOnModule(Module &M) { dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG, M.getModuleIdentifier()); dumpPreservedSet(MP); - + verifyPreservedAnalysis(MP); removeNotPreservedAnalysis(MP); recordAvailableAnalysis(MP); @@ -1538,7 +1548,7 @@ MPPassManager::runOnModule(Module &M) { void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) { assert(P->getPotentialPassManagerType() == PMT_ModulePassManager && "Unable to handle Pass that requires lower level Analysis pass"); - assert((P->getPotentialPassManagerType() < + assert((P->getPotentialPassManagerType() < RequiredPass->getPotentialPassManagerType()) && "Unable to handle Pass that requires lower level Analysis pass"); @@ -1558,13 +1568,13 @@ void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) { FPP->setLastUser(LU, P); } -/// Return function pass corresponding to PassInfo PI, that is +/// Return function pass corresponding to PassInfo PI, that is /// required by module pass MP. Instantiate analysis pass, by using /// its runOnFunction() for function F. -Pass* MPPassManager::getOnTheFlyPass(Pass *MP, const PassInfo *PI, Function &F){ +Pass* MPPassManager::getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F){ FunctionPassManagerImpl *FPP = OnTheFlyManagers[MP]; assert(FPP && "Unable to find on the fly pass"); - + FPP->releaseMemoryOnTheFly(); FPP->run(F); return ((PMTopLevelManager*)FPP)->findAnalysisPass(PI); @@ -1614,13 +1624,14 @@ void PassManager::addImpl(Pass *P) { /// will be destroyed as well, so there is no need to delete the pass. This /// implies that all passes MUST be allocated with 'new'. 
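+/// A minimal client sketch (the verifier is just an illustrative pass):
+///   PassManager PM;
+///   PM.add(createVerifierPass());
+///   PM.run(M);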
void PassManager::add(Pass *P) { - if (ShouldPrintBeforePass(P)) + const void* PassID = P->getPassID(); + if (ShouldPrintBeforePass(PassID)) addImpl(P->createPrinterPass(dbgs(), std::string("*** IR Dump Before ") + P->getPassName() + " ***")); addImpl(P); - if (ShouldPrintAfterPass(P)) + if (ShouldPrintAfterPass(PassID)) addImpl(P->createPrinterPass(dbgs(), std::string("*** IR Dump After ") + P->getPassName() + " ***")); } @@ -1656,7 +1667,7 @@ void TimingInfo::createTheTimeInfo() { /// If TimingInfo is enabled then start pass timer. Timer *llvm::getPassTimer(Pass *P) { - if (TheTimeInfo) + if (TheTimeInfo) return TheTimeInfo->getPassTimer(P); return 0; } @@ -1690,8 +1701,8 @@ void PMStack::push(PMDataManager *PM) { } // Dump content of the pass manager stack. -void PMStack::dump() { - for (std::deque<PMDataManager *>::iterator I = S.begin(), +void PMStack::dump() const { + for (std::vector<PMDataManager *>::const_iterator I = S.begin(), E = S.end(); I != E; ++I) printf("%s ", (*I)->getAsPass()->getPassName()); @@ -1700,11 +1711,11 @@ void PMStack::dump() { } /// Find appropriate Module Pass Manager in the PM Stack and -/// add self into that manager. -void ModulePass::assignPassManager(PMStack &PMS, +/// add self into that manager. +void ModulePass::assignPassManager(PMStack &PMS, PassManagerType PreferredType) { // Find Module Pass Manager - while(!PMS.empty()) { + while (!PMS.empty()) { PassManagerType TopPMType = PMS.top()->getPassManagerType(); if (TopPMType == PreferredType) break; // We found desired pass manager @@ -1718,7 +1729,7 @@ void ModulePass::assignPassManager(PMStack &PMS, } /// Find appropriate Function Pass Manager or Call Graph Pass Manager -/// in the PM Stack and add self into that manager. +/// in the PM Stack and add self into that manager. void FunctionPass::assignPassManager(PMStack &PMS, PassManagerType PreferredType) { @@ -1727,7 +1738,7 @@ void FunctionPass::assignPassManager(PMStack &PMS, if (PMS.top()->getPassManagerType() > PMT_FunctionPassManager) PMS.pop(); else - break; + break; } // Create new Function Pass Manager if needed. @@ -1759,14 +1770,14 @@ void FunctionPass::assignPassManager(PMStack &PMS, } /// Find appropriate Basic Pass Manager or Call Graph Pass Manager -/// in the PM Stack and add self into that manager. +/// in the PM Stack and add self into that manager. void BasicBlockPass::assignPassManager(PMStack &PMS, PassManagerType PreferredType) { BBPassManager *BBP; // Basic Pass Manager is a leaf pass manager. It does not handle // any other pass manager. 
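+  // Consequently, unlike FunctionPass::assignPassManager above, nothing is
+  // ever popped here: we either reuse a BBPassManager already on top of the
+  // stack or push a fresh one.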
-  if (!PMS.empty() && 
+  if (!PMS.empty() &&
       PMS.top()->getPassManagerType() == PMT_BasicBlockPassManager) {
     BBP = (BBPassManager *)PMS.top();
   } else {
@@ -1796,38 +1807,3 @@ void BasicBlockPass::assignPassManager(PMStack &PMS,
 }
 
 PassManagerBase::~PassManagerBase() {}
-
-/*===-- C Bindings --------------------------------------------------------===*/
-
-LLVMPassManagerRef LLVMCreatePassManager() {
-  return wrap(new PassManager());
-}
-
-LLVMPassManagerRef LLVMCreateFunctionPassManagerForModule(LLVMModuleRef M) {
-  return wrap(new FunctionPassManager(unwrap(M)));
-}
-
-LLVMPassManagerRef LLVMCreateFunctionPassManager(LLVMModuleProviderRef P) {
-  return LLVMCreateFunctionPassManagerForModule(
-                                            reinterpret_cast<LLVMModuleRef>(P));
-}
-
-LLVMBool LLVMRunPassManager(LLVMPassManagerRef PM, LLVMModuleRef M) {
-  return unwrap<PassManager>(PM)->run(*unwrap(M));
-}
-
-LLVMBool LLVMInitializeFunctionPassManager(LLVMPassManagerRef FPM) {
-  return unwrap<FunctionPassManager>(FPM)->doInitialization();
-}
-
-LLVMBool LLVMRunFunctionPassManager(LLVMPassManagerRef FPM, LLVMValueRef F) {
-  return unwrap<FunctionPassManager>(FPM)->run(*unwrap<Function>(F));
-}
-
-LLVMBool LLVMFinalizeFunctionPassManager(LLVMPassManagerRef FPM) {
-  return unwrap<FunctionPassManager>(FPM)->doFinalization();
-}
-
-void LLVMDisposePassManager(LLVMPassManagerRef PM) {
-  delete unwrap(PM);
-}
diff --git a/lib/VMCore/PassRegistry.cpp b/lib/VMCore/PassRegistry.cpp
new file mode 100644
index 0000000000000..21dba56aad728
--- /dev/null
+++ b/lib/VMCore/PassRegistry.cpp
@@ -0,0 +1,159 @@
+//===- PassRegistry.cpp - Pass Registration Implementation ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PassRegistry, with which passes are registered on
+// initialization, and supports the PassManager in dependency resolution.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ManagedStatic.h"
+
+using namespace llvm;
+
+static PassRegistry *PassRegistryObj = 0;
+PassRegistry *PassRegistry::getPassRegistry() {
+  // Use double-checked locking to safely initialize the registrar when
+  // we're running in multithreaded mode.
+  PassRegistry* tmp = PassRegistryObj;
+  if (llvm_is_multithreaded()) {
+    sys::MemoryFence();
+    if (!tmp) {
+      llvm_acquire_global_lock();
+      tmp = PassRegistryObj;
+      if (!tmp) {
+        tmp = new PassRegistry();
+        sys::MemoryFence();
+        PassRegistryObj = tmp;
+      }
+      llvm_release_global_lock();
+    }
+  } else if (!tmp) {
+    PassRegistryObj = new PassRegistry();
+  }
+
+  return PassRegistryObj;
+}
+
+namespace {
+
+// FIXME: We use ManagedCleanup to erase the pass registrar on shutdown.
+// Unfortunately, passes are registered with static ctors, and having
+// llvm_shutdown clear this map prevents successful resurrection after
+// llvm_shutdown is run.  Ideally we should find a solution so that we don't
+// leak the map, AND can still resurrect after shutdown.
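+//
+// The resurrection scenario in question is roughly (sketch only):
+//   llvm_shutdown();                 // tears down ManagedStatic objects
+//   PassRegistry::getPassRegistry(); // must still return a usable registry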
+void cleanupPassRegistry(void*) {
+  if (PassRegistryObj) {
+    delete PassRegistryObj;
+    PassRegistryObj = 0;
+  }
+}
+ManagedCleanup<&cleanupPassRegistry> registryCleanup ATTRIBUTE_USED;
+
+}
+
+const PassInfo *PassRegistry::getPassInfo(const void *TI) const {
+  sys::SmartScopedLock<true> Guard(Lock);
+  MapType::const_iterator I = PassInfoMap.find(TI);
+  return I != PassInfoMap.end() ? I->second : 0;
+}
+
+const PassInfo *PassRegistry::getPassInfo(StringRef Arg) const {
+  sys::SmartScopedLock<true> Guard(Lock);
+  StringMapType::const_iterator I = PassInfoStringMap.find(Arg);
+  return I != PassInfoStringMap.end() ? I->second : 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Registration mechanism
+//
+
+void PassRegistry::registerPass(const PassInfo &PI) {
+  sys::SmartScopedLock<true> Guard(Lock);
+  bool Inserted =
+    PassInfoMap.insert(std::make_pair(PI.getTypeInfo(),&PI)).second;
+  assert(Inserted && "Pass registered multiple times!");
+  (void)Inserted; // Silence the unused-variable warning in NDEBUG builds.
+  PassInfoStringMap[PI.getPassArgument()] = &PI;
+
+  // Notify any listeners.
+  for (std::vector<PassRegistrationListener*>::iterator
+       I = Listeners.begin(), E = Listeners.end(); I != E; ++I)
+    (*I)->passRegistered(&PI);
+}
+
+void PassRegistry::unregisterPass(const PassInfo &PI) {
+  sys::SmartScopedLock<true> Guard(Lock);
+  MapType::iterator I = PassInfoMap.find(PI.getTypeInfo());
+  assert(I != PassInfoMap.end() && "Pass registered but not in map!");
+
+  // Remove pass from the map.
+  PassInfoMap.erase(I);
+  PassInfoStringMap.erase(PI.getPassArgument());
+}
+
+void PassRegistry::enumerateWith(PassRegistrationListener *L) {
+  sys::SmartScopedLock<true> Guard(Lock);
+  for (MapType::const_iterator I = PassInfoMap.begin(),
+       E = PassInfoMap.end(); I != E; ++I)
+    L->passEnumerate(I->second);
+}
+
+
+/// Analysis Group Mechanisms.
+void PassRegistry::registerAnalysisGroup(const void *InterfaceID,
+                                         const void *PassID,
+                                         PassInfo& Registeree,
+                                         bool isDefault) {
+  PassInfo *InterfaceInfo = const_cast<PassInfo*>(getPassInfo(InterfaceID));
+  if (InterfaceInfo == 0) {
+    // First reference to Interface, register it now.
+    registerPass(Registeree);
+    InterfaceInfo = &Registeree;
+  }
+  assert(Registeree.isAnalysisGroup() &&
+         "Trying to join an analysis group that is a normal pass!");
+
+  if (PassID) {
+    PassInfo *ImplementationInfo = const_cast<PassInfo*>(getPassInfo(PassID));
+    assert(ImplementationInfo &&
+           "Must register pass before adding to AnalysisGroup!");
+
+    // Make sure we keep track of the fact that the implementation implements
+    // the interface.
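+    // E.g. registering basicaa against the AliasAnalysis group records that
+    // basicaa can stand in wherever the group's interface is requested (the
+    // pass and group names are illustrative).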
+ ImplementationInfo->addInterfaceImplemented(InterfaceInfo); + + sys::SmartScopedLock<true> Guard(Lock); + AnalysisGroupInfo &AGI = AnalysisGroupInfoMap[InterfaceInfo]; + assert(AGI.Implementations.count(ImplementationInfo) == 0 && + "Cannot add a pass to the same analysis group more than once!"); + AGI.Implementations.insert(ImplementationInfo); + if (isDefault) { + assert(InterfaceInfo->getNormalCtor() == 0 && + "Default implementation for analysis group already specified!"); + assert(ImplementationInfo->getNormalCtor() && + "Cannot specify pass as default if it does not have a default ctor"); + InterfaceInfo->setNormalCtor(ImplementationInfo->getNormalCtor()); + } + } +} + +void PassRegistry::addRegistrationListener(PassRegistrationListener *L) { + sys::SmartScopedLock<true> Guard(Lock); + Listeners.push_back(L); +} + +void PassRegistry::removeRegistrationListener(PassRegistrationListener *L) { + sys::SmartScopedLock<true> Guard(Lock); + std::vector<PassRegistrationListener*>::iterator I = + std::find(Listeners.begin(), Listeners.end(), L); + assert(I != Listeners.end() && "PassRegistrationListener not registered!"); + Listeners.erase(I); +} diff --git a/lib/VMCore/PrintModulePass.cpp b/lib/VMCore/PrintModulePass.cpp index 2d69dce07f3f7..2ee49d235963c 100644 --- a/lib/VMCore/PrintModulePass.cpp +++ b/lib/VMCore/PrintModulePass.cpp @@ -28,10 +28,10 @@ namespace { bool DeleteStream; // Delete the ostream in our dtor? public: static char ID; - PrintModulePass() : ModulePass(&ID), Out(&dbgs()), + PrintModulePass() : ModulePass(ID), Out(&dbgs()), DeleteStream(false) {} PrintModulePass(const std::string &B, raw_ostream *o, bool DS) - : ModulePass(&ID), Banner(B), Out(o), DeleteStream(DS) {} + : ModulePass(ID), Banner(B), Out(o), DeleteStream(DS) {} ~PrintModulePass() { if (DeleteStream) delete Out; @@ -53,12 +53,12 @@ namespace { bool DeleteStream; // Delete the ostream in our dtor? public: static char ID; - PrintFunctionPass() : FunctionPass(&ID), Banner(""), Out(&dbgs()), + PrintFunctionPass() : FunctionPass(ID), Banner(""), Out(&dbgs()), DeleteStream(false) {} PrintFunctionPass(const std::string &B, raw_ostream *o, bool DS) - : FunctionPass(&ID), Banner(B), Out(o), DeleteStream(DS) {} + : FunctionPass(ID), Banner(B), Out(o), DeleteStream(DS) {} - inline ~PrintFunctionPass() { + ~PrintFunctionPass() { if (DeleteStream) delete Out; } @@ -77,11 +77,11 @@ namespace { } char PrintModulePass::ID = 0; -static RegisterPass<PrintModulePass> -X("print-module", "Print module to stderr"); +INITIALIZE_PASS(PrintModulePass, "print-module", + "Print module to stderr", false, false); char PrintFunctionPass::ID = 0; -static RegisterPass<PrintFunctionPass> -Y("print-function","Print function to stderr"); +INITIALIZE_PASS(PrintFunctionPass, "print-function", + "Print function to stderr", false, false); /// createPrintModulePass - Create and return a pass that writes the /// module to the specified raw_ostream. diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp index 845b523c24216..c55e6267836ac 100644 --- a/lib/VMCore/Type.cpp +++ b/lib/VMCore/Type.cpp @@ -50,7 +50,7 @@ void AbstractTypeUser::setType(Value *V, const Type *NewTy) { /// Because of the way Type subclasses are allocated, this function is necessary /// to use the correct kind of "delete" operator to deallocate the Type object. 
-/// Some type objects (FunctionTy, StructTy, UnionTy) allocate additional space +/// Some type objects (FunctionTy, StructTy) allocate additional space /// after the space for their derived type to hold the contained types array of /// PATypeHandles. Using this allocation scheme means all the PATypeHandles are /// allocated with the type object, decreasing allocations and eliminating the @@ -66,8 +66,7 @@ void Type::destroy() const { // Structures and Functions allocate their contained types past the end of // the type object itself. These need to be destroyed differently than the // other types. - if (this->isFunctionTy() || this->isStructTy() || - this->isUnionTy()) { + if (this->isFunctionTy() || this->isStructTy()) { // First, make sure we destruct any PATypeHandles allocated by these // subclasses. They must be manually destructed. for (unsigned i = 0; i < NumContainedTys; ++i) @@ -77,10 +76,10 @@ void Type::destroy() const { // to delete this as an array of char. if (this->isFunctionTy()) static_cast<const FunctionType*>(this)->FunctionType::~FunctionType(); - else if (this->isStructTy()) + else { + assert(isStructTy()); static_cast<const StructType*>(this)->StructType::~StructType(); - else - static_cast<const UnionType*>(this)->UnionType::~UnionType(); + } // Finally, remove the memory as an array deallocation of the chars it was // constructed from. @@ -234,7 +233,7 @@ bool Type::isSizedDerivedType() const { if (const VectorType *PTy = dyn_cast<VectorType>(this)) return PTy->getElementType()->isSized(); - if (!this->isStructTy() && !this->isUnionTy()) + if (!this->isStructTy()) return false; // Okay, our struct is sized if all of the elements are... @@ -319,31 +318,6 @@ const Type *StructType::getTypeAtIndex(unsigned Idx) const { } -bool UnionType::indexValid(const Value *V) const { - // Union indexes require 32-bit integer constants. - if (V->getType()->isIntegerTy(32)) - if (const ConstantInt *CU = dyn_cast<ConstantInt>(V)) - return indexValid(CU->getZExtValue()); - return false; -} - -bool UnionType::indexValid(unsigned V) const { - return V < NumContainedTys; -} - -// getTypeAtIndex - Given an index value into the type, return the type of the -// element. For a structure type, this must be a constant value... -// -const Type *UnionType::getTypeAtIndex(const Value *V) const { - unsigned Idx = (unsigned)cast<ConstantInt>(V)->getZExtValue(); - return getTypeAtIndex(Idx); -} - -const Type *UnionType::getTypeAtIndex(unsigned Idx) const { - assert(indexValid(Idx) && "Invalid structure index!"); - return ContainedTys[Idx]; -} - //===----------------------------------------------------------------------===// // Primitive 'Type' data //===----------------------------------------------------------------------===// @@ -455,8 +429,8 @@ const PointerType *Type::getInt64PtrTy(LLVMContext &C, unsigned AS) { /// isValidReturnType - Return true if the specified type is valid as a return /// type. 
bool FunctionType::isValidReturnType(const Type *RetTy) { - return RetTy->getTypeID() != LabelTyID && - RetTy->getTypeID() != MetadataTyID; + return !RetTy->isFunctionTy() && !RetTy->isLabelTy() && + !RetTy->isMetadataTy(); } /// isValidArgumentType - Return true if the specified type is valid as an @@ -507,23 +481,6 @@ StructType::StructType(LLVMContext &C, setAbstract(isAbstract); } -UnionType::UnionType(LLVMContext &C,const Type* const* Types, unsigned NumTypes) - : CompositeType(C, UnionTyID) { - ContainedTys = reinterpret_cast<PATypeHandle*>(this + 1); - NumContainedTys = NumTypes; - bool isAbstract = false; - for (unsigned i = 0; i < NumTypes; ++i) { - assert(Types[i] && "<null> type for union field!"); - assert(isValidElementType(Types[i]) && - "Invalid type for union element!"); - new (&ContainedTys[i]) PATypeHandle(Types[i], this); - isAbstract |= Types[i]->isAbstract(); - } - - // Calculate whether or not this type is abstract - setAbstract(isAbstract); -} - ArrayType::ArrayType(const Type *ElType, uint64_t NumEl) : SequentialType(ArrayTyID, ElType) { NumElements = NumEl; @@ -603,8 +560,8 @@ namespace llvm { static inline ChildIteratorType child_begin(NodeType *N) { if (N->isAbstract()) return N->subtype_begin(); - else // No need to process children of concrete types. - return N->subtype_end(); + // No need to process children of concrete types. + return N->subtype_end(); } static inline ChildIteratorType child_end(NodeType *N) { return N->subtype_end(); @@ -627,35 +584,35 @@ void Type::PromoteAbstractToConcrete() { // Concrete types are leaves in the tree. Since an SCC will either be all // abstract or all concrete, we only need to check one type. - if (SCC[0]->isAbstract()) { - if (SCC[0]->isOpaqueTy()) - return; // Not going to be concrete, sorry. - - // If all of the children of all of the types in this SCC are concrete, - // then this SCC is now concrete as well. If not, neither this SCC, nor - // any parent SCCs will be concrete, so we might as well just exit. - for (unsigned i = 0, e = SCC.size(); i != e; ++i) - for (Type::subtype_iterator CI = SCC[i]->subtype_begin(), - E = SCC[i]->subtype_end(); CI != E; ++CI) - if ((*CI)->isAbstract()) - // If the child type is in our SCC, it doesn't make the entire SCC - // abstract unless there is a non-SCC abstract type. - if (std::find(SCC.begin(), SCC.end(), *CI) == SCC.end()) - return; // Not going to be concrete, sorry. - - // Okay, we just discovered this whole SCC is now concrete, mark it as - // such! - for (unsigned i = 0, e = SCC.size(); i != e; ++i) { - assert(SCC[i]->isAbstract() && "Why are we processing concrete types?"); - - SCC[i]->setAbstract(false); - } - - for (unsigned i = 0, e = SCC.size(); i != e; ++i) { - assert(!SCC[i]->isAbstract() && "Concrete type became abstract?"); - // The type just became concrete, notify all users! - cast<DerivedType>(SCC[i])->notifyUsesThatTypeBecameConcrete(); - } + if (!SCC[0]->isAbstract()) continue; + + if (SCC[0]->isOpaqueTy()) + return; // Not going to be concrete, sorry. + + // If all of the children of all of the types in this SCC are concrete, + // then this SCC is now concrete as well. If not, neither this SCC, nor + // any parent SCCs will be concrete, so we might as well just exit. 
+ for (unsigned i = 0, e = SCC.size(); i != e; ++i) + for (Type::subtype_iterator CI = SCC[i]->subtype_begin(), + E = SCC[i]->subtype_end(); CI != E; ++CI) + if ((*CI)->isAbstract()) + // If the child type is in our SCC, it doesn't make the entire SCC + // abstract unless there is a non-SCC abstract type. + if (std::find(SCC.begin(), SCC.end(), *CI) == SCC.end()) + return; // Not going to be concrete, sorry. + + // Okay, we just discovered this whole SCC is now concrete, mark it as + // such! + for (unsigned i = 0, e = SCC.size(); i != e; ++i) { + assert(SCC[i]->isAbstract() && "Why are we processing concrete types?"); + + SCC[i]->setAbstract(false); + } + + for (unsigned i = 0, e = SCC.size(); i != e; ++i) { + assert(!SCC[i]->isAbstract() && "Concrete type became abstract?"); + // The type just became concrete, notify all users! + cast<DerivedType>(SCC[i])->notifyUsesThatTypeBecameConcrete(); } } } @@ -693,11 +650,15 @@ static bool TypesEqual(const Type *Ty, const Type *Ty2, if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) { const IntegerType *ITy2 = cast<IntegerType>(Ty2); return ITy->getBitWidth() == ITy2->getBitWidth(); - } else if (const PointerType *PTy = dyn_cast<PointerType>(Ty)) { + } + + if (const PointerType *PTy = dyn_cast<PointerType>(Ty)) { const PointerType *PTy2 = cast<PointerType>(Ty2); return PTy->getAddressSpace() == PTy2->getAddressSpace() && TypesEqual(PTy->getElementType(), PTy2->getElementType(), EqTypes); - } else if (const StructType *STy = dyn_cast<StructType>(Ty)) { + } + + if (const StructType *STy = dyn_cast<StructType>(Ty)) { const StructType *STy2 = cast<StructType>(Ty2); if (STy->getNumElements() != STy2->getNumElements()) return false; if (STy->isPacked() != STy2->isPacked()) return false; @@ -705,22 +666,21 @@ static bool TypesEqual(const Type *Ty, const Type *Ty2, if (!TypesEqual(STy->getElementType(i), STy2->getElementType(i), EqTypes)) return false; return true; - } else if (const UnionType *UTy = dyn_cast<UnionType>(Ty)) { - const UnionType *UTy2 = cast<UnionType>(Ty2); - if (UTy->getNumElements() != UTy2->getNumElements()) return false; - for (unsigned i = 0, e = UTy2->getNumElements(); i != e; ++i) - if (!TypesEqual(UTy->getElementType(i), UTy2->getElementType(i), EqTypes)) - return false; - return true; - } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { + } + + if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { const ArrayType *ATy2 = cast<ArrayType>(Ty2); return ATy->getNumElements() == ATy2->getNumElements() && TypesEqual(ATy->getElementType(), ATy2->getElementType(), EqTypes); - } else if (const VectorType *PTy = dyn_cast<VectorType>(Ty)) { + } + + if (const VectorType *PTy = dyn_cast<VectorType>(Ty)) { const VectorType *PTy2 = cast<VectorType>(Ty2); return PTy->getNumElements() == PTy2->getNumElements() && TypesEqual(PTy->getElementType(), PTy2->getElementType(), EqTypes); - } else if (const FunctionType *FTy = dyn_cast<FunctionType>(Ty)) { + } + + if (const FunctionType *FTy = dyn_cast<FunctionType>(Ty)) { const FunctionType *FTy2 = cast<FunctionType>(Ty2); if (FTy->isVarArg() != FTy2->isVarArg() || FTy->getNumParams() != FTy2->getNumParams() || @@ -731,10 +691,10 @@ static bool TypesEqual(const Type *Ty, const Type *Ty2, return false; } return true; - } else { - llvm_unreachable("Unknown derived type!"); - return false; } + + llvm_unreachable("Unknown derived type!"); + return false; } namespace llvm { // in namespace llvm so findable by ADL @@ -808,13 +768,13 @@ const IntegerType *IntegerType::get(LLVMContext &C, 
unsigned NumBits) { // Check for the built-in integer types switch (NumBits) { - case 1: return cast<IntegerType>(Type::getInt1Ty(C)); - case 8: return cast<IntegerType>(Type::getInt8Ty(C)); - case 16: return cast<IntegerType>(Type::getInt16Ty(C)); - case 32: return cast<IntegerType>(Type::getInt32Ty(C)); - case 64: return cast<IntegerType>(Type::getInt64Ty(C)); - default: - break; + case 1: return cast<IntegerType>(Type::getInt1Ty(C)); + case 8: return cast<IntegerType>(Type::getInt8Ty(C)); + case 16: return cast<IntegerType>(Type::getInt16Ty(C)); + case 32: return cast<IntegerType>(Type::getInt32Ty(C)); + case 64: return cast<IntegerType>(Type::getInt64Ty(C)); + default: + break; } LLVMContextImpl *pImpl = C.pImpl; @@ -902,8 +862,8 @@ ArrayType *ArrayType::get(const Type *ElementType, uint64_t NumElements) { } bool ArrayType::isValidElementType(const Type *ElemTy) { - return ElemTy->getTypeID() != VoidTyID && ElemTy->getTypeID() != LabelTyID && - ElemTy->getTypeID() != MetadataTyID && !ElemTy->isFunctionTy(); + return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && + !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy(); } VectorType *VectorType::get(const Type *ElementType, unsigned NumElements) { @@ -975,60 +935,6 @@ bool StructType::isValidElementType(const Type *ElemTy) { //===----------------------------------------------------------------------===// -// Union Type Factory... -// - -UnionType *UnionType::get(const Type* const* Types, unsigned NumTypes) { - assert(NumTypes > 0 && "union must have at least one member type!"); - UnionValType UTV(Types, NumTypes); - UnionType *UT = 0; - - LLVMContextImpl *pImpl = Types[0]->getContext().pImpl; - - UT = pImpl->UnionTypes.get(UTV); - - if (!UT) { - // Value not found. Derive a new type! - UT = (UnionType*) operator new(sizeof(UnionType) + - sizeof(PATypeHandle) * NumTypes); - new (UT) UnionType(Types[0]->getContext(), Types, NumTypes); - pImpl->UnionTypes.add(UTV, UT); - } -#ifdef DEBUG_MERGE_TYPES - DEBUG(dbgs() << "Derived new type: " << *UT << "\n"); -#endif - return UT; -} - -UnionType *UnionType::get(const Type *type, ...) { - va_list ap; - SmallVector<const llvm::Type*, 8> UnionFields; - va_start(ap, type); - while (type) { - UnionFields.push_back(type); - type = va_arg(ap, llvm::Type*); - } - unsigned NumTypes = UnionFields.size(); - assert(NumTypes > 0 && "union must have at least one member type!"); - return llvm::UnionType::get(&UnionFields[0], NumTypes); -} - -bool UnionType::isValidElementType(const Type *ElemTy) { - return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && - !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy(); -} - -int UnionType::getElementTypeIndex(const Type *ElemTy) const { - int index = 0; - for (UnionType::element_iterator I = element_begin(), E = element_end(); - I != E; ++I, ++index) { - if (ElemTy == *I) return index; - } - - return -1; -} - -//===----------------------------------------------------------------------===// // Pointer Type Factory... 
//
@@ -1060,9 +966,8 @@ const PointerType *Type::getPointerTo(unsigned addrs) const {
 }
 
 bool PointerType::isValidElementType(const Type *ElemTy) {
-  return ElemTy->getTypeID() != VoidTyID &&
-         ElemTy->getTypeID() != LabelTyID &&
-         ElemTy->getTypeID() != MetadataTyID;
+  return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
+         !ElemTy->isMetadataTy();
 }
 
 
@@ -1071,8 +976,7 @@ bool PointerType::isValidElementType(const Type *ElemTy) {
 //
 
 OpaqueType *OpaqueType::get(LLVMContext &C) {
-  OpaqueType *OT = new OpaqueType(C);           // All opaque types are distinct
-  
+  OpaqueType *OT = new OpaqueType(C);        // All opaque types are distinct.
   LLVMContextImpl *pImpl = C.pImpl;
   pImpl->OpaqueTypes.insert(OT);
   return OT;
@@ -1123,18 +1027,17 @@ void Type::removeAbstractTypeUser(AbstractTypeUser *U) const {
           << ">[" << (void*)this << "]" << "\n");
 #endif
-  this->destroy();
+    this->destroy();
   }
-  
 }
 
-// unlockedRefineAbstractTypeTo - This function is used when it is discovered
+// refineAbstractTypeTo - This function is used when it is discovered
 // that the 'this' abstract type is actually equivalent to the NewType
 // specified. This causes all users of 'this' to switch to reference the more
-// concrete type NewType and for 'this' to be deleted. Only used for internal
-// callers.
+// concrete type NewType and for 'this' to be deleted. It is now the single
+// entry point for both internal and external callers.
 //
-void DerivedType::unlockedRefineAbstractTypeTo(const Type *NewType) {
+void DerivedType::refineAbstractTypeTo(const Type *NewType) {
   assert(isAbstract() && "refineAbstractTypeTo: Current type is not abstract!");
   assert(this != NewType && "Can't refine to myself!");
   assert(ForwardType == 0 && "This type has already been refined!");
@@ -1199,15 +1102,6 @@ void DerivedType::unlockedRefineAbstractTypeTo(const Type *NewType) {
   // destroyed.
 }
 
-// refineAbstractTypeTo - This function is used by external callers to notify
-// us that this abstract type is equivalent to another type.
-//
-void DerivedType::refineAbstractTypeTo(const Type *NewType) {
-  // All recursive calls will go through unlockedRefineAbstractTypeTo,
-  // to avoid deadlock problems.
-  unlockedRefineAbstractTypeTo(NewType);
-}
-
 // notifyUsesThatTypeBecameConcrete - Notify AbstractTypeUsers of this type that
 // the current type has transitioned from being abstract to being concrete.
 //
@@ -1291,21 +1185,6 @@ void StructType::typeBecameConcrete(const DerivedType *AbsTy) {
 // concrete - this could potentially change us from an abstract type to a
 // concrete type.
 //
-void UnionType::refineAbstractType(const DerivedType *OldType,
-                                   const Type *NewType) {
-  LLVMContextImpl *pImpl = OldType->getContext().pImpl;
-  pImpl->UnionTypes.RefineAbstractType(this, OldType, NewType);
-}
-
-void UnionType::typeBecameConcrete(const DerivedType *AbsTy) {
-  LLVMContextImpl *pImpl = AbsTy->getContext().pImpl;
-  pImpl->UnionTypes.TypeBecameConcrete(this, AbsTy);
-}
-
-// refineAbstractType - Called when a contained type is found to be more
-// concrete - this could potentially change us from an abstract type to a
-// concrete type.
-//
 void PointerType::refineAbstractType(const DerivedType *OldType,
                                      const Type *NewType) {
   LLVMContextImpl *pImpl = OldType->getContext().pImpl;
diff --git a/lib/VMCore/TypesContext.h b/lib/VMCore/TypesContext.h
index 02ab1135b32cf..5a90917977b02 100644
--- a/lib/VMCore/TypesContext.h
+++ b/lib/VMCore/TypesContext.h
@@ -180,32 +180,6 @@ public:
   }
 };
 
-// UnionValType - Define a class to hold the key that goes into the TypeMap
-//
-class UnionValType {
-  std::vector<const Type*> ElTypes;
-public:
-  UnionValType(const Type* const* Types, unsigned NumTypes)
-    : ElTypes(&Types[0], &Types[NumTypes]) {}
-
-  static UnionValType get(const UnionType *UT) {
-    std::vector<const Type *> ElTypes;
-    ElTypes.reserve(UT->getNumElements());
-    for (unsigned i = 0, e = UT->getNumElements(); i != e; ++i)
-      ElTypes.push_back(UT->getElementType(i));
-
-    return UnionValType(&ElTypes[0], ElTypes.size());
-  }
-
-  static unsigned hashTypeStructure(const UnionType *UT) {
-    return UT->getNumElements();
-  }
-
-  inline bool operator<(const UnionValType &UTV) const {
-    return (ElTypes < UTV.ElTypes);
-  }
-};
-
 // FunctionValType - Define a class to hold the key that goes into the TypeMap
 //
 class FunctionValType {
@@ -370,7 +344,7 @@ public:
       // We already have this type in the table.  Get rid of the newly refined
      // type.
      TypeClass *NewTy = cast<TypeClass>((Type*)I->second.get());
-      Ty->unlockedRefineAbstractTypeTo(NewTy);
+      Ty->refineAbstractTypeTo(NewTy);
      return;
    }
  } else {
@@ -385,31 +359,33 @@ public:
      if (I->second == Ty) {
        // Remember the position of the old type if we see it in our scan.
        Entry = I;
+        continue;
+      }
+
+      if (!TypesEqual(Ty, I->second))
+        continue;
+
+      TypeClass *NewTy = cast<TypeClass>((Type*)I->second.get());
+
+      // Remove the old entry from TypesByHash.  If the hash values differ
+      // now, remove it from the old place.  Otherwise, continue scanning
+      // within this hashcode to reduce work.
+      if (NewTypeHash != OldTypeHash) {
+        RemoveFromTypesByHash(OldTypeHash, Ty);
      } else {
-        if (TypesEqual(Ty, I->second)) {
-          TypeClass *NewTy = cast<TypeClass>((Type*)I->second.get());
-
-          // Remove the old entry form TypesByHash.  If the hash values differ
-          // now, remove it from the old place.  Otherwise, continue scanning
-          // withing this hashcode to reduce work.
-          if (NewTypeHash != OldTypeHash) {
-            RemoveFromTypesByHash(OldTypeHash, Ty);
-          } else {
-            if (Entry == E) {
-              // Find the location of Ty in the TypesByHash structure if we
-              // haven't seen it already.
-              while (I->second != Ty) {
-                ++I;
-                assert(I != E && "Structure doesn't contain type??");
-              }
-              Entry = I;
-            }
-            TypesByHash.erase(Entry);
+        if (Entry == E) {
+          // Find the location of Ty in the TypesByHash structure if we
+          // haven't seen it already.
+          while (I->second != Ty) {
+            ++I;
+            assert(I != E && "Structure doesn't contain type??");
+          }
+          Entry = I;
+        }
+        TypesByHash.erase(Entry);
+      }
+      Ty->refineAbstractTypeTo(NewTy);
+      return;
    }
 
    // If there is no existing type of the same structure, we reinsert an
diff --git a/lib/VMCore/Use.cpp b/lib/VMCore/Use.cpp
index b7fd92f9b0660..fec710b39459b 100644
--- a/lib/VMCore/Use.cpp
+++ b/lib/VMCore/Use.cpp
@@ -86,14 +86,27 @@ const Use *Use::getImpliedUser() const {
 //===----------------------------------------------------------------------===//
 
 Use *Use::initTags(Use * const Start, Use *Stop, ptrdiff_t Done) {
+  while (Done < 20) {
+    if (Start == Stop--)
+      return Start;
+    static const PrevPtrTag tags[20] = { fullStopTag, oneDigitTag, stopTag,
+                                         oneDigitTag, oneDigitTag, stopTag,
+                                         zeroDigitTag, oneDigitTag, oneDigitTag,
+                                         stopTag, zeroDigitTag, oneDigitTag,
+                                         zeroDigitTag, oneDigitTag, stopTag,
+                                         oneDigitTag, oneDigitTag, oneDigitTag,
+                                         oneDigitTag, stopTag
+                                       };
+    Stop->Prev.setFromOpaqueValue(reinterpret_cast<Use**>(tags[Done++]));
+    Stop->Val = 0;
+  }
+
   ptrdiff_t Count = Done;
   while (Start != Stop) {
     --Stop;
     Stop->Val = 0;
     if (!Count) {
-      Stop->Prev.setFromOpaqueValue(reinterpret_cast<Use**>(Done == 0
-                                                            ? fullStopTag
-                                                            : stopTag));
+      Stop->Prev.setFromOpaqueValue(reinterpret_cast<Use**>(stopTag));
       ++Done;
       Count = Done;
     } else {
diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp
index 585edf09c9e5a..b8c677565467a 100644
--- a/lib/VMCore/Value.cpp
+++ b/lib/VMCore/Value.cpp
@@ -139,10 +139,6 @@ static bool getSymTab(Value *V, ValueSymbolTable *&ST) {
   } else if (Argument *A = dyn_cast<Argument>(V)) {
     if (Function *P = A->getParent())
       ST = &P->getValueSymbolTable();
-  } else if (NamedMDNode *N = dyn_cast<NamedMDNode>(V)) {
-    if (Module *P = N->getParent()) {
-      ST = &P->getValueSymbolTable();
-    }
   } else if (isa<MDString>(V))
     return true;
   else {
@@ -492,10 +488,15 @@ void ValueHandleBase::ValueIsDeleted(Value *V) {
   ValueHandleBase *Entry = pImpl->ValueHandles[V];
   assert(Entry && "Value bit set but no entries exist");
 
-  // We use a local ValueHandleBase as an iterator so that
-  // ValueHandles can add and remove themselves from the list without
-  // breaking our iteration.  This is not really an AssertingVH; we
-  // just have to give ValueHandleBase some kind.
+  // We use a local ValueHandleBase as an iterator so that ValueHandles can add
+  // and remove themselves from the list without breaking our iteration.  This
+  // is not really an AssertingVH; we just have to give ValueHandleBase a kind.
+  // Note that we deliberately do not support the case when dropping a value
+  // handle results in a new value handle being permanently added to the list
+  // (as might occur in theory for CallbackVH's): the new value handle will not
+  // be processed and the checking code will mete out righteous punishment if
+  // the handle is still present once we have finished processing all the other
+  // value handles (it is fine to momentarily add then remove a value handle).
   for (ValueHandleBase Iterator(Assert, *Entry); Entry; Entry = Iterator.Next) {
     Iterator.RemoveFromUseList();
     Iterator.AddToExistingUseListAfter(Entry);
@@ -576,6 +577,24 @@ void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) {
       break;
     }
   }
+
+#ifndef NDEBUG
+  // If any new tracking or weak value handles were added while processing the
+  // list, then complain about it now.
diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp
index 585edf09c9e5a..b8c677565467a 100644
--- a/lib/VMCore/Value.cpp
+++ b/lib/VMCore/Value.cpp
@@ -139,10 +139,6 @@ static bool getSymTab(Value *V, ValueSymbolTable *&ST) {
   } else if (Argument *A = dyn_cast<Argument>(V)) {
     if (Function *P = A->getParent())
       ST = &P->getValueSymbolTable();
-  } else if (NamedMDNode *N = dyn_cast<NamedMDNode>(V)) {
-    if (Module *P = N->getParent()) {
-      ST = &P->getValueSymbolTable();
-    }
   } else if (isa<MDString>(V))
     return true;
   else {
@@ -492,10 +488,15 @@ void ValueHandleBase::ValueIsDeleted(Value *V) {
   ValueHandleBase *Entry = pImpl->ValueHandles[V];
   assert(Entry && "Value bit set but no entries exist");
 
-  // We use a local ValueHandleBase as an iterator so that
-  // ValueHandles can add and remove themselves from the list without
-  // breaking our iteration.  This is not really an AssertingVH; we
-  // just have to give ValueHandleBase some kind.
+  // We use a local ValueHandleBase as an iterator so that ValueHandles can add
+  // and remove themselves from the list without breaking our iteration.  This
+  // is not really an AssertingVH; we just have to give ValueHandleBase a kind.
+  // Note that we deliberately do not support the case when dropping a value
+  // handle results in a new value handle being permanently added to the list
+  // (as might occur in theory for CallbackVH's): the new value handle will not
+  // be processed and the checking code will mete out righteous punishment if
+  // the handle is still present once we have finished processing all the other
+  // value handles (it is fine to momentarily add then remove a value handle).
   for (ValueHandleBase Iterator(Assert, *Entry); Entry; Entry = Iterator.Next) {
     Iterator.RemoveFromUseList();
     Iterator.AddToExistingUseListAfter(Entry);
@@ -576,6 +577,24 @@ void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) {
       break;
     }
   }
+
+#ifndef NDEBUG
+  // If any new tracking or weak value handles were added while processing the
+  // list, then complain about it now.
+  if (Old->HasValueHandle)
+    for (Entry = pImpl->ValueHandles[Old]; Entry; Entry = Entry->Next)
+      switch (Entry->getKind()) {
+      case Tracking:
+      case Weak:
+        dbgs() << "After RAUW from " << *Old->getType() << " %"
+               << Old->getNameStr() << " to " << *New->getType() << " %"
+               << New->getNameStr() << "\n";
+        llvm_unreachable("A tracking or weak value handle still pointed to the"
+                         " old value!\n");
+      default:
+        break;
+      }
+#endif
 }
 
 /// ~CallbackVH. Empty, but defined here to avoid emitting the vtable
diff --git a/lib/VMCore/ValueSymbolTable.cpp b/lib/VMCore/ValueSymbolTable.cpp
index 449d61a2cbb18..254bf06439d9b 100644
--- a/lib/VMCore/ValueSymbolTable.cpp
+++ b/lib/VMCore/ValueSymbolTable.cpp
@@ -115,5 +115,3 @@ void ValueSymbolTable::dump() const {
     //DEBUG(dbgs() << "\n");
   }
 }
-
-MDSymbolTable::~MDSymbolTable() { }
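The new #ifndef NDEBUG block above turns an implicit contract into a hard check: after replaceAllUsesWith, no Weak or Tracking handle may still point at the old value, because ValueIsRAUWd retargets them as it walks the handle list. A small usage sketch of that contract, assuming the in-tree headers of this vintage (llvm/Support/ValueHandle.h):

#include "llvm/Instructions.h"
#include "llvm/Support/ValueHandle.h"

using namespace llvm;

// Old and New must have the same type; Old must live in a basic block.
void replaceAndCheck(Instruction *Old, Value *New) {
  WeakVH Handle(Old);              // Registers on Old's value-handle list.
  Old->replaceAllUsesWith(New);    // Fires ValueHandleBase::ValueIsRAUWd.
  assert(Handle == New && "weak handles follow RAUW to the new value");
  Old->eraseFromParent();          // Deleting Old is now safe: Handle was
                                   // already retargeted, so the
                                   // ValueIsDeleted walk won't touch it.
}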
diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
index f97699dabd890..e3ecc979bf128 100644
--- a/lib/VMCore/Verifier.cpp
+++ b/lib/VMCore/Verifier.cpp
@@ -72,7 +72,7 @@ namespace {  // Anonymous namespace for class
   struct PreVerifier : public FunctionPass {
     static char ID; // Pass ID, replacement for typeid
 
-    PreVerifier() : FunctionPass(&ID) { }
+    PreVerifier() : FunctionPass(ID) { }
 
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.setPreservesAll();
@@ -102,9 +102,9 @@ namespace {  // Anonymous namespace for class
 }
 
 char PreVerifier::ID = 0;
-static RegisterPass<PreVerifier>
-PreVer("preverify", "Preliminary module verification");
-static const PassInfo *const PreVerifyID = &PreVer;
+INITIALIZE_PASS(PreVerifier, "preverify", "Preliminary module verification",
+                false, false);
+char &PreVerifyID = PreVerifier::ID;
 
 namespace {
   class TypeSet : public AbstractTypeUser {
@@ -182,23 +182,13 @@ namespace {
     SmallPtrSet<MDNode *, 32> MDNodes;
 
     Verifier()
-      : FunctionPass(&ID),
+      : FunctionPass(ID),
       Broken(false), RealPass(true), action(AbortProcessAction),
       Mod(0), Context(0), DT(0), MessagesStr(Messages) {}
     explicit Verifier(VerifierFailureAction ctn)
-      : FunctionPass(&ID),
+      : FunctionPass(ID),
       Broken(false), RealPass(true), action(ctn), Mod(0), Context(0), DT(0),
       MessagesStr(Messages) {}
-    explicit Verifier(bool AB)
-      : FunctionPass(&ID),
-      Broken(false), RealPass(true),
-      action( AB ? AbortProcessAction : PrintMessageAction), Mod(0),
-      Context(0), DT(0), MessagesStr(Messages) {}
-    explicit Verifier(DominatorTree &dt)
-      : FunctionPass(&ID),
-      Broken(false), RealPass(false), action(PrintMessageAction), Mod(0),
-      Context(0), DT(&dt), MessagesStr(Messages) {}
-
     bool doInitialization(Module &M) {
       Mod = &M;
@@ -331,6 +321,7 @@ namespace {
     void visitBranchInst(BranchInst &BI);
     void visitReturnInst(ReturnInst &RI);
     void visitSwitchInst(SwitchInst &SI);
+    void visitIndirectBrInst(IndirectBrInst &BI);
    void visitSelectInst(SelectInst &SI);
     void visitUserOp1(Instruction &I);
     void visitUserOp2(Instruction &I) { visitUserOp1(I); }
@@ -402,7 +393,7 @@ namespace {
 } // End anonymous namespace
 
 char Verifier::ID = 0;
-static RegisterPass<Verifier> X("verify", "Module Verifier");
+INITIALIZE_PASS(Verifier, "verify", "Module Verifier", false, false);
 
 // Assert - We know that cond should be true, if not print an error message.
 #define Assert(C, M) \
@@ -445,6 +436,10 @@ void Verifier::visitGlobalValue(GlobalValue &GV) {
     Assert1(GVar && GVar->getType()->getElementType()->isArrayTy(),
             "Only global arrays can have appending linkage!", GVar);
   }
+
+  Assert1(!GV.hasLinkerPrivateWeakDefAutoLinkage() || GV.hasDefaultVisibility(),
+          "linker_private_weak_def_auto can only have default visibility!",
+          &GV);
 }
 
 void Verifier::visitGlobalVariable(GlobalVariable &GV) {
@@ -504,8 +499,8 @@ void Verifier::visitNamedMDNode(NamedMDNode &NMD) {
     if (!MD)
       continue;
 
-    Assert2(!MD->isFunctionLocal(),
-            "Named metadata operand cannot be function local!", &NMD, MD);
+    Assert1(!MD->isFunctionLocal(),
+            "Named metadata operand cannot be function local!", MD);
     visitMDNode(*MD, 0);
   }
 }
@@ -520,7 +515,7 @@ void Verifier::visitMDNode(MDNode &MD, Function *F) {
     Value *Op = MD.getOperand(i);
     if (!Op)
       continue;
-    if (isa<Constant>(Op) || isa<MDString>(Op) || isa<NamedMDNode>(Op))
+    if (isa<Constant>(Op) || isa<MDString>(Op))
      continue;
     if (MDNode *N = dyn_cast<MDNode>(Op)) {
       Assert2(MD.isFunctionLocal() || !N->isFunctionLocal(),
@@ -864,6 +859,16 @@ void Verifier::visitSwitchInst(SwitchInst &SI) {
   visitTerminatorInst(SI);
 }
 
+void Verifier::visitIndirectBrInst(IndirectBrInst &BI) {
+  Assert1(BI.getAddress()->getType()->isPointerTy(),
+          "Indirectbr operand must have pointer type!", &BI);
+  for (unsigned i = 0, e = BI.getNumDestinations(); i != e; ++i)
+    Assert1(BI.getDestination(i)->getType()->isLabelTy(),
+            "Indirectbr destinations must all have label type!", &BI);
+
+  visitTerminatorInst(BI);
+}
+
 void Verifier::visitSelectInst(SelectInst &SI) {
   Assert1(!SelectInst::areInvalidOperands(SI.getOperand(0), SI.getOperand(1),
                                           SI.getOperand(2)),
@@ -1202,6 +1207,7 @@ void Verifier::visitCallInst(CallInst &CI) {
 
 void Verifier::visitInvokeInst(InvokeInst &II) {
   VerifyCallSite(&II);
+  visitTerminatorInst(II);
 }
 
 /// visitBinaryOperator - Check that both arguments to the binary operator are
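With visitIndirectBrInst wired into the visitor (see the hunk above), indirectbr is now checked like any other terminator: its address operand must be a pointer and every destination a label. A sketch of building a trivially valid indirectbr and running the checks, assuming the 2.8-era C++ API (llvm/Analysis/Verifier.h and friends):

#include "llvm/Analysis/Verifier.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Type.h"

using namespace llvm;

// Builds: define void @f() { entry: indirectbr i8* blockaddress(@f, %done),
// [label %done]  done: ret void } and verifies it.
bool buildAndVerify() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  Function *F = cast<Function>(
      M.getOrInsertFunction("f", Type::getVoidTy(Ctx), (Type *)0));
  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
  BasicBlock *Done = BasicBlock::Create(Ctx, "done", F);
  ReturnInst::Create(Ctx, Done);
  IndirectBrInst *IBI =
      IndirectBrInst::Create(BlockAddress::get(F, Done), 1, Entry);
  IBI->addDestination(Done);
  // verifyModule returns true if the module is broken; the new
  // visitIndirectBrInst assertions fire on a non-pointer address or a
  // non-label destination.
  return !verifyModule(M, ReturnStatusAction);
}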
@@ -1266,28 +1272,37 @@ void Verifier::visitBinaryOperator(BinaryOperator &B) {
   visitInstruction(B);
 }
 
-void Verifier::visitICmpInst(ICmpInst& IC) {
+void Verifier::visitICmpInst(ICmpInst &IC) {
   // Check that the operands are the same type
-  const Type* Op0Ty = IC.getOperand(0)->getType();
-  const Type* Op1Ty = IC.getOperand(1)->getType();
+  const Type *Op0Ty = IC.getOperand(0)->getType();
+  const Type *Op1Ty = IC.getOperand(1)->getType();
   Assert1(Op0Ty == Op1Ty,
           "Both operands to ICmp instruction are not of the same type!", &IC);
   // Check that the operands are the right type
   Assert1(Op0Ty->isIntOrIntVectorTy() || Op0Ty->isPointerTy(),
           "Invalid operand types for ICmp instruction", &IC);
+  // Check that the predicate is valid.
+  Assert1(IC.getPredicate() >= CmpInst::FIRST_ICMP_PREDICATE &&
+          IC.getPredicate() <= CmpInst::LAST_ICMP_PREDICATE,
+          "Invalid predicate in ICmp instruction!", &IC);
 
   visitInstruction(IC);
 }
 
-void Verifier::visitFCmpInst(FCmpInst& FC) {
+void Verifier::visitFCmpInst(FCmpInst &FC) {
   // Check that the operands are the same type
-  const Type* Op0Ty = FC.getOperand(0)->getType();
-  const Type* Op1Ty = FC.getOperand(1)->getType();
+  const Type *Op0Ty = FC.getOperand(0)->getType();
+  const Type *Op1Ty = FC.getOperand(1)->getType();
   Assert1(Op0Ty == Op1Ty,
           "Both operands to FCmp instruction are not of the same type!", &FC);
   // Check that the operands are the right type
   Assert1(Op0Ty->isFPOrFPVectorTy(),
           "Invalid operand types for FCmp instruction", &FC);
+  // Check that the predicate is valid.
+  Assert1(FC.getPredicate() >= CmpInst::FIRST_FCMP_PREDICATE &&
+          FC.getPredicate() <= CmpInst::LAST_FCMP_PREDICATE,
+          "Invalid predicate in FCmp instruction!", &FC);
+
   visitInstruction(FC);
 }
 
@@ -1310,27 +1325,6 @@ void Verifier::visitShuffleVectorInst(ShuffleVectorInst &SV) {
   Assert1(ShuffleVectorInst::isValidOperands(SV.getOperand(0), SV.getOperand(1),
                                              SV.getOperand(2)),
           "Invalid shufflevector operands!", &SV);
-
-  const VectorType *VTy = dyn_cast<VectorType>(SV.getOperand(0)->getType());
-  Assert1(VTy, "Operands are not a vector type", &SV);
-
-  // Check to see if Mask is valid.
-  if (const ConstantVector *MV = dyn_cast<ConstantVector>(SV.getOperand(2))) {
-    for (unsigned i = 0, e = MV->getNumOperands(); i != e; ++i) {
-      if (ConstantInt* CI = dyn_cast<ConstantInt>(MV->getOperand(i))) {
-        Assert1(!CI->uge(VTy->getNumElements()*2),
-                "Invalid shufflevector shuffle mask!", &SV);
-      } else {
-        Assert1(isa<UndefValue>(MV->getOperand(i)),
-                "Invalid shufflevector shuffle mask!", &SV);
-      }
-    }
-  } else {
-    Assert1(isa<UndefValue>(SV.getOperand(2)) ||
-            isa<ConstantAggregateZero>(SV.getOperand(2)),
-            "Invalid shufflevector shuffle mask!", &SV);
-  }
-
   visitInstruction(SV);
 }
 
@@ -1408,10 +1402,6 @@ void Verifier::visitInstruction(Instruction &I) {
             "Only PHI nodes may reference their own value!", &I);
   }
 
-  // Verify that if this is a terminator that it is at the end of the block.
-  if (isa<TerminatorInst>(I))
-    Assert1(BB->getTerminator() == &I, "Terminator not at end of block!", &I);
-
   // Check that void typed values don't have names
   Assert1(!I.getType()->isVoidTy() || !I.hasName(),
           "Instruction has a name, but provides a void value!", &I);
@@ -1570,7 +1560,8 @@ void Verifier::VerifyType(const Type *Ty) {
               "Function type with invalid parameter type", ElTy, FTy);
       VerifyType(ElTy);
     }
-  } break;
+    break;
+  }
   case Type::StructTyID: {
     const StructType *STy = cast<StructType>(Ty);
     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
@@ -1579,34 +1570,29 @@ void Verifier::VerifyType(const Type *Ty) {
       const Type *ElTy = STy->getElementType(i);
       Assert2(StructType::isValidElementType(ElTy),
               "Structure type with invalid element type", ElTy, STy);
       VerifyType(ElTy);
     }
-  } break;
-  case Type::UnionTyID: {
-    const UnionType *UTy = cast<UnionType>(Ty);
-    for (unsigned i = 0, e = UTy->getNumElements(); i != e; ++i) {
-      const Type *ElTy = UTy->getElementType(i);
-      Assert2(UnionType::isValidElementType(ElTy),
-              "Union type with invalid element type", ElTy, UTy);
-      VerifyType(ElTy);
-    }
-  } break;
+    break;
+  }
   case Type::ArrayTyID: {
     const ArrayType *ATy = cast<ArrayType>(Ty);
     Assert1(ArrayType::isValidElementType(ATy->getElementType()),
             "Array type with invalid element type", ATy);
     VerifyType(ATy->getElementType());
-  } break;
+    break;
+  }
   case Type::PointerTyID: {
     const PointerType *PTy = cast<PointerType>(Ty);
     Assert1(PointerType::isValidElementType(PTy->getElementType()),
             "Pointer type with invalid element type", PTy);
     VerifyType(PTy->getElementType());
-  } break;
+    break;
+  }
  case Type::VectorTyID: {
     const VectorType *VTy = cast<VectorType>(Ty);
     Assert1(VectorType::isValidElementType(VTy->getElementType()),
             "Vector type with invalid element type", VTy);
     VerifyType(VTy->getElementType());
-  } break;
+    break;
+  }
   default:
     break;
   }
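The mask-walking loop deleted from visitShuffleVectorInst above did not disappear; equivalent checking sits behind ShuffleVectorInst::isValidOperands, the same predicate IR construction uses, so the verifier and the builders can no longer drift apart. A sketch of probing a mask through that single entry point (assumes the 2.8-era constants API; the helper name is illustrative):

#include "llvm/Constants.h"
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include <vector>

using namespace llvm;

// Builds the <4 x i32> mask <0, 5, undef, 2> and asks isValidOperands to vet
// it. For two <4 x T> sources, lanes 0-3 select from the first vector and
// lanes 4-7 from the second, so 8 would be the first out-of-range index.
bool exampleMaskIsValid(LLVMContext &Ctx, Value *A, Value *B) {
  const Type *I32 = Type::getInt32Ty(Ctx);
  std::vector<Constant*> Lanes;
  Lanes.push_back(ConstantInt::get(I32, 0));
  Lanes.push_back(ConstantInt::get(I32, 5));
  Lanes.push_back(UndefValue::get(I32));
  Lanes.push_back(ConstantInt::get(I32, 2));
  return ShuffleVectorInst::isValidOperands(A, B, ConstantVector::get(Lanes));
}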
@@ -1832,8 +1818,13 @@ bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty,
   // and iPTR. In the verifier, we can not distinguish which case we have so
   // allow either case to be legal.
   if (const PointerType* PTyp = dyn_cast<PointerType>(Ty)) {
-    Suffix += ".p" + utostr(PTyp->getAddressSpace()) +
-      EVT::getEVT(PTyp->getElementType()).getEVTString();
+    EVT PointeeVT = EVT::getEVT(PTyp->getElementType(), true);
+    if (PointeeVT == MVT::Other) {
+      CheckFailed("Intrinsic has pointer to complex type.");
+      return false;
+    }
+    Suffix += ".p" + utostr(PTyp->getAddressSpace()) +
+              PointeeVT.getEVTString();
   } else {
     CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not a "
                 "pointer and a pointer is required.", F);
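For context on what the MVT::Other guard above prevents: the suffix being assembled is the name mangling for overloaded intrinsics, where an i8 addrspace(1)* operand contributes ".p1i8"; previously, a pointee with no simple value type would assert inside EVT::getEVT instead of producing a verifier diagnostic. A sketch of the same construction in isolation (assumes the 2.8-era EVT API from llvm/CodeGen/ValueTypes.h; pointerSuffix is an illustrative helper, not in tree):

#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/DerivedTypes.h"
#include <string>

using namespace llvm;

// Returns e.g. ".p1i8" for i8 addrspace(1)*, or "" when the pointee has no
// simple EVT -- the case the verifier now rejects instead of asserting.
std::string pointerSuffix(const PointerType *PTy) {
  EVT PointeeVT = EVT::getEVT(PTy->getElementType(), /*HandleUnknown=*/true);
  if (PointeeVT == MVT::Other)
    return std::string();  // Caller reports "pointer to complex type".
  return ".p" + utostr(PTy->getAddressSpace()) + PointeeVT.getEVTString();
}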