28 files changed, 1146 insertions, 493 deletions
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index 03914ef98952b..e48ba38453268 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -3837,11 +3837,11 @@ void CodeGenDAGPatterns::GenerateVariants() {
       if (AlreadyExists) continue;
 
       // Otherwise, add it to the list of patterns we have.
-      PatternsToMatch.emplace_back(
+      PatternsToMatch.push_back(PatternToMatch(
           PatternsToMatch[i].getSrcRecord(), PatternsToMatch[i].getPredicates(),
           Variant, PatternsToMatch[i].getDstPattern(),
           PatternsToMatch[i].getDstRegs(),
-          PatternsToMatch[i].getAddedComplexity(), Record::getNewUID());
+          PatternsToMatch[i].getAddedComplexity(), Record::getNewUID()));
     }
 
     DEBUG(errs() << "\n");
diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp
index 3907336221a44..d4a21a986c585 100644
--- a/utils/TableGen/CodeGenRegisters.cpp
+++ b/utils/TableGen/CodeGenRegisters.cpp
@@ -915,6 +915,84 @@ void CodeGenRegisterClass::computeSubClasses(CodeGenRegBank &RegBank) {
       RC.inheritProperties(RegBank);
 }
 
+Optional<std::pair<CodeGenRegisterClass *, CodeGenRegisterClass *>>
+CodeGenRegisterClass::getMatchingSubClassWithSubRegs(
+    CodeGenRegBank &RegBank, const CodeGenSubRegIndex *SubIdx) const {
+  auto SizeOrder = [](const CodeGenRegisterClass *A,
+                      const CodeGenRegisterClass *B) {
+    return A->getMembers().size() > B->getMembers().size();
+  };
+
+  auto &RegClasses = RegBank.getRegClasses();
+
+  // Collect the subclasses that fully support SubIdx, biggest first. The sort
+  // is stable so ties keep RegClasses order. BiggestSuperRegRC must be first.
+  CodeGenRegisterClass *BiggestSuperRegRC = getSubClassWithSubReg(SubIdx);
+  if (!BiggestSuperRegRC)
+    return None;
+  BitVector SuperRegRCsBV = BiggestSuperRegRC->getSubClasses();
+  std::vector<CodeGenRegisterClass *> SuperRegRCs;
+  for (auto &RC : RegClasses)
+    if (SuperRegRCsBV[RC.EnumValue])
+      SuperRegRCs.emplace_back(&RC);
+  std::stable_sort(SuperRegRCs.begin(), SuperRegRCs.end(), SizeOrder);
+  assert(SuperRegRCs.front() == BiggestSuperRegRC && "Biggest class wasn't first");
+
+  // Find all the subreg classes and order them by size (stably) too.
+  std::vector<std::pair<CodeGenRegisterClass *, BitVector>> SuperRegClasses;
+  for (auto &RC: RegClasses) {
+    BitVector SuperRegClassesBV(RegClasses.size());
+    RC.getSuperRegClasses(SubIdx, SuperRegClassesBV);
+    if (SuperRegClassesBV.any())
+      SuperRegClasses.push_back(std::make_pair(&RC, SuperRegClassesBV));
+  }
+  std::stable_sort(SuperRegClasses.begin(), SuperRegClasses.end(),
+                   [&](const std::pair<CodeGenRegisterClass *, BitVector> &A,
+                       const std::pair<CodeGenRegisterClass *, BitVector> &B) {
+                     return SizeOrder(A.first, B.first);
+                   });
+
+  // Find the biggest subclass and subreg class such that R:subidx is in the
+  // subreg class for all R in subclass.
+  //
+  // For example:
+  // All registers in X86's GR64 have a sub_32bit subregister but no class
+  // exists that contains all the 32-bit subregisters because GR64 contains RIP
+  // but GR32 does not contain EIP. Instead, we constrain SuperRegRC to
+  // GR32_with_sub_8bit (which is identical to GR32_with_sub_32bit) and then,
+  // having excluded RIP, we are able to find a SubRegRC (GR32).
+  CodeGenRegisterClass *ChosenSuperRegClass = nullptr;
+  CodeGenRegisterClass *SubRegRC = nullptr;
+  for (auto *SuperRegRC : SuperRegRCs) {
+    for (const auto &SuperRegClassPair : SuperRegClasses) {
+      const BitVector &SuperRegClassBV = SuperRegClassPair.second;
+      if (SuperRegClassBV[SuperRegRC->EnumValue]) {
+        SubRegRC = SuperRegClassPair.first;
+        ChosenSuperRegClass = SuperRegRC;
+
+        // If SubRegRC is bigger than SuperRegRC then there are members of
+        // SubRegRC that don't have super registers via SubIdx. Keep looking to
+        // find a better fit and fall back on this one if there isn't one.
+        //
+        // This is intended to prevent X86 from making odd choices such as
+        // picking LOW32_ADDR_ACCESS_RBP instead of GR32 in the example above.
+        // LOW32_ADDR_ACCESS_RBP is a valid choice but contains registers that
+        // aren't subregisters of SuperRegRC whereas GR32 has a direct 1:1
+        // mapping.
+        if (SuperRegRC->getMembers().size() >= SubRegRC->getMembers().size())
+          return std::make_pair(ChosenSuperRegClass, SubRegRC);
+      }
+    }
+
+    // If we found a fit but it wasn't quite ideal because SubRegRC had excess
+    // registers, then we're done.
+    if (ChosenSuperRegClass)
+      return std::make_pair(ChosenSuperRegClass, SubRegRC);
+  }
+
+  return None;
+}
+
 void CodeGenRegisterClass::getSuperRegClasses(const CodeGenSubRegIndex *SubIdx,
                                               BitVector &Out) const {
   auto FindI = SuperRegClasses.find(SubIdx);
diff --git a/utils/TableGen/CodeGenRegisters.h b/utils/TableGen/CodeGenRegisters.h
index 1fcba8a135d18..d0f96a035ea19 100644
--- a/utils/TableGen/CodeGenRegisters.h
+++ b/utils/TableGen/CodeGenRegisters.h
@@ -329,6 +329,9 @@ namespace llvm {
     const std::string &getName() const { return Name; }
     std::string getQualifiedName() const;
     ArrayRef<MVT::SimpleValueType> getValueTypes() const {return VTs;}
+    bool hasValueType(MVT::SimpleValueType VT) const {
+      return std::find(VTs.begin(), VTs.end(), VT) != VTs.end();
+    }
     unsigned getNumValueTypes() const { return VTs.size(); }
 
     MVT::SimpleValueType getValueTypeNum(unsigned VTNum) const {
@@ -360,6 +363,18 @@ namespace llvm {
       return SubClassWithSubReg.lookup(SubIdx);
     }
 
+    /// Find the largest subclass where all registers have SubIdx subregisters
+    /// and the largest subregister class that contains those subregisters
+    /// without (as far as possible) also containing additional registers. This
+    /// can be used to find a suitable pair of classes for subregister copies.
+    ///
+    /// \return std::pair<SubClass, SubRegClass> where every register in
+    /// SubClass has SubIdx and SubRegClass contains the SubIdx subregister of
+    /// every register in SubClass, or None if no such pair of classes exists.
+    Optional<std::pair<CodeGenRegisterClass *, CodeGenRegisterClass *>>
+    getMatchingSubClassWithSubRegs(CodeGenRegBank &RegBank,
+                                   const CodeGenSubRegIndex *SubIdx) const;
+
     void setSubClassWithSubReg(const CodeGenSubRegIndex *SubIdx,
                                CodeGenRegisterClass *SubRC) {
       SubClassWithSubReg[SubIdx] = SubRC;
@@ -370,7 +385,7 @@ namespace llvm {
     void getSuperRegClasses(const CodeGenSubRegIndex *SubIdx,
                             BitVector &Out) const;
 
-    // addSuperRegClass - Add a class containing only SudIdx super-registers.
+    // addSuperRegClass - Add a class containing only SubIdx super-registers.
     void addSuperRegClass(CodeGenSubRegIndex *SubIdx,
                           CodeGenRegisterClass *SuperRC) {
       SuperRegClasses[SubIdx].insert(SuperRC);
diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
index 20f6047052ff0..50569b2ad9896 100644
--- a/utils/TableGen/CodeGenSchedule.cpp
+++ b/utils/TableGen/CodeGenSchedule.cpp
@@ -140,6 +140,7 @@ CodeGenSchedModels::CodeGenSchedModels(RecordKeeper &RK,
 
   // Populate each CodeGenProcModel's WriteResDefs, ReadAdvanceDefs, and
   // ProcResourceDefs.
+  DEBUG(dbgs() << "\n+++ RESOURCE DEFINITIONS (collectProcResources) +++\n");
   collectProcResources();
 
   checkCompleteness();
@@ -160,6 +161,7 @@ void CodeGenSchedModels::collectProcModels() {
   ProcModelMap[NoModelDef] = 0;
 
   // For each processor, find a unique machine model.
+  DEBUG(dbgs() << "+++ PROCESSOR MODELs (addProcModel) +++\n");
   for (unsigned i = 0, N = ProcRecords.size(); i < N; ++i)
     addProcModel(ProcRecords[i]);
 }
@@ -315,6 +317,7 @@ void CodeGenSchedModels::collectSchedRW() {
     RW.Aliases.push_back(*AI);
   }
   DEBUG(
+    dbgs() << "\n+++ SCHED READS and WRITES (collectSchedRW) +++\n";
     for (unsigned WIdx = 0, WEnd = SchedWrites.size(); WIdx != WEnd; ++WIdx) {
       dbgs() << WIdx << ": ";
       SchedWrites[WIdx].dump();
@@ -531,6 +534,7 @@ void CodeGenSchedModels::collectSchedClasses() {
   // Create classes for InstRW defs.
   RecVec InstRWDefs = Records.getAllDerivedDefinitions("InstRW");
   std::sort(InstRWDefs.begin(), InstRWDefs.end(), LessRecord());
+  DEBUG(dbgs() << "\n+++ SCHED CLASSES (createInstRWClass) +++\n");
   for (RecIter OI = InstRWDefs.begin(), OE = InstRWDefs.end(); OI != OE; ++OI)
     createInstRWClass(*OI);
 
@@ -541,6 +545,7 @@ void CodeGenSchedModels::collectSchedClasses() {
   if (!EnableDump)
     return;
 
+  dbgs() << "\n+++ ITINERARIES and/or MACHINE MODELS (collectSchedClasses) +++\n";
   for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
     StringRef InstName = Inst->TheDef->getName();
     unsigned SCIdx = InstrClassMap.lookup(Inst->TheDef);
@@ -790,6 +795,7 @@ bool CodeGenSchedModels::hasItineraries() const {
 
 // Gather the processor itineraries.
 void CodeGenSchedModels::collectProcItins() {
+  DEBUG(dbgs() << "\n+++ PROBLEM ITINERARIES (collectProcItins) +++\n");
   for (CodeGenProcModel &ProcModel : ProcModels) {
     if (!ProcModel.hasItineraries())
       continue;
@@ -860,6 +866,7 @@ void CodeGenSchedModels::collectProcUnsupportedFeatures() {
 /// Infer new classes from existing classes. In the process, this may create new
 /// SchedWrites from sequences of existing SchedWrites.
 void CodeGenSchedModels::inferSchedClasses() {
+  DEBUG(dbgs() << "\n+++ INFERRING SCHED CLASSES (inferSchedClasses) +++\n");
   DEBUG(dbgs() << NumInstrSchedClasses << " instr sched classes.\n");
 
   // Visit all existing classes and newly created classes.
diff --git a/utils/TableGen/DAGISelMatcher.h b/utils/TableGen/DAGISelMatcher.h
index 6bda9ca5f96fa..c672b0acac9fe 100644
--- a/utils/TableGen/DAGISelMatcher.h
+++ b/utils/TableGen/DAGISelMatcher.h
@@ -208,7 +208,7 @@ public:
     Children.resize(NC);
   }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == Scope;
   }
 
@@ -233,7 +233,7 @@ public:
   const std::string &getWhatFor() const { return WhatFor; }
   unsigned getResultNo() const { return ResultNo; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == RecordNode;
   }
 
@@ -265,7 +265,7 @@ public:
   const std::string &getWhatFor() const { return WhatFor; }
   unsigned getResultNo() const { return ResultNo; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == RecordChild;
   }
 
@@ -281,7 +281,7 @@ class RecordMemRefMatcher : public Matcher {
 public:
   RecordMemRefMatcher() : Matcher(RecordMemRef) {}
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == RecordMemRef;
   }
 
@@ -297,7 +297,7 @@ class CaptureGlueInputMatcher : public Matcher {
 public:
   CaptureGlueInputMatcher() : Matcher(CaptureGlueInput) {}
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CaptureGlueInput;
   }
 
@@ -315,7 +315,7 @@ public:
 
   unsigned getChildNo() const { return ChildNo; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == MoveChild;
   }
 
@@ -332,7 +332,7 @@ class MoveParentMatcher : public Matcher {
 public:
   MoveParentMatcher() : Matcher(MoveParent) {}
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == MoveParent;
   }
 
@@ -352,7 +352,7 @@ public:
 
   unsigned getMatchNumber() const { return MatchNumber; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckSame;
   }
 
@@ -376,7 +376,7 @@ public:
   unsigned getChildNo() const { return ChildNo; }
   unsigned getMatchNumber() const { return MatchNumber; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckChildSame;
   }
 
@@ -399,7 +399,7 @@ public:
 
   StringRef getPredicate() const { return Predicate; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckPatternPredicate;
   }
 
@@ -419,7 +419,7 @@ public:
 
   TreePredicateFn getPredicate() const;
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckPredicate;
   }
 
@@ -441,7 +441,7 @@ public:
 
   const SDNodeInfo &getOpcode() const { return Opcode; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckOpcode;
   }
 
@@ -462,7 +462,7 @@ public:
     : Matcher(SwitchOpcode), Cases(cases.begin(), cases.end()) {}
   ~SwitchOpcodeMatcher() override;
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == SwitchOpcode;
   }
 
@@ -489,7 +489,7 @@ public:
   MVT::SimpleValueType getType() const { return Type; }
   unsigned getResNo() const { return ResNo; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckType;
   }
 
@@ -512,7 +512,7 @@ public:
   : Matcher(SwitchType), Cases(cases.begin(), cases.end()) {}
   ~SwitchTypeMatcher() override;
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == SwitchType;
   }
 
@@ -540,7 +540,7 @@ public:
   unsigned getChildNo() const { return ChildNo; }
   MVT::SimpleValueType getType() const { return Type; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckChildType;
   }
 
@@ -564,7 +564,7 @@ public:
 
   int64_t getValue() const { return Value; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckInteger;
   }
 
@@ -588,7 +588,7 @@ public:
   unsigned getChildNo() const { return ChildNo; }
   int64_t getValue() const { return Value; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckChildInteger;
   }
 
@@ -611,7 +611,7 @@ public:
 
   StringRef getCondCodeName() const { return CondCodeName; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckCondCode;
   }
 
@@ -632,7 +632,7 @@ public:
 
   StringRef getTypeName() const { return TypeName; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckValueType;
   }
 
@@ -673,7 +673,7 @@ public:
   const std::string getName() const { return Name; }
   unsigned getFirstResult() const { return FirstResult; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckComplexPat;
   }
 
@@ -695,7 +695,7 @@ public:
 
   int64_t getValue() const { return Value; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckAndImm;
   }
 
@@ -716,7 +716,7 @@ public:
 
   int64_t getValue() const { return Value; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckOrImm;
   }
 
@@ -734,7 +734,7 @@ public:
   CheckFoldableChainNodeMatcher()
     : Matcher(CheckFoldableChainNode) {}
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CheckFoldableChainNode;
   }
 
@@ -754,7 +754,7 @@ public:
   int64_t getValue() const { return Val; }
   MVT::SimpleValueType getVT() const { return VT; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == EmitInteger;
   }
 
@@ -778,7 +778,7 @@ public:
   const std::string &getValue() const { return Val; }
   MVT::SimpleValueType getVT() const { return VT; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == EmitStringInteger;
   }
 
@@ -803,7 +803,7 @@ public:
   const CodeGenRegister *getReg() const { return Reg; }
   MVT::SimpleValueType getVT() const { return VT; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == EmitRegister;
   }
 
@@ -826,7 +826,7 @@ public:
 
   unsigned getSlot() const { return Slot; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == EmitConvertToTarget;
   }
 
@@ -854,7 +854,7 @@ public:
     return ChainNodes[i];
   }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == EmitMergeInputChains;
   }
 
@@ -878,7 +878,7 @@ public:
   unsigned getSrcSlot() const { return SrcSlot; }
   Record *getDestPhysReg() const { return DestPhysReg; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == EmitCopyToReg;
   }
 
@@ -904,7 +904,7 @@ public:
   unsigned getSlot() const { return Slot; }
   Record *getNodeXForm() const { return NodeXForm; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == EmitNodeXForm;
   }
 
@@ -964,7 +964,7 @@ public:
   bool hasMemRefs() const { return HasMemRefs; }
   int getNumFixedArityOperands() const { return NumFixedArityOperands; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == EmitNode || N->getKind() == MorphNodeTo;
   }
 
@@ -991,7 +991,7 @@ public:
 
   unsigned getFirstResultSlot() const { return FirstResultSlot; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == EmitNode;
   }
 
@@ -1015,7 +1015,7 @@ public:
 
   const PatternToMatch &getPattern() const { return Pattern; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == MorphNodeTo;
   }
 };
@@ -1036,7 +1036,7 @@ public:
   unsigned getResult(unsigned R) const { return Results[R]; }
   const PatternToMatch &getPattern() const { return Pattern; }
 
-  static inline bool classof(const Matcher *N) {
+  static bool classof(const Matcher *N) {
     return N->getKind() == CompleteMatch;
   }
 
diff --git a/utils/TableGen/GlobalISelEmitter.cpp b/utils/TableGen/GlobalISelEmitter.cpp
index 03d231a153dc3..924ed8f65c2c9 100644
--- a/utils/TableGen/GlobalISelEmitter.cpp
+++ b/utils/TableGen/GlobalISelEmitter.cpp
@@ -80,8 +80,8 @@ public:
       return;
     }
     if (Ty.isVector()) {
-      OS << "LLT::vector(" << Ty.getNumElements() << ", " << Ty.getScalarSizeInBits()
-         << ")";
+      OS << "LLT::vector(" << Ty.getNumElements() << ", "
+         << Ty.getScalarSizeInBits() << ")";
       return;
     }
     llvm_unreachable("Unhandled LLT");
@@ -96,7 +96,8 @@ class InstructionMatcher;
 static Optional<LLTCodeGen> MVTToLLT(MVT::SimpleValueType SVT) {
   MVT VT(SVT);
   if (VT.isVector() && VT.getVectorNumElements() != 1)
-    return LLTCodeGen(LLT::vector(VT.getVectorNumElements(), VT.getScalarSizeInBits()));
+    return LLTCodeGen(
+        LLT::vector(VT.getVectorNumElements(), VT.getScalarSizeInBits()));
   if (VT.isInteger() || VT.isFloatingPoint())
     return LLTCodeGen(LLT::scalar(VT.getSizeInBits()));
   return None;
@@ -241,12 +242,18 @@ public:
     return *static_cast<Kind *>(Predicates.back().get());
   }
 
-  typename PredicateVec::const_iterator predicates_begin() const { return Predicates.begin(); }
-  typename PredicateVec::const_iterator predicates_end() const { return Predicates.end(); }
+  typename PredicateVec::const_iterator predicates_begin() const {
+    return Predicates.begin();
+  }
+  typename PredicateVec::const_iterator predicates_end() const {
+    return Predicates.end();
+  }
   iterator_range<typename PredicateVec::const_iterator> predicates() const {
     return make_range(predicates_begin(), predicates_end());
   }
-  typename PredicateVec::size_type predicates_size() const { return Predicates.size(); }
+  typename PredicateVec::size_type predicates_size() const {
+    return Predicates.size();
+  }
 
   /// Emit a C++ expression that tests whether all the predicates are met.
   template <class... Args>
@@ -600,7 +607,8 @@ public:
   /// Compare the priority of this object and B.
   ///
   /// Returns true if this object is more important than B.
-  virtual bool isHigherPriorityThan(const InstructionPredicateMatcher &B) const {
+  virtual bool
+  isHigherPriorityThan(const InstructionPredicateMatcher &B) const {
     return Kind < B.Kind;
   };
 
@@ -631,7 +639,8 @@ public:
   /// Compare the priority of this object and B.
   ///
   /// Returns true if this object is more important than B.
-  bool isHigherPriorityThan(const InstructionPredicateMatcher &B) const override {
+  bool
+  isHigherPriorityThan(const InstructionPredicateMatcher &B) const override {
     if (InstructionPredicateMatcher::isHigherPriorityThan(B))
       return true;
     if (B.InstructionPredicateMatcher::isHigherPriorityThan(*this))
@@ -832,7 +841,13 @@ public:
 //===- Actions ------------------------------------------------------------===//
 class OperandRenderer {
 public:
-  enum RendererKind { OR_Copy, OR_Imm, OR_Register, OR_ComplexPattern };
+  enum RendererKind {
+    OR_Copy,
+    OR_CopySubReg,
+    OR_Imm,
+    OR_Register,
+    OR_ComplexPattern
+  };
 
 protected:
   RendererKind Kind;
@@ -877,6 +892,42 @@ public:
   }
 };
 
+/// A CopySubRegRenderer emits code that copies a single register operand from
+/// an existing instruction to the one being built, adding it via MIB.addReg()
+/// with the subregister index to indicate that only a subregister is copied.
+class CopySubRegRenderer : public OperandRenderer {
+protected:
+  /// The matcher for the instruction that this operand is copied from.
+  /// This provides the facility for looking up an operand by its name so
+  /// that it can be used as a source for the instruction being built.
+  const InstructionMatcher &Matched;
+  /// The name of the operand.
+  const StringRef SymbolicName;
+  /// The subregister index to extract.
+  const CodeGenSubRegIndex *SubReg;
+
+public:
+  CopySubRegRenderer(const InstructionMatcher &Matched, StringRef SymbolicName,
+                     const CodeGenSubRegIndex *SubReg)
+      : OperandRenderer(OR_CopySubReg), Matched(Matched),
+        SymbolicName(SymbolicName), SubReg(SubReg) {}
+
+  static bool classof(const OperandRenderer *R) {
+    return R->getKind() == OR_CopySubReg;
+  }
+
+  const StringRef getSymbolicName() const { return SymbolicName; }
+
+  void emitCxxRenderStmts(raw_ostream &OS, RuleMatcher &Rule) const override {
+    const OperandMatcher &Operand = Matched.getOperand(SymbolicName);
+    StringRef InsnVarName =
+        Rule.getInsnVarName(Operand.getInstructionMatcher());
+    std::string OperandExpr = Operand.getOperandExpr(InsnVarName);
+    OS << "    MIB.addReg(" << OperandExpr << ".getReg() /*" << SymbolicName
+       << "*/, 0, " << SubReg->EnumValue << ");\n";
+  }
+};
+
 /// Adds a specific physical register to the instruction being built.
 /// This is typically useful for WZR/XZR on AArch64.
 class AddRegisterRenderer : public OperandRenderer {
@@ -1076,7 +1127,8 @@ public:
 
   void emitCxxActionStmts(raw_ostream &OS, RuleMatcher &Rule,
                           StringRef RecycleVarName) const override {
-    OS << "      constrainSelectedInstRegOperands(" << Name << ", TII, TRI, RBI);\n";
+    OS << "      constrainSelectedInstRegOperands(" << Name
+       << ", TII, TRI, RBI);\n";
   }
 };
 
@@ -1123,14 +1175,16 @@ std::string RuleMatcher::defineInsnVar(raw_ostream &OS,
   return InsnVarName;
 }
 
-StringRef RuleMatcher::getInsnVarName(const InstructionMatcher &InsnMatcher) const {
+StringRef
+RuleMatcher::getInsnVarName(const InstructionMatcher &InsnMatcher) const {
   const auto &I = InsnVariableNames.find(&InsnMatcher);
   if (I != InsnVariableNames.end())
     return I->second;
   llvm_unreachable("Matched Insn was not captured in a local variable");
 }
 
-/// Emit a C++ initializer_list containing references to every matched instruction.
+/// Emit a C++ initializer_list containing references to every matched
+/// instruction.
 void RuleMatcher::emitCxxCapturedInsnList(raw_ostream &OS) {
   SmallVector<StringRef, 2> Names;
   for (const auto &Pair : InsnVariableNames)
@@ -1292,6 +1346,7 @@ private:
   const RecordKeeper &RK;
   const CodeGenDAGPatterns CGP;
   const CodeGenTarget &Target;
+  CodeGenRegBank CGRegs;
 
   /// Keep track of the equivalence between SDNodes and Instruction.
   /// This is defined using 'GINodeEquiv' in the target description.
@@ -1315,9 +1370,9 @@ private:
   Error importChildMatcher(InstructionMatcher &InsnMatcher,
                            const TreePatternNode *SrcChild, unsigned OpIdx,
                            unsigned &TempOpIdx) const;
-  Expected<BuildMIAction &> createAndImportInstructionRenderer(
-      RuleMatcher &M, const TreePatternNode *Dst,
-      const InstructionMatcher &InsnMatcher) const;
+  Expected<BuildMIAction &>
+  createAndImportInstructionRenderer(RuleMatcher &M, const TreePatternNode *Dst,
+                                     const InstructionMatcher &InsnMatcher);
   Error importExplicitUseRenderer(BuildMIAction &DstMIBuilder,
                                   TreePatternNode *DstChild,
                                   const InstructionMatcher &InsnMatcher) const;
@@ -1354,7 +1409,7 @@ const CodeGenInstruction *GlobalISelEmitter::findNodeEquiv(Record *N) const {
 }
 
 GlobalISelEmitter::GlobalISelEmitter(RecordKeeper &RK)
-    : RK(RK), CGP(RK), Target(CGP.getTargetInfo()) {}
+    : RK(RK), CGP(RK), Target(CGP.getTargetInfo()), CGRegs(RK) {}
 
 //===- Emitter ------------------------------------------------------------===//
 
@@ -1382,7 +1437,8 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
       InsnMatcher.addPredicate<InstructionOpcodeMatcher>(
           &Target.getInstruction(RK.getDef("G_CONSTANT")));
     } else
-      return failedImport("Unable to deduce gMIR opcode to handle Src (which is a leaf)");
+      return failedImport(
+          "Unable to deduce gMIR opcode to handle Src (which is a leaf)");
   } else {
     auto SrcGIOrNull = findNodeEquiv(Src->getOperator());
     if (!SrcGIOrNull)
@@ -1415,7 +1471,8 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
       OperandMatcher &OM = InsnMatcher.addOperand(OpIdx++, "", TempOpIdx);
       OM.addPredicate<LiteralIntOperandMatcher>(SrcIntInit->getValue());
     } else
-      return failedImport("Unable to deduce gMIR opcode to handle Src (which is a leaf)");
+      return failedImport(
+          "Unable to deduce gMIR opcode to handle Src (which is a leaf)");
   } else {
     // Match the used operands (i.e. the children of the operator).
     for (unsigned i = 0, e = Src->getNumChildren(); i != e; ++i) {
@@ -1585,7 +1642,7 @@ Error GlobalISelEmitter::importExplicitUseRenderer(
 
 Expected<BuildMIAction &> GlobalISelEmitter::createAndImportInstructionRenderer(
     RuleMatcher &M, const TreePatternNode *Dst,
-    const InstructionMatcher &InsnMatcher) const {
+    const InstructionMatcher &InsnMatcher) {
   Record *DstOp = Dst->getOperator();
   if (!DstOp->isSubClassOf("Instruction")) {
     if (DstOp->isSubClassOf("ValueType"))
@@ -1597,13 +1654,17 @@ Expected<BuildMIAction &> GlobalISelEmitter::createAndImportInstructionRenderer(
 
   unsigned DstINumUses = DstI->Operands.size() - DstI->Operands.NumDefs;
   unsigned ExpectedDstINumUses = Dst->getNumChildren();
+  bool IsExtractSubReg = false;
 
   // COPY_TO_REGCLASS is just a copy with a ConstrainOperandToRegClassAction
-  // attached.
+  // attached. Similarly for EXTRACT_SUBREG except that's a subregister copy.
   if (DstI->TheDef->getName() == "COPY_TO_REGCLASS") {
     DstI = &Target.getInstruction(RK.getDef("COPY"));
     DstINumUses--; // Ignore the class constraint.
     ExpectedDstINumUses--;
+  } else if (DstI->TheDef->getName() == "EXTRACT_SUBREG") {
+    DstI = &Target.getInstruction(RK.getDef("COPY"));
+    IsExtractSubReg = true;
   }
 
   auto &DstMIBuilder = M.addAction<BuildMIAction>("NewI", DstI, InsnMatcher);
@@ -1614,6 +1675,33 @@ Expected<BuildMIAction &> GlobalISelEmitter::createAndImportInstructionRenderer(
     DstMIBuilder.addRenderer<CopyRenderer>(InsnMatcher, DstIOperand.Name);
   }
 
+  // EXTRACT_SUBREG needs to use a subregister COPY.
+  if (IsExtractSubReg) {
+    if (!Dst->getChild(0)->isLeaf())
+      return failedImport("EXTRACT_SUBREG child #1 is not a leaf");
+
+    if (DefInit *SubRegInit =
+            dyn_cast<DefInit>(Dst->getChild(1)->getLeafValue())) {
+      CodeGenRegisterClass *RC = CGRegs.getRegClass(
+          getInitValueAsRegClass(Dst->getChild(0)->getLeafValue()));
+      CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef());
+
+      const auto &SrcRCDstRCPair =
+          RC->getMatchingSubClassWithSubRegs(CGRegs, SubIdx);
+      if (SrcRCDstRCPair.hasValue()) {
+        assert(SrcRCDstRCPair->second && "Couldn't find a matching subclass");
+        if (SrcRCDstRCPair->first != RC)
+          return failedImport("EXTRACT_SUBREG requires an additional COPY");
+      }
+
+      DstMIBuilder.addRenderer<CopySubRegRenderer>(
+          InsnMatcher, Dst->getChild(0)->getName(), SubIdx);
+      return DstMIBuilder;
+    }
+
+    return failedImport("EXTRACT_SUBREG child #1 is not a subreg index");
+  }
+
   // Render the explicit uses.
   unsigned Child = 0;
   unsigned NumDefaultOps = 0;
@@ -1740,10 +1828,22 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
       if (DstIOpRec == nullptr)
         return failedImport(
             "COPY_TO_REGCLASS operand #1 isn't a register class");
+    } else if (DstI.TheDef->getName() == "EXTRACT_SUBREG") {
+      if (!Dst->getChild(0)->isLeaf())
+        return failedImport("EXTRACT_SUBREG operand #0 isn't a leaf");
+
+      // We can assume that a subregister is in the same bank as its super
+      // register.
+      DstIOpRec = getInitValueAsRegClass(Dst->getChild(0)->getLeafValue());
+
+      if (DstIOpRec == nullptr)
+        return failedImport(
+            "EXTRACT_SUBREG operand #0 isn't a register class");
     } else if (DstIOpRec->isSubClassOf("RegisterOperand"))
       DstIOpRec = DstIOpRec->getValueAsDef("RegClass");
     else if (!DstIOpRec->isSubClassOf("RegisterClass"))
-      return failedImport("Dst MI def isn't a register class" + to_string(*Dst));
+      return failedImport("Dst MI def isn't a register class" +
+                          to_string(*Dst));
 
     OperandMatcher &OM = InsnMatcher.getOperand(OpIdx);
     OM.setSymbolicName(DstIOperand.Name);
@@ -1776,6 +1876,52 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
 
     M.addAction<ConstrainOperandToRegClassAction>(
         "NewI", 0, Target.getRegisterClass(DstIOpRec));
+
+    // We're done with this pattern!  It's eligible for GISel emission; return
+    // it.
+    ++NumPatternImported;
+    return std::move(M);
+  }
+
+  if (DstI.TheDef->getName() == "EXTRACT_SUBREG") {
+    // EXTRACT_SUBREG selects into a subregister COPY but unlike most
+    // instructions, the result register class is controlled by the
+    // subregisters of the operand. As a result, we must constrain the result
+    // class rather than check that it's already the right one.
+    if (!Dst->getChild(0)->isLeaf())
+      return failedImport("EXTRACT_SUBREG child #1 is not a leaf");
+
+    DefInit *SubRegInit = dyn_cast<DefInit>(Dst->getChild(1)->getLeafValue());
+    if (!SubRegInit)
+      return failedImport("EXTRACT_SUBREG child #1 is not a subreg index");
+
+    // Constrain the result to the same register bank as the operand.
+    Record *DstIOpRec =
+        getInitValueAsRegClass(Dst->getChild(0)->getLeafValue());
+
+    if (DstIOpRec == nullptr)
+      return failedImport("EXTRACT_SUBREG operand #1 isn't a register class");
+
+    CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef());
+    CodeGenRegisterClass *SrcRC = CGRegs.getRegClass(
+        getInitValueAsRegClass(Dst->getChild(0)->getLeafValue()));
+
+    // It would be nice to leave this constraint implicit but we're required
+    // to pick a register class so constrain the result to a register class
+    // that can hold the correct MVT.
+    //
+    // FIXME: This may introduce an extra copy if the chosen class doesn't
+    //        actually contain the subregisters.
+    assert(Src->getExtTypes().size() == 1 &&
+             "Expected Src of EXTRACT_SUBREG to have one result type");
+
+    const auto &SrcRCDstRCPair =
+        SrcRC->getMatchingSubClassWithSubRegs(CGRegs, SubIdx);
+    assert(SrcRCDstRCPair->second && "Couldn't find a matching subclass");
+    M.addAction<ConstrainOperandToRegClassAction>("NewI", 0,
+                                                  *SrcRCDstRCPair->second);
+    M.addAction<ConstrainOperandToRegClassAction>("NewI", 1,
+                                                  *SrcRCDstRCPair->first);
   } else
     M.addAction<ConstrainOperandsToDefinitionAction>("NewI");
 
@@ -1874,8 +2020,10 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
      << "InstructionSelector::selectImpl(MachineInstr &I) const {\n"
      << "  MachineFunction &MF = *I.getParent()->getParent();\n"
      << "  const MachineRegisterInfo &MRI = MF.getRegInfo();\n"
-     << "  // FIXME: This should be computed on a per-function basis rather than per-insn.\n"
-     << "  AvailableFunctionFeatures = computeAvailableFunctionFeatures(&STI, &MF);\n"
+     << "  // FIXME: This should be computed on a per-function basis rather "
+        "than per-insn.\n"
+     << "  AvailableFunctionFeatures = computeAvailableFunctionFeatures(&STI, "
+        "&MF);\n"
      << "  const PredicateBitset AvailableFeatures = getAvailableFeatures();\n";
 
   for (auto &Rule : Rules) {
diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index 12cfb93a0c4f3..bebb1a183fc76 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -1195,7 +1195,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
   OS << "\" };\n\n";
 
   // Emit SubRegIndex lane masks, including 0.
-  OS << "\nstatic const LaneBitmask SubRegIndexLaneMaskTable[] = {\n  LaneBitmask::getAll(),\n";
+  OS << "\nstatic const LaneBitmask SubRegIndexLaneMaskTable[] = {\n  "
+        "LaneBitmask::getAll(),\n";
   for (const auto &Idx : SubRegIndices) {
     printMask(OS << "  ", Idx.LaneMask);
     OS << ", // " << Idx.getName() << '\n';
@@ -1234,7 +1235,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
     BitVector MaskBV(RegisterClasses.size());
 
     for (const auto &RC : RegisterClasses) {
-      OS << "static const uint32_t " << RC.getName() << "SubClassMask[] = {\n  ";
+      OS << "static const uint32_t " << RC.getName()
+         << "SubClassMask[] = {\n  ";
       printBitVectorAsHex(OS, RC.getSubClasses(), 32);
 
       // Emit super-reg class masks for any relevant SubRegIndices that can
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index 7e9f552eccc0c..16d5740b79a3f 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -805,6 +805,7 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
     return;
 
   std::vector<MCSchedClassDesc> &SCTab = SchedTables.ProcSchedClasses.back();
+  DEBUG(dbgs() << "\n+++ SCHED CLASSES (GenSchedClassTables) +++\n");
   for (const CodeGenSchedClass &SC : SchedModels.schedClasses()) {
     DEBUG(SC.dump(&SchedModels));
 
diff --git a/utils/docker/README b/utils/docker/README
new file mode 100644
index 0000000000000..be08dfa4c505a
--- /dev/null
+++ b/utils/docker/README
@@ -0,0 +1 @@
+See llvm/docs/Docker.rst for details
diff --git a/utils/docker/build_docker_image.sh b/utils/docker/build_docker_image.sh
new file mode 100755
index 0000000000000..2ec07ab6da4b9
--- /dev/null
+++ b/utils/docker/build_docker_image.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+#===- llvm/utils/docker/build_docker_image.sh ----------------------------===//
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===----------------------------------------------------------------------===//
+set -e
+
+IMAGE_SOURCE=""
+DOCKER_REPOSITORY=""
+DOCKER_TAG=""
+BUILDSCRIPT_ARGS=""
+
+function show_usage() {  # Prints the command-line help text to stdout.
+  usage=$(cat << EOF
+Usage: build_docker_image.sh [options] [-- [buildscript_args]...]
+
+Available options:
+    -s|--source             image source dir (i.e. debian8, nvidia-cuda, etc)
+    -d|--docker-repository  docker repository for the image
+    -t|--docker-tag         docker tag for the image
+Required options: --source and --docker-repository.
+
+All options after '--' are passed to buildscript (see
+scripts/build_install_llvm.sh).
+
+For example, running:
+$ build_docker_image.sh -s debian8 -d mydocker/debian8-clang -t latest \\
+  -- -p clang -i install-clang -i install-clang-headers
+will produce two docker images:
+    mydocker/debian8-clang-build:latest - an intermediate image used to compile
+      clang.
+    mydocker/debian8-clang:latest       - a small image with preinstalled clang.
+Please note that this example produces a not very useful installation, since it
+doesn't override CMake defaults, which produces a Debug and non-bootstrapped
+version of clang.
+For an example of a somewhat more useful build, see build_clang_image.sh.
+EOF
+)
+  echo "$usage"
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    -h|--help)
+      show_usage
+      exit 0
+      ;;
+    -s|--source)
+      shift
+      IMAGE_SOURCE="$1"
+      shift
+      ;;
+    -d|--docker-repository)
+      shift
+      DOCKER_REPOSITORY="$1"
+      shift
+      ;;
+    -t|--docker-tag)
+      shift
+      DOCKER_TAG="$1"
+      shift
+      ;;
+    --)
+      shift
+      BUILDSCRIPT_ARGS="$*"
+      shift $#
+      ;;
+    *)
+      echo "Unknown argument $1"
+      exit 1
+      ;;
+  esac
+done
+
+command -v docker >/dev/null ||
+  {
+    echo "Docker binary cannot be found. Please install Docker to use this script."
+    exit 1
+  }
+
+if [ "$IMAGE_SOURCE" == "" ]; then
+  echo "Required argument missing: --source"
+  exit 1
+fi
+
+if [ "$DOCKER_REPOSITORY" == "" ]; then
+  echo "Required argument missing: --docker-repository"
+  exit 1
+fi
+
+cd "$(dirname "$0")"  # Image sources live next to this script.
+if [ ! -d "$IMAGE_SOURCE" ]; then
+  echo "No sources for '$IMAGE_SOURCE' were found in $PWD"
+  exit 1
+fi
+
+echo "Building from $IMAGE_SOURCE"
+
+if [ "$DOCKER_TAG" != "" ]; then
+  DOCKER_TAG=":$DOCKER_TAG"
+fi
+
+echo "Building $DOCKER_REPOSITORY-build$DOCKER_TAG"
+docker build -t "$DOCKER_REPOSITORY-build$DOCKER_TAG" \
+  --build-arg "buildscript_args=$BUILDSCRIPT_ARGS" \
+  -f "$IMAGE_SOURCE/build/Dockerfile" .
+
+echo "Copying clang installation to release image sources"
+trap "rm -f '$PWD/$IMAGE_SOURCE/release/clang.tar.gz'" EXIT  # Also clean up if the copy fails.
+docker run -v "$PWD/$IMAGE_SOURCE:/workspace" "$DOCKER_REPOSITORY-build$DOCKER_TAG" \
+  cp /tmp/clang.tar.gz /workspace/release
+
+echo "Building release image"
+docker build -t "${DOCKER_REPOSITORY}${DOCKER_TAG}" \
+  "$IMAGE_SOURCE/release"
+
+echo "Done"
diff --git a/utils/docker/debian8/build/Dockerfile b/utils/docker/debian8/build/Dockerfile
new file mode 100644
index 0000000000000..13a11a73be6c8
--- /dev/null
+++ b/utils/docker/debian8/build/Dockerfile
@@ -0,0 +1,35 @@
+#===- llvm/utils/docker/debian8/build/Dockerfile -------------------------===//
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===----------------------------------------------------------------------===//
+# Produces an image that compiles and archives clang, based on debian8.
+FROM launcher.gcr.io/google/debian8:latest
+
+LABEL maintainer "LLVM Developers"
+
+# Install build dependencies of llvm.
+# First, Update the apt's source list and include the sources of the packages.
+RUN grep deb /etc/apt/sources.list | \
+    sed 's/^deb/deb-src /g' >> /etc/apt/sources.list
+
+# Install compiler, python and subversion.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends build-essential python2.7 wget \
+            subversion ninja-build && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install cmake version that can compile clang into /usr/local.
+# (Version in debian8 repos is too old)
+RUN wget -O - "https://cmake.org/files/v3.7/cmake-3.7.2-Linux-x86_64.tar.gz" | \
+    tar xzf - -C /usr/local --strip-components=1
+
+# Arguments passed to build_install_llvm.sh.
+ARG buildscript_args
+
+# Run the build. Results of the build will be available as /tmp/clang.tar.gz.
+ADD scripts/build_install_llvm.sh /tmp
+RUN /tmp/build_install_llvm.sh ${buildscript_args}
diff --git a/utils/docker/debian8/release/Dockerfile b/utils/docker/debian8/release/Dockerfile
new file mode 100644
index 0000000000000..d0214b9c67af7
--- /dev/null
+++ b/utils/docker/debian8/release/Dockerfile
@@ -0,0 +1,21 @@
+#===- llvm/utils/docker/debian8/release/Dockerfile -----------------------===//
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===----------------------------------------------------------------------===//
+# A release image, containing clang installation, produced by the 'build/' image
+# and adding libstdc++ and binutils.
+FROM launcher.gcr.io/google/debian8:latest
+
+LABEL maintainer "LLVM Developers"
+
+# Install packages for minimal useful image.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends libstdc++-4.9-dev binutils && \
+    rm -rf /var/lib/apt/lists/*
+
+# Unpack clang installation into this image.
+ADD clang.tar.gz /
diff --git a/utils/docker/example/build/Dockerfile b/utils/docker/example/build/Dockerfile
new file mode 100644
index 0000000000000..597ccfeb4f23d
--- /dev/null
+++ b/utils/docker/example/build/Dockerfile
@@ -0,0 +1,26 @@
+#===- llvm/utils/docker/example/build/Dockerfile -------------------------===//
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===----------------------------------------------------------------------===//
+# This is an example Dockerfile to build an image that compiles clang.
+# Replace FIXMEs to prepare your own image.
+
+# FIXME: Replace 'ubuntu' with your base image
+FROM ubuntu
+
+# FIXME: Change maintainer name
+LABEL maintainer "Maintainer <maintainer@email>"
+
+# FIXME: Install llvm/clang build dependencies. Including compiler to
+# build stage1, cmake, subversion, ninja, etc.
+
+# Arguments to pass to build_install_llvm.sh.
+ARG buildscript_args
+
+# Run the build. Results of the build will be available as /tmp/clang.tar.gz.
+ADD scripts/build_install_llvm.sh /tmp
+RUN /tmp/build_install_llvm.sh ${buildscript_args}
diff --git a/utils/docker/example/release/Dockerfile b/utils/docker/example/release/Dockerfile
new file mode 100644
index 0000000000000..953d81fc99517
--- /dev/null
+++ b/utils/docker/example/release/Dockerfile
@@ -0,0 +1,24 @@
+#===- llvm/utils/docker/example/release/Dockerfile -----------------------===//
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===----------------------------------------------------------------------===//
+# An image that unpacks a clang installation, compiled by the 'build/'
+# container.
+# Replace FIXMEs to prepare your own image.
+
+# FIXME: Replace 'ubuntu' with your base image.
+FROM ubuntu
+
+# FIXME: Change maintainer name.
+LABEL maintainer "Maintainer <maintainer@email>"
+
+# FIXME: Install all packages you want to have in your release container.
+# A minimal useful installation must include libstdc++ and binutils.
+
+# Unpack clang installation into this container.
+# It is copied to this directory by build_docker_image.sh script.
+ADD clang.tar.gz /
diff --git a/utils/docker/nvidia-cuda/build/Dockerfile b/utils/docker/nvidia-cuda/build/Dockerfile
new file mode 100644
index 0000000000000..619b80cbb61ae
--- /dev/null
+++ b/utils/docker/nvidia-cuda/build/Dockerfile
@@ -0,0 +1,25 @@
+#===- llvm/utils/docker/nvidia-cuda/build/Dockerfile ---------------------===//
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===----------------------------------------------------------------------===//
+# Produces an image that compiles and archives clang, based on nvidia/cuda
+# image.
+FROM nvidia/cuda:8.0-devel
+
+LABEL maintainer "LLVM Developers"
+
+# Arguments to pass to build_install_llvm.sh.
+ARG buildscript_args
+
+# Install llvm build dependencies.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends cmake python2.7 subversion ninja-build && \
+    rm -rf /var/lib/apt/lists/*
+
+# Run the build. Results of the build will be available as /tmp/clang.tar.gz.
+ADD scripts/build_install_llvm.sh /tmp
+RUN /tmp/build_install_llvm.sh ${buildscript_args}
diff --git a/utils/docker/nvidia-cuda/release/Dockerfile b/utils/docker/nvidia-cuda/release/Dockerfile
new file mode 100644
index 0000000000000..b9bcae1597805
--- /dev/null
+++ b/utils/docker/nvidia-cuda/release/Dockerfile
@@ -0,0 +1,23 @@
+#===- llvm/utils/docker/nvidia-cuda/release/Dockerfile -------------------===//
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===----------------------------------------------------------------------===//
+# A release image that copies a clang installation, compiled by the 'build/'
+# container, into a fresh nvidia/cuda image to get a container of minimal
+# size.
+# The archive is placed next to this file by build_docker_image.sh.
+
+# Same base image as the one used by nvidia-cuda/build/Dockerfile.
+FROM nvidia/cuda:8.0-devel
+
+# Image maintainer.
+LABEL maintainer "LLVM Developers"
+
+# Unpack clang installation into this container.
+ADD clang.tar.gz /
+
+# C++ standard library and binutils are already included in the base package.
diff --git a/utils/docker/scripts/build_install_llvm.sh b/utils/docker/scripts/build_install_llvm.sh
new file mode 100755
index 0000000000000..7e0e906574162
--- /dev/null
+++ b/utils/docker/scripts/build_install_llvm.sh
@@ -0,0 +1,169 @@
+#!/usr/bin/env bash
+#===- llvm/utils/docker/scripts/build_install_llvm.sh ---------------------===//
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===-----------------------------------------------------------------------===//
+
+set -e
+
+function show_usage() {
+  usage=$(cat << EOF
+Usage: build_install_llvm.sh [options] -- [cmake-args]
+
+Checkout svn sources and run cmake with the specified arguments. Used
+inside docker container.
+Passes additional -DCMAKE_INSTALL_PREFIX and archives the contents of
+the directory to /tmp/clang.tar.gz.
+
+Available options:
+  -h|--help           show this help message
+  -b|--branch         svn branch to checkout, i.e. 'trunk',
+                      'branches/release_40'
+                      (default: 'trunk')
+  -r|--revision       svn revision to checkout
+  -p|--llvm-project   name of an svn project to checkout. Will also add the
+                      project to a list LLVM_ENABLE_PROJECTS, passed to CMake.
+                      For clang, please use 'clang', not 'cfe'.
+                      Project 'llvm' is always included and ignored, if
+                      specified.
+                      Can be specified multiple times.
+  -i|--install-target name of a cmake install target to build and include in
+                      the resulting archive. Can be specified multiple times.
+Required options: At least one --install-target.
+
+All options after '--' are passed to CMake invocation.
+EOF
+)
+  echo "$usage"
+}
+
+LLVM_SVN_REV=""
+LLVM_BRANCH=""
+CMAKE_ARGS=""
+CMAKE_INSTALL_TARGETS=""
+# We always checkout llvm
+LLVM_PROJECTS="llvm"
+CMAKE_LLVM_ENABLE_PROJECTS=""
+
+function contains_project() {
+  # Returns 0 when "$1" is already listed in $LLVM_PROJECTS, 1 otherwise.
+  local wanted="$1"
+  local candidate
+  for candidate in $LLVM_PROJECTS; do
+    # A failed match inside an && list does not trip 'set -e'.
+    [ "$candidate" == "$wanted" ] && return 0
+  done
+  return 1
+}
+
+while [[ $# -gt 0 ]]; do  # Parse options until the argument list is exhausted.
+  case "$1" in
+    -r|--revision)
+      LLVM_SVN_REV="$2"
+      shift 2  # Consume the flag and its value, like every other option.
+      ;;
+    -b|--branch)
+      shift
+      LLVM_BRANCH="$1"
+      shift
+      ;;
+    -p|--llvm-project)
+      shift
+      PROJ="$1"
+      if [ "$PROJ" == "cfe" ]; then
+        PROJ="clang"
+      fi
+      if ! contains_project "$PROJ" ; then
+        LLVM_PROJECTS="$LLVM_PROJECTS $PROJ"
+        CMAKE_LLVM_ENABLE_PROJECTS="$CMAKE_LLVM_ENABLE_PROJECTS;$PROJ"
+      else
+        echo "Project '$PROJ' is already enabled, ignoring extra occurrences."
+      fi
+      shift
+      ;;
+    -i|--install-target)
+      shift
+      CMAKE_INSTALL_TARGETS="$CMAKE_INSTALL_TARGETS $1"
+      shift
+      ;;
+    --)
+      shift
+      CMAKE_ARGS="$*"
+      shift $#
+      ;;
+    -h|--help)
+      show_usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1"
+      exit 1
+  esac
+done
+
+if [ "$CMAKE_INSTALL_TARGETS" == "" ]; then
+  echo "No install targets. Please pass one or more --install-target."
+  exit 1
+fi
+
+if [ "$LLVM_BRANCH" == "" ]; then
+  LLVM_BRANCH="trunk"
+fi
+
+if [ "$LLVM_SVN_REV" != "" ]; then  # Set by -r|--revision above.
+  SVN_REV_ARG="-r$LLVM_SVN_REV"
+else
+  SVN_REV_ARG=""
+fi
+
+CLANG_BUILD_DIR=/tmp/clang-build
+CLANG_INSTALL_DIR=/tmp/clang-install
+
+mkdir "$CLANG_BUILD_DIR"
+
+# Get the sources from svn.
+echo "Checking out sources from svn"
+mkdir "$CLANG_BUILD_DIR/src"
+for LLVM_PROJECT in $LLVM_PROJECTS; do
+  if [ "$LLVM_PROJECT" == "clang" ]; then
+    SVN_PROJECT="cfe"
+  else
+    SVN_PROJECT="$LLVM_PROJECT"
+  fi
+
+  echo "Checking out https://llvm.org/svn/llvm-project/$SVN_PROJECT to $CLANG_BUILD_DIR/src/$LLVM_PROJECT"
+  # FIXME: --trust-server-cert is required to workaround 'SSL issuer is not
+  #        trusted' error. Using https seems preferable to http either way,
+  #        albeit this is not secure.
+  svn co -q $SVN_REV_ARG --trust-server-cert \
+    "https://llvm.org/svn/llvm-project/$SVN_PROJECT/$LLVM_BRANCH" \
+    "$CLANG_BUILD_DIR/src/$LLVM_PROJECT"
+done
+
+mkdir "$CLANG_BUILD_DIR/build"
+pushd "$CLANG_BUILD_DIR/build"
+
+# Configure and run the build inside the dedicated build directory.
+echo "Running build"
+cmake -GNinja \
+  -DCMAKE_INSTALL_PREFIX="$CLANG_INSTALL_DIR" \
+  -DLLVM_ENABLE_PROJECTS="$CMAKE_LLVM_ENABLE_PROJECTS" \
+  $CMAKE_ARGS \
+  "$CLANG_BUILD_DIR/src/llvm"
+ninja $CMAKE_INSTALL_TARGETS
+
+popd
+
+# Pack the installed clang into an archive.
+echo "Archiving clang installation to /tmp/clang.tar.gz"
+cd "$CLANG_INSTALL_DIR"
+tar -czf /tmp/clang.tar.gz *
+
+# Cleanup.
+rm -rf "$CLANG_BUILD_DIR" "$CLANG_INSTALL_DIR"
+
+echo "Done"
diff --git a/utils/lit/lit/formats/__init__.py b/utils/lit/lit/formats/__init__.py
index 3ff46e93ead2e..7d14ca4b535a7 100644
--- a/utils/lit/lit/formats/__init__.py
+++ b/utils/lit/lit/formats/__init__.py
@@ -1,8 +1,3 @@
-from lit.formats.base import (  # noqa: F401
-    TestFormat,
-    FileBasedTest,
-    OneCommandPerFileTest
-)
-
+from lit.formats.base import TestFormat  # noqa: F401
 from lit.formats.googletest import GoogleTest  # noqa: F401
 from lit.formats.shtest import ShTest  # noqa: F401
diff --git a/utils/lit/lit/formats/base.py b/utils/lit/lit/formats/base.py
index 6721d17e334e6..baa9ff1d3b7d7 100644
--- a/utils/lit/lit/formats/base.py
+++ b/utils/lit/lit/formats/base.py
@@ -1,117 +1,50 @@
-from __future__ import absolute_import
-import os
-
-import lit.Test
-import lit.util
+import abc
 
 class TestFormat(object):
-    pass
-
-###
-
-class FileBasedTest(TestFormat):
-    def getTestsInDirectory(self, testSuite, path_in_suite,
-                            litConfig, localConfig):
-        source_path = testSuite.getSourcePath(path_in_suite)
-        for filename in os.listdir(source_path):
-            # Ignore dot files and excluded tests.
-            if (filename.startswith('.') or
-                filename in localConfig.excludes):
-                continue
-
-            filepath = os.path.join(source_path, filename)
-            if not os.path.isdir(filepath):
-                base,ext = os.path.splitext(filename)
-                if ext in localConfig.suffixes:
-                    yield lit.Test.Test(testSuite, path_in_suite + (filename,),
-                                        localConfig)
-
-###
-
-import re
-import tempfile
-
-class OneCommandPerFileTest(TestFormat):
-    # FIXME: Refactor into generic test for running some command on a directory
-    # of inputs.
-
-    def __init__(self, command, dir, recursive=False,
-                 pattern=".*", useTempInput=False):
-        if isinstance(command, str):
-            self.command = [command]
-        else:
-            self.command = list(command)
-        if dir is not None:
-            dir = str(dir)
-        self.dir = dir
-        self.recursive = bool(recursive)
-        self.pattern = re.compile(pattern)
-        self.useTempInput = useTempInput
-
-    def getTestsInDirectory(self, testSuite, path_in_suite,
-                            litConfig, localConfig):
-        dir = self.dir
-        if dir is None:
-            dir = testSuite.getSourcePath(path_in_suite)
-
-        for dirname,subdirs,filenames in os.walk(dir):
-            if not self.recursive:
-                subdirs[:] = []
-
-            subdirs[:] = [d for d in subdirs
-                          if (d != '.svn' and
-                              d not in localConfig.excludes)]
-
-            for filename in filenames:
-                if (filename.startswith('.') or
-                    not self.pattern.match(filename) or
-                    filename in localConfig.excludes):
-                    continue
-
-                path = os.path.join(dirname,filename)
-                suffix = path[len(dir):]
-                if suffix.startswith(os.sep):
-                    suffix = suffix[1:]
-                test = lit.Test.Test(
-                    testSuite, path_in_suite + tuple(suffix.split(os.sep)),
-                    localConfig)
-                # FIXME: Hack?
-                test.source_path = path
-                yield test
-
-    def createTempInput(self, tmp, test):
-        raise NotImplementedError('This is an abstract method.')
-
+    """Base class for test formats.
+
+    A TestFormat encapsulates logic for finding and executing a certain type of
+    test. For example, a subclass FooTestFormat would contain the logic for
+    finding tests written in the 'Foo' format, and the logic for running a
+    single one.
+
+    TestFormat is an Abstract Base Class (ABC). It uses the Python abc.ABCMeta
+    type and associated @abc.abstractmethod decorator. Together, these provide
+    subclass behaviour which is notionally similar to C++ pure virtual classes:
+    only subclasses which implement all abstract methods can be instantiated
+    (the implementation may come from an intermediate base).
+
+    For details on ABCs, see: https://docs.python.org/2/library/abc.html. Note
+    that Python ABCs have extensive abilities beyond what is used here. For
+    TestFormat, we only care about enforcing that abstract methods are
+    implemented.
+    """
+
+    __metaclass__ = abc.ABCMeta
+
+    @abc.abstractmethod
+    def getTestsInDirectory(self, testSuite, path_in_suite, litConfig,
+                            localConfig):
+      """Finds tests of this format in the given directory.
+
+      Args:
+          testSuite: a Test.TestSuite object.
+          path_in_suite: the subpath under testSuite to look for tests.
+          litConfig: the LitConfig for the test suite.
+          localConfig: a LitConfig with local specializations.
+
+      Returns:
+          An iterable of Test.Test objects.
+      """
+
+    @abc.abstractmethod
     def execute(self, test, litConfig):
-        if test.config.unsupported:
-            return (lit.Test.UNSUPPORTED, 'Test is unsupported')
-
-        cmd = list(self.command)
-
-        # If using temp input, create a temporary file and hand it to the
-        # subclass.
-        if self.useTempInput:
-            tmp = tempfile.NamedTemporaryFile(suffix='.cpp')
-            self.createTempInput(tmp, test)
-            tmp.flush()
-            cmd.append(tmp.name)
-        elif hasattr(test, 'source_path'):
-            cmd.append(test.source_path)
-        else:
-            cmd.append(test.getSourcePath())
-
-        out, err, exitCode = lit.util.executeCommand(cmd)
-
-        diags = out + err
-        if not exitCode and not diags.strip():
-            return lit.Test.PASS,''
+      """Runs the given 'test', which is of this format.
 
-        # Try to include some useful information.
-        report = """Command: %s\n""" % ' '.join(["'%s'" % a
-                                                 for a in cmd])
-        if self.useTempInput:
-            report += """Temporary File: %s\n""" % tmp.name
-            report += "--\n%s--\n""" % open(tmp.name).read()
-        report += """Output:\n--\n%s--""" % diags
+      Args:
+          test: a Test.Test object describing the test to run.
+          litConfig: the LitConfig for the test suite.
 
-        return lit.Test.FAIL, report
+      Returns:
+          A tuple of (status:Test.ResultCode, message:str)
+      """
diff --git a/utils/lit/lit/formats/googletest.py b/utils/lit/lit/formats/googletest.py
index 29a92c4e960b6..b683f7c7db8ef 100644
--- a/utils/lit/lit/formats/googletest.py
+++ b/utils/lit/lit/formats/googletest.py
@@ -11,8 +11,8 @@ from .base import TestFormat
 kIsWindows = sys.platform in ['win32', 'cygwin']
 
 class GoogleTest(TestFormat):
-    def __init__(self, test_sub_dir, test_suffix):
-        self.test_sub_dir = os.path.normcase(str(test_sub_dir)).split(';')
+    def __init__(self, test_sub_dirs, test_suffix):
+        self.test_sub_dirs = os.path.normcase(str(test_sub_dirs)).split(';')
         self.test_suffix = str(test_suffix)
 
         # On Windows, assume tests will also end in '.exe'.
@@ -30,19 +30,24 @@ class GoogleTest(TestFormat):
           localConfig: TestingConfig instance"""
 
         try:
-            lines = lit.util.capture([path, '--gtest_list_tests'],
-                                     env=localConfig.environment)
-            if kIsWindows:
-              lines = lines.replace('\r', '')
-            lines = lines.split('\n')
-        except Exception as exc:
-            out = exc.output if isinstance(exc, subprocess.CalledProcessError) else ''
-            litConfig.warning("unable to discover google-tests in %r: %s. Process output: %s"
-                              % (path, sys.exc_info()[1], out))
+            output = subprocess.check_output([path, '--gtest_list_tests'],
+                                             env=localConfig.environment)
+        except subprocess.CalledProcessError as exc:
+            litConfig.warning(
+                "unable to discover google-tests in %r: %s. Process output: %s"
+                % (path, sys.exc_info()[1], exc.output))
             raise StopIteration
 
         nested_tests = []
-        for ln in lines:
+        for ln in output.splitlines(False):  # Don't keep newlines.
+            ln = lit.util.to_string(ln)
+
+            if 'Running main() from gtest_main.cc' in ln:
+                # Upstream googletest prints this to stdout prior to running
+                # tests. LLVM removed that print statement in r61540, but we
+                # handle it here in case upstream googletest is being used.
+                continue
+
             # The test name list includes trailing comments beginning with
             # a '#' on some lines, so skip those. We don't support test names
             # that use escaping to embed '#' into their name as the names come
@@ -52,12 +57,6 @@ class GoogleTest(TestFormat):
             if not ln.lstrip():
                 continue
 
-            if 'Running main() from gtest_main.cc' in ln:
-                # Upstream googletest prints this to stdout prior to running
-                # tests. LLVM removed that print statement in r61540, but we
-                # handle it here in case upstream googletest is being used.
-                continue
-
             index = 0
             while ln[index*2:index*2+2] == '  ':
                 index += 1
@@ -75,38 +74,19 @@ class GoogleTest(TestFormat):
             else:
                 yield ''.join(nested_tests) + ln
 
-    # Note: path_in_suite should not include the executable name.
-    def getTestsInExecutable(self, testSuite, path_in_suite, execpath,
-                             litConfig, localConfig):
-        if not execpath.endswith(self.test_suffix):
-            return
-        (dirname, basename) = os.path.split(execpath)
-        # Discover the tests in this executable.
-        for testname in self.getGTestTests(execpath, litConfig, localConfig):
-            testPath = path_in_suite + (basename, testname)
-            yield lit.Test.Test(testSuite, testPath, localConfig, file_path=execpath)
-
     def getTestsInDirectory(self, testSuite, path_in_suite,
                             litConfig, localConfig):
         source_path = testSuite.getSourcePath(path_in_suite)
-        for filename in os.listdir(source_path):
-            filepath = os.path.join(source_path, filename)
-            if os.path.isdir(filepath):
-                # Iterate over executables in a directory.
-                if not os.path.normcase(filename) in self.test_sub_dir:
-                    continue
-                dirpath_in_suite = path_in_suite + (filename, )
-                for subfilename in os.listdir(filepath):
-                    execpath = os.path.join(filepath, subfilename)
-                    for test in self.getTestsInExecutable(
-                            testSuite, dirpath_in_suite, execpath,
-                            litConfig, localConfig):
-                      yield test
-            elif ('.' in self.test_sub_dir):
-                for test in self.getTestsInExecutable(
-                        testSuite, path_in_suite, filepath,
-                        litConfig, localConfig):
-                    yield test
+        for subdir in self.test_sub_dirs:
+            for fn in lit.util.listdir_files(os.path.join(source_path, subdir),
+                                             suffixes={self.test_suffix}):
+                # Discover the tests in this executable.
+                execpath = os.path.join(source_path, subdir, fn)
+                testnames = self.getGTestTests(execpath, litConfig, localConfig)
+                for testname in testnames:
+                    testPath = path_in_suite + (subdir, fn, testname)
+                    yield lit.Test.Test(testSuite, testPath, localConfig,
+                                        file_path=execpath)
 
     def execute(self, test, litConfig):
         testPath,testName = os.path.split(test.getSourcePath())
diff --git a/utils/lit/lit/formats/shtest.py b/utils/lit/lit/formats/shtest.py
index 30a6a3310b011..01ecd192092e8 100644
--- a/utils/lit/lit/formats/shtest.py
+++ b/utils/lit/lit/formats/shtest.py
@@ -1,12 +1,48 @@
 from __future__ import absolute_import
 
+import os
+
+import lit.Test
 import lit.TestRunner
-from .base import FileBasedTest
+import lit.util
+from .base import TestFormat
+
+class ShTest(TestFormat):
+    """ShTest is a format with one file per test.
+
+    This is the primary format for regression tests as described in the LLVM
+    testing guide:
+
+        http://llvm.org/docs/TestingGuide.html
+
+    The ShTest files contain some number of shell-like command pipelines, along
+    with assertions about what should be in the output.
+    """
 
-class ShTest(FileBasedTest):
     def __init__(self, execute_external = False):
+        """Initializer.
+
+        The 'execute_external' argument controls whether lit uses its internal
+        logic for command pipelines, or passes the command to a shell
+        subprocess.
+
+        Args:
+            execute_external: (optional) If true, use shell subprocesses instead
+                of lit's internal pipeline logic.
+        """
         self.execute_external = execute_external
 
+    def getTestsInDirectory(self, testSuite, path_in_suite,
+                            litConfig, localConfig):
+        """Yields test files matching 'suffixes' from the localConfig."""
+        file_matches = lit.util.listdir_files(
+            testSuite.getSourcePath(path_in_suite),
+            localConfig.suffixes, localConfig.excludes)
+        for filename in file_matches:
+            yield lit.Test.Test(testSuite, path_in_suite + (filename,),
+                                localConfig)
+
     def execute(self, test, litConfig):
+        """Interprets and runs the given test file, and returns the result."""
         return lit.TestRunner.executeShTest(test, litConfig,
                                             self.execute_external)
diff --git a/utils/lit/lit/run.py b/utils/lit/lit/run.py
index aa4fdc18b8779..1290c142c834e 100644
--- a/utils/lit/lit/run.py
+++ b/utils/lit/lit/run.py
@@ -24,140 +24,6 @@ def abort_now():
     else:
         os.kill(0, 9)
 
-###
-# Test Execution Implementation
-
-class LockedValue(object):
-    def __init__(self, value):
-        self.lock = threading.Lock()
-        self._value = value
-
-    def _get_value(self):
-        self.lock.acquire()
-        try:
-            return self._value
-        finally:
-            self.lock.release()
-
-    def _set_value(self, value):
-        self.lock.acquire()
-        try:
-            self._value = value
-        finally:
-            self.lock.release()
-
-    value = property(_get_value, _set_value)
-
-class TestProvider(object):
-    def __init__(self, queue_impl, canceled_flag):
-        self.canceled_flag = canceled_flag
-
-        # Create a shared queue to provide the test indices.
-        self.queue = queue_impl()
-
-    def queue_tests(self, tests, num_jobs):
-        for i in range(len(tests)):
-            self.queue.put(i)
-        for i in range(num_jobs):
-            self.queue.put(None)
-
-    def cancel(self):
-        self.canceled_flag.value = 1
-
-    def get(self):
-        # Check if we are canceled.
-        if self.canceled_flag.value:
-          return None
-
-        # Otherwise take the next test.
-        return self.queue.get()
-
-class Tester(object):
-    def __init__(self, run_instance, provider, consumer):
-        self.run_instance = run_instance
-        self.provider = provider
-        self.consumer = consumer
-
-    def run(self):
-        while True:
-            item = self.provider.get()
-            if item is None:
-                break
-            self.run_test(item)
-        self.consumer.task_finished()
-
-    def run_test(self, test_index):
-        test = self.run_instance.tests[test_index]
-        try:
-            execute_test(test, self.run_instance.lit_config,
-                         self.run_instance.parallelism_semaphores)
-        except KeyboardInterrupt:
-            # This is a sad hack. Unfortunately subprocess goes
-            # bonkers with ctrl-c and we start forking merrily.
-            print('\nCtrl-C detected, goodbye.')
-            abort_now()
-        self.consumer.update(test_index, test)
-
-class ThreadResultsConsumer(object):
-    def __init__(self, display):
-        self.display = display
-        self.lock = threading.Lock()
-
-    def update(self, test_index, test):
-        self.lock.acquire()
-        try:
-            self.display.update(test)
-        finally:
-            self.lock.release()
-
-    def task_finished(self):
-        pass
-
-    def handle_results(self):
-        pass
-
-class MultiprocessResultsConsumer(object):
-    def __init__(self, run, display, num_jobs):
-        self.run = run
-        self.display = display
-        self.num_jobs = num_jobs
-        self.queue = multiprocessing.Queue()
-
-    def update(self, test_index, test):
-        # This method is called in the child processes, and communicates the
-        # results to the actual display implementation via an output queue.
-        self.queue.put((test_index, test.result))
-
-    def task_finished(self):
-        # This method is called in the child processes, and communicates that
-        # individual tasks are complete.
-        self.queue.put(None)
-
-    def handle_results(self):
-        # This method is called in the parent, and consumes the results from the
-        # output queue and dispatches to the actual display. The method will
-        # complete after each of num_jobs tasks has signalled completion.
-        completed = 0
-        while completed != self.num_jobs:
-            # Wait for a result item.
-            item = self.queue.get()
-            if item is None:
-                completed += 1
-                continue
-
-            # Update the test result in the parent process.
-            index,result = item
-            test = self.run.tests[index]
-            test.result = result
-
-            self.display.update(test)
-
-def run_one_tester(run, provider, display):
-    tester = Tester(run, provider, display)
-    tester.run()
-
-###
-
 class _Display(object):
     def __init__(self, display, provider, maxFailures):
         self.display = display
@@ -170,47 +36,6 @@ class _Display(object):
         if self.failedCount == self.maxFailures:
             self.provider.cancel()
 
-def handleFailures(provider, consumer, maxFailures):
-    consumer.display = _Display(consumer.display, provider, maxFailures)
-
-def execute_test(test, lit_config, parallelism_semaphores):
-    """Execute one test"""
-    pg = test.config.parallelism_group
-    if callable(pg):
-        pg = pg(test)
-
-    result = None
-    semaphore = None
-    try:
-        if pg:
-            semaphore = parallelism_semaphores[pg]
-        if semaphore:
-            semaphore.acquire()
-        start_time = time.time()
-        result = test.config.test_format.execute(test, lit_config)
-        # Support deprecated result from execute() which returned the result
-        # code and additional output as a tuple.
-        if isinstance(result, tuple):
-            code, output = result
-            result = lit.Test.Result(code, output)
-        elif not isinstance(result, lit.Test.Result):
-            raise ValueError("unexpected result from test execution")
-        result.elapsed = time.time() - start_time
-    except KeyboardInterrupt:
-        raise
-    except:
-        if lit_config.debug:
-            raise
-        output = 'Exception during script execution:\n'
-        output += traceback.format_exc()
-        output += '\n'
-        result = lit.Test.Result(lit.Test.UNRESOLVED, output)
-    finally:
-        if semaphore:
-            semaphore.release()
-
-    test.setResult(result)
-
 class Run(object):
     """
     This class represents a concrete, configured testing run.
@@ -221,7 +46,8 @@ class Run(object):
         self.tests = tests
 
     def execute_test(self, test):
-        return execute_test(test, self.lit_config, self.parallelism_semaphores)
+        return _execute_test_impl(test, self.lit_config,
+                                  self.parallelism_semaphores)
 
     def execute_tests(self, display, jobs, max_time=None):
         """
@@ -350,6 +176,44 @@ class Run(object):
                 self.failure_count == self.lit_config.maxFailures:
             self.hit_max_failures = True
 
+def _execute_test_impl(test, lit_config, parallelism_semaphores):
+    """Execute one test"""
+    pg = test.config.parallelism_group
+    if callable(pg):
+        pg = pg(test)
+
+    result = None
+    semaphore = None
+    try:
+        if pg:
+            semaphore = parallelism_semaphores[pg]
+        if semaphore:
+            semaphore.acquire()
+        start_time = time.time()
+        result = test.config.test_format.execute(test, lit_config)
+        # Support deprecated result from execute() which returned the result
+        # code and additional output as a tuple.
+        if isinstance(result, tuple):
+            code, output = result
+            result = lit.Test.Result(code, output)
+        elif not isinstance(result, lit.Test.Result):
+            raise ValueError("unexpected result from test execution")
+        result.elapsed = time.time() - start_time
+    except KeyboardInterrupt:
+        raise
+    except:
+        if lit_config.debug:
+            raise
+        output = 'Exception during script execution:\n'
+        output += traceback.format_exc()
+        output += '\n'
+        result = lit.Test.Result(lit.Test.UNRESOLVED, output)
+    finally:
+        if semaphore:
+            semaphore.release()
+
+    test.setResult(result)
+
 child_lit_config = None
 child_parallelism_semaphores = None
 
@@ -375,7 +239,7 @@ def worker_run_one_test(test_index, test):
     the display.
     """
     try:
-        execute_test(test, child_lit_config, child_parallelism_semaphores)
+        _execute_test_impl(test, child_lit_config, child_parallelism_semaphores)
         return (test_index, test)
     except KeyboardInterrupt as e:
         # If a worker process gets an interrupt, abort it immediately.
diff --git a/utils/lit/lit/util.py b/utils/lit/lit/util.py
index 8991588a868d8..1819d4d1c34ff 100644
--- a/utils/lit/lit/util.py
+++ b/utils/lit/lit/util.py
@@ -8,24 +8,57 @@ import subprocess
 import sys
 import threading
 
-def to_bytes(str):
-    # Encode to UTF-8 to get binary data.
-    if isinstance(str, bytes):
-        return str
-    return str.encode('utf-8')
-
-def to_string(bytes):
-    if isinstance(bytes, str):
-        return bytes
-    return to_bytes(bytes)
-
-def convert_string(bytes):
+def to_bytes(s):
+    """Return the parameter as type 'bytes', possibly encoding it.
+
+    In Python2, the 'bytes' type is the same as 'str'. In Python3, they are
+    distinct.
+    """
+    if isinstance(s, bytes):
+        # In Python2, this branch is taken for both 'str' and 'bytes'.
+        # In Python3, this branch is taken only for 'bytes'.
+        return s
+    # In Python2, 's' is a 'unicode' object.
+    # In Python3, 's' is a 'str' object.
+    # Encode to UTF-8 to get 'bytes' data.
+    return s.encode('utf-8')
+
+def to_string(b):
+    """Return the parameter as type 'str', possibly encoding or decoding it.
+
+    In Python2, the 'str' type is the same as 'bytes'. In Python3, the
+    'str' type is (essentially) Python2's 'unicode' type, and 'bytes' is
+    distinct.
+    """
+    if isinstance(b, str):
+        # In Python2, this branch is taken for types 'str' and 'bytes'.
+        # In Python3, this branch is taken only for 'str'.
+        return b
+    if isinstance(b, bytes):
+        # In Python2, this branch is never taken ('bytes' is handled as 'str').
+        # In Python3, this is true only for 'bytes'.
+        try:
+            return b.decode('utf-8')
+        except UnicodeDecodeError:
+            # If the value is not valid Unicode, return the default
+            # repr-like encoding.
+            return str(b)
+
+    # By this point, here's what we *don't* have:
+    #
+    #  - In Python2:
+    #    - 'str' or 'bytes' (1st branch above)
+    #  - In Python3:
+    #    - 'str' (1st branch above)
+    #    - 'bytes' (2nd branch above)
+    #
+    # The last type we might expect is the Python2 'unicode' type. There is no
+    # 'unicode' type in Python3 (all the Python3 cases were already handled). In
+    # order to get a 'str' object, we need to encode the 'unicode' object.
     try:
-        return to_string(bytes.decode('utf-8'))
-    except AttributeError: # 'str' object has no attribute 'decode'.
-        return str(bytes)
-    except UnicodeError:
-        return str(bytes)
+        return b.encode('utf-8')
+    except AttributeError:
+        raise TypeError('not sure how to convert %s to %s' % (type(b), str))
 
 def detectCPUs():
     """
@@ -39,7 +72,8 @@ def detectCPUs():
             if isinstance(ncpus, int) and ncpus > 0:
                 return ncpus
         else: # OSX:
-            return int(capture(['sysctl', '-n', 'hw.ncpu']))
+            return int(subprocess.check_output(['sysctl', '-n', 'hw.ncpu'],
+                                               stderr=subprocess.STDOUT))
     # Windows:
     if "NUMBER_OF_PROCESSORS" in os.environ:
         ncpus = int(os.environ["NUMBER_OF_PROCESSORS"])
@@ -67,20 +101,44 @@ def mkdir_p(path):
         if e.errno != errno.EEXIST:
             raise
 
-def capture(args, env=None):
-    """capture(command) - Run the given command (or argv list) in a shell and
-    return the standard output. Raises a CalledProcessError if the command
-    exits with a non-zero status."""
-    p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-                         env=env)
-    out, err = p.communicate()
-    out = convert_string(out)
-    err = convert_string(err)
-    if p.returncode != 0:
-        raise subprocess.CalledProcessError(cmd=args,
-                                            returncode=p.returncode,
-                                            output="{}\n{}".format(out, err))
-    return out
+def listdir_files(dirname, suffixes=None, exclude_filenames=None):
+    """Yields files in a directory.
+
+    Filenames that are not excluded by rules below are yielded one at a time, as
+    basenames (i.e., without dirname).
+
+    Directories and names starting with '.' are always skipped.
+
+    If 'suffixes' is not None, then only filenames ending with one of its
+    members will be yielded. These can be extensions, like '.exe', or strings,
+    like 'Test'. (It is a lexicographic check; so an empty sequence will yield
+    nothing, but a single empty string will yield all filenames.)
+
+    If 'exclude_filenames' is not None, then none of the file basenames in it
+    will be yielded.
+
+    If specified, 'suffixes' must be iterable and 'exclude_filenames' must
+    support membership checking for strs.
+
+    Args:
+        dirname: a directory path.
+        suffixes: (optional) a sequence of strings (set, list, etc.).
+        exclude_filenames: (optional) a sequence of strings.
+
+    Yields:
+        Filenames as returned by os.listdir (generally, str).
+    """
+    if exclude_filenames is None:
+        exclude_filenames = set()
+    if suffixes is None:
+        suffixes = {''}
+    for filename in os.listdir(dirname):
+        if (filename.startswith('.') or  # cheap checks before the isdir() stat
+            filename in exclude_filenames or
+            not any(filename.endswith(sfx) for sfx in suffixes) or
+            os.path.isdir(os.path.join(dirname, filename))):
+            continue
+        yield filename
 
 def which(command, paths = None):
     """which(command, [paths]) - Look up the given command in the paths string
@@ -233,8 +291,8 @@ def executeCommand(command, cwd=None, env=None, input=None, timeout=0):
             timerObject.cancel()
 
     # Ensure the resulting output is always of string type.
-    out = convert_string(out)
-    err = convert_string(err)
+    out = to_string(out)
+    err = to_string(err)
 
     if hitTimeOut[0]:
         raise ExecuteCommandTimeoutException(
diff --git a/utils/opt-viewer/opt-diff.py b/utils/opt-viewer/opt-diff.py
index 2b5d1bbfabbec..9e921f8488d36 100755
--- a/utils/opt-viewer/opt-diff.py
+++ b/utils/opt-viewer/opt-diff.py
@@ -44,20 +44,21 @@ if __name__ == '__main__':
         default=cpu_count(),
         type=int,
         help='Max job count (defaults to %(default)s, the current CPU count)')
+    parser.add_argument(
+        '--no-progress-indicator',
+        '-n',
+        action='store_true',
+        default=False,
+        help='Do not display any indicator of how many YAML files were read.')
     parser.add_argument('--output', '-o', default='diff.opt.yaml')
     args = parser.parse_args()
 
-    if args.jobs == 1:
-        pmap = map
-    else:
-        pool = Pool(processes=args.jobs)
-        pmap = pool.map
-
     files1 = find_files(args.yaml_dir_or_file_1)
     files2 = find_files(args.yaml_dir_or_file_2)
 
-    all_remarks1, _, _ = optrecord.gather_results(pmap, files1)
-    all_remarks2, _, _ = optrecord.gather_results(pmap, files2)
+    print_progress = not args.no_progress_indicator
+    all_remarks1, _, _ = optrecord.gather_results(files1, args.jobs, print_progress)
+    all_remarks2, _, _ = optrecord.gather_results(files2, args.jobs, print_progress)
 
     added = set(all_remarks2.values()) - set(all_remarks1.values())
     removed = set(all_remarks1.values()) - set(all_remarks2.values())
@@ -66,5 +67,5 @@ if __name__ == '__main__':
         r.Added = True
     for r in removed:
         r.Added = False
-    stream = file(args.output, 'w')
-    yaml.dump_all(added | removed, stream)
+    with open(args.output, 'w') as stream:
+        yaml.dump_all(added | removed, stream)
diff --git a/utils/opt-viewer/opt-stats.py b/utils/opt-viewer/opt-stats.py
index 79e5c03eca9fa..a7e598fdfd026 100755
--- a/utils/opt-viewer/opt-stats.py
+++ b/utils/opt-viewer/opt-stats.py
@@ -22,15 +22,19 @@ if __name__ == '__main__':
         default=cpu_count(),
         type=int,
         help='Max job count (defaults to %(default)s, the current CPU count)')
+    parser.add_argument(
+        '--no-progress-indicator',
+        '-n',
+        action='store_true',
+        default=False,
+        help='Do not display any indicator of how many YAML files were read.')
     args = parser.parse_args()
 
-    if args.jobs == 1:
-        pmap = map
-    else:
-        pool = Pool(processes=args.jobs)
-        pmap = pool.map
-
-    all_remarks, file_remarks, _ = optrecord.gather_results(pmap, args.yaml_files)
+    print_progress = not args.no_progress_indicator
+    all_remarks, file_remarks, _ = optrecord.gather_results(
+        args.yaml_files, args.jobs, print_progress)
+    if print_progress:
+        print('\n')
 
     bypass = defaultdict(int)
     byname = defaultdict(int)
diff --git a/utils/opt-viewer/opt-viewer.py b/utils/opt-viewer/opt-viewer.py
index 3f5503f26b1ff..5e5daf7feb0de 100755
--- a/utils/opt-viewer/opt-viewer.py
+++ b/utils/opt-viewer/opt-viewer.py
@@ -2,24 +2,28 @@
 
 from __future__ import print_function
 
-desc = '''Generate HTML output to visualize optimization records from the YAML files
-generated with -fsave-optimization-record and -fdiagnostics-show-hotness.
-
-The tools requires PyYAML and Pygments Python packages.'''
-
-import optrecord
-import functools
-from multiprocessing import Pool
-from multiprocessing import Lock, cpu_count
-import errno
 import argparse
+import cgi
+import errno
+import functools
+from multiprocessing import cpu_count
 import os.path
 import re
 import shutil
+
 from pygments import highlight
 from pygments.lexers.c_cpp import CppLexer
 from pygments.formatters import HtmlFormatter
-import cgi
+
+import optpmap
+import optrecord
+
+
+desc = '''Generate HTML output to visualize optimization records from the YAML files
+generated with -fsave-optimization-record and -fdiagnostics-show-hotness.
+
+The tools requires PyYAML and Pygments Python packages.'''
+
 
 # This allows passing the global context to the child processes.
 class Context:
@@ -169,7 +173,7 @@ def _render_file(source_dir, output_dir, ctx, entry):
 def map_remarks(all_remarks):
     # Set up a map between function names and their source location for
     # function where inlining happened
-    for remark in all_remarks.itervalues():
+    for remark in optrecord.itervalues(all_remarks):
         if isinstance(remark, optrecord.Passed) and remark.Pass == "inline" and remark.Name == "Inlined":
             for arg in remark.Args:
                 caller = arg.get('Caller')
@@ -177,7 +181,13 @@ def map_remarks(all_remarks):
                     context.caller_loc[caller] = arg['DebugLoc']
 
 
-def generate_report(pmap, all_remarks, file_remarks, source_dir, output_dir, should_display_hotness):
+def generate_report(all_remarks,
+                    file_remarks,
+                    source_dir,
+                    output_dir,
+                    should_display_hotness,
+                    num_jobs,
+                    should_print_progress):
     try:
         os.makedirs(output_dir)
     except OSError as e:
@@ -187,12 +197,17 @@ def generate_report(pmap, all_remarks, file_remarks, source_dir, output_dir, sho
             raise
 
     _render_file_bound = functools.partial(_render_file, source_dir, output_dir, context)
-    pmap(_render_file_bound, file_remarks.items())
+    if should_print_progress:
+        print('Rendering HTML files...')
+    optpmap.pmap(_render_file_bound,
+                 file_remarks.items(),
+                 num_jobs,
+                 should_print_progress)
 
     if should_display_hotness:
-        sorted_remarks = sorted(all_remarks.itervalues(), key=lambda r: (r.Hotness, r.File, r.Line, r.Column, r.__dict__), reverse=True)
+        sorted_remarks = sorted(optrecord.itervalues(all_remarks), key=lambda r: (r.Hotness, r.File, r.Line, r.Column, r.PassWithDiffPrefix, r.yaml_tag, r.Function), reverse=True)
     else:
-        sorted_remarks = sorted(all_remarks.itervalues(), key=lambda r: (r.File, r.Line, r.Column, r.__dict__))
+        sorted_remarks = sorted(optrecord.itervalues(all_remarks), key=lambda r: (r.File, r.Line, r.Column, r.PassWithDiffPrefix, r.yaml_tag, r.Function))
     IndexRenderer(args.output_dir).render(sorted_remarks)
 
     shutil.copy(os.path.join(os.path.dirname(os.path.realpath(__file__)),
@@ -202,7 +217,13 @@ def generate_report(pmap, all_remarks, file_remarks, source_dir, output_dir, sho
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description=desc)
     parser.add_argument('yaml_files', nargs='+')
-    parser.add_argument('output_dir')
+    parser.add_argument(
+        '--output-dir',
+        '-o',
+        default='html',
+        help='Path to a directory where generated HTML files will be output. '
+             'If the directory does not already exist, it will be created. '
+             '"%(default)s" by default.')
     parser.add_argument(
         '--jobs',
         '-j',
@@ -214,16 +235,25 @@ if __name__ == '__main__':
         '-s',
         default='',
         help='set source directory')
+    parser.add_argument(
+        '--no-progress-indicator',
+        '-n',
+        action='store_true',
+        default=False,
+        help='Do not display any indicator of how many YAML files were read '
+             'or rendered into HTML.')
     args = parser.parse_args()
 
-    if args.jobs == 1:
-        pmap = map
-    else:
-        pool = Pool(processes=args.jobs)
-        pmap = pool.map
-
-    all_remarks, file_remarks, should_display_hotness = optrecord.gather_results(pmap, args.yaml_files)
+    print_progress = not args.no_progress_indicator
+    all_remarks, file_remarks, should_display_hotness = \
+        optrecord.gather_results(args.yaml_files, args.jobs, print_progress)
 
     map_remarks(all_remarks)
 
-    generate_report(pmap, all_remarks, file_remarks, args.source_dir, args.output_dir, should_display_hotness)
+    generate_report(all_remarks,
+                    file_remarks,
+                    args.source_dir,
+                    args.output_dir,
+                    should_display_hotness,
+                    args.jobs,
+                    print_progress)
diff --git a/utils/opt-viewer/optpmap.py b/utils/opt-viewer/optpmap.py
new file mode 100644
index 0000000000000..01e848e03976d
--- /dev/null
+++ b/utils/opt-viewer/optpmap.py
@@ -0,0 +1,74 @@
+"""Parallel map with an optional 'N of M' progress indicator."""
+
+import sys
+import multiprocessing
+
+
+# Shared progress counters: set by pmap() in the calling process and by
+# _init() in each pool worker.
+_current = None
+_total = None
+
+
+def _init(current, total):
+    """Pool initializer: store the shared counters in this process."""
+    global _current
+    global _total
+    _current = current
+    _total = total
+
+
+def _wrapped_func(func_and_args):
+    """Update the progress indicator if requested, then run the function.
+
+    `func_and_args` is a (func, argument, should_print_progress) tuple; the
+    result of func(argument) is returned.
+    """
+    func, argument, should_print_progress = func_and_args
+
+    if should_print_progress:
+        with _current.get_lock():
+            _current.value += 1
+            # Read the counter back while the lock is held so the number
+            # printed is the one this call produced.
+            done = _current.value
+        sys.stdout.write('\r\t{} of {}'.format(done, _total.value))
+        # The text has no newline, so flush to make it visible right away
+        # even when stdout is buffered.
+        sys.stdout.flush()
+
+    return func(argument)
+
+
+def pmap(func, iterable, processes, should_print_progress, *args, **kwargs):
+    """
+    A parallel map function that reports on its progress.
+
+    Applies `func` to every item of `iterable` and returns a list of the
+    results. If `processes` is greater than one, a process pool is used to run
+    the functions in parallel. `should_print_progress` is a boolean value that
+    indicates whether a string 'N of M' should be printed to indicate how many
+    of the functions have been started.
+    """
+    global _current
+    global _total
+    _current = multiprocessing.Value('i', 0)
+    _total = multiprocessing.Value('i', len(iterable))
+
+    func_and_args = [(func, arg, should_print_progress,) for arg in iterable]
+    if processes <= 1:
+        # map() is lazy on Python 3; build the list so callers can iterate the
+        # result more than once.
+        result = list(map(_wrapped_func, func_and_args, *args, **kwargs))
+    else:
+        pool = multiprocessing.Pool(initializer=_init,
+                                    initargs=(_current, _total,),
+                                    processes=processes)
+        result = pool.map(_wrapped_func, func_and_args, *args, **kwargs)
+        # Shut the worker processes down once all results are in.
+        pool.close()
+        pool.join()
+
+    if should_print_progress:
+        sys.stdout.write('\r')
+    return result
diff --git a/utils/opt-viewer/optrecord.py b/utils/opt-viewer/optrecord.py
index 6dc1a32e536a2..61ed9626cffad 100644
--- a/utils/opt-viewer/optrecord.py
+++ b/utils/opt-viewer/optrecord.py
@@ -10,15 +10,14 @@ except ImportError:
     print("For faster parsing, you may want to install libYAML for PyYAML")
     from yaml import Loader
 
-import functools
-from collections import defaultdict
-import itertools
-from multiprocessing import Pool
-from multiprocessing import Lock, cpu_count
 import cgi
+from collections import defaultdict
+import functools
+from multiprocessing import Lock
 import subprocess
 
-import traceback
+import optpmap
+
 
 p = subprocess.Popen(['c++filt', '-n'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
 p_lock = Lock()
@@ -42,8 +41,9 @@ else:
 
 def demangle(name):
     with p_lock:
-        p.stdin.write(name + '\n')
-        return p.stdout.readline().rstrip()
+        p.stdin.write((name + '\n').encode('utf-8'))
+        p.stdin.flush()
+        return p.stdout.readline().rstrip().decode('utf-8')
 
 
 def html_file_name(filename):
@@ -209,8 +209,11 @@ def get_remarks(input_file):
     return max_hotness, all_remarks, file_remarks
 
 
-def gather_results(pmap, filenames):
-    remarks = pmap(get_remarks, filenames)
+def gather_results(filenames, num_jobs, should_print_progress):
+    if should_print_progress:
+        print('Reading YAML files...')
+    remarks = optpmap.pmap(
+        get_remarks, filenames, num_jobs, should_print_progress)
     max_hotness = max(entry[0] for entry in remarks)
 
     def merge_file_remarks(file_remarks_job, all_remarks, merged):